The aim of this notebook is to help user understand his/her MPESA transactions in a given time frame. What is required is an pdf upload of the mpesa statement and supplied password.

In [1]:
#install packages/modules if missing in your local machine
#pip install --upgrade plotly
#pip install streamlit

In [2]:
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings('ignore')

#modules/packages required
import os

#for data manipulation/wrangling
import numpy as np
from numpy import int64
import pandas as pd

#for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px

#for date manipulation
import datetime as datetime
import calendar


#for pdf extraction as pdf
import tabula
from tabula.io import read_pdf


# to print out all the outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

In [3]:
#!pip install tabula-py

In [4]:
# Function to calculate missing values by column
def missing_values_table(df):
    #Total missing values 
    mis_val = df.isnull().sum()
    
    #percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    
    # Make a table with the results
    mis_val_table = pd.concat([mis_val,mis_val_percent], axis =1)
    
    # Rename the colums
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    
    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    
    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1])+ " columns.\n"
         "There are " + str(mis_val_table_ren_columns.shape[0])+
         " columns that have missing values.")
    
    # Return the dataframe with missing information
    return mis_val_table_ren_columns

#function for checking missing values per column

#Create a new function:
def num_missing(x):
    return sum(x.isnull())


Data uploaded is done here. Requires the pdf statement and password

In [5]:
dfs = tabula.read_pdf('../data/raw_data/MPESA_Statement_20191221_to_20201221_254711170904.pdf',pages="all",multiple_tables=True,password = '27791676',stream=True, lattice=  True)

In [6]:
#check the number of tables
print("There are {0} tables in the data".format(len(dfs)))

There are 34 tables in the data


In [7]:
df = dfs[1]
df.head()

Unnamed: 0,Receipt No.,Completion Time,Details,Transaction\rStatus,Paid In,Withdrawn,Balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi\rmobiles ventures ltd Roysambu near Pemca Holdings\rLumumba Drive.,Completed,,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc.\r0711170904,Completed,,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO\rKINGOO,Completed,,-1528.0,2054.72


In [8]:
# df_s = dfs[4]
# #grab the first row for the header
# new_header = df_s.iloc[0] 
# # #take the data less the header row
# df_s = df_s[1:]
# df_s.columns = new_header #set the header row as the df header
# df_s

In [9]:
# df_s = df_s.rename(columns={'Unnamed: 0': 'null_column'})
# df_s.head()
# df_s = df_s.loc[:, df_s.columns.notnull()]
# df_s.columns

In [10]:
# df.columns = df.columns.fillna('null_column')
# df.head()

In [11]:
# new_header = df.iloc[0] #grab the first row for the header
# df = df[1:] #take the data less the header row
# df.columns = new_header #set the header row as the df header
# df.shape

In [12]:
# df.head()

In [13]:
# df_new = pd.DataFrame(np.concatenate([df.values, df_s.values]), columns=df.columns)
# df_new.shape

In [14]:
# new_header = df.iloc[0] #grab the first row for the header
# df = df[1:] #take the data less the header row
# df.columns = new_header #set the header row as the df header
no_tables =[len(dfs)]
no_tables

[34]

In [15]:

no_tables = [len(dfs)]
for i in range(2, len(dfs)):
    df_s = dfs[i]
#     #grab the first row for the header
#     new_header = df_s.iloc[0] 
#     #take the data less the header row
#     df_s = df_s[1:] 
    
#     #set the header row as the df header
#     df_s.columns = new_header
    
    print(df_s.shape)
    
    ##rename null headers
    if ((df_s.columns).isna().any() == True):
        df_s.columns = df_s.columns.fillna('null_column')
    else:
        pass
    #print(df_s.columns)
    
    
    #check missing values
    print(missing_values_table(df_s))
    
    # dropping columns with > 98% missing
    missing_df = missing_values_table(df_s);
    missing_columns = list(missing_df[missing_df['% of Total Values']> 98].index)
    print('We will remove %d columns.' % len(missing_columns))
    #print(missing_columns)
    df_s.drop(missing_columns, axis =1, inplace=True)
    
    
    df = pd.DataFrame(np.concatenate([df.values, df_s.values]), columns=df.columns)
    
    
df.shape

(48, 8)
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
            Missing Values  % of Total Values
Unnamed: 0  48              100.0            
Paid In     35              72.9             
Withdrawn   13              27.1             
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
We will remove 1 columns.
(49, 8)
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
            Missing Values  % of Total Values
Unnamed: 0  49              100.0            
Paid In     40              81.6             
Withdrawn   9               18.4             
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
We will remove 1 columns.
(48, 8)
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
            Missing Values  % of Total Values
Unnamed: 0  48              100.0            
Paid In     40              83.3       

There are 3 columns that have missing values.
We will remove 1 columns.
(11, 8)
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
            Missing Values  % of Total Values
Unnamed: 0  11              100.0            
Paid In     9               81.8             
Withdrawn   2               18.2             
Your selected dataframe has 8 columns.
There are 3 columns that have missing values.
We will remove 1 columns.


(1597, 7)

In [16]:
(df.columns).isna().any()

False

In [17]:
df.head()

Unnamed: 0,Receipt No.,Completion Time,Details,Transaction\rStatus,Paid In,Withdrawn,Balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi\rmobiles ventures ltd Roysambu near Pemca Holdings\rLumumba Drive.,Completed,,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc.\r0711170904,Completed,,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO\rKINGOO,Completed,,-1528.0,2054.72


In [18]:
range(2, len(dfs))

range(2, 34)

In [19]:
df.head()

Unnamed: 0,Receipt No.,Completion Time,Details,Transaction\rStatus,Paid In,Withdrawn,Balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi\rmobiles ventures ltd Roysambu near Pemca Holdings\rLumumba Drive.,Completed,,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc.\r0711170904,Completed,,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO\rKINGOO,Completed,,-1528.0,2054.72


In [20]:
print(df.columns)
df.rename(columns = {'Receipt No.':'receipt_no','Completion Time':'completion_time',
                          'Details':'details','Transaction\rStatus':'status', 'Paid In':'paid_in', 'Withdrawn':'withdrawn', 'Balance':'balance'}, inplace = True) 

Index(['Receipt No.', 'Completion Time', 'Details', 'Transaction\rStatus',
       'Paid In', 'Withdrawn', 'Balance'],
      dtype='object')


In [21]:
df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi\rmobiles ventures ltd Roysambu near Pemca Holdings\rLumumba Drive.,Completed,,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc.\r0711170904,Completed,,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO\rKINGOO,Completed,,-1528.0,2054.72


In [22]:
#drop row with null receipt number
#mpesa_df = mpesa.dropna(subset = ['Receipt No.'], how='all', inplace=True)
mpesa_df = df[df['receipt_no'].notna()]

In [23]:
#clean the text columns
mpesa_df['details'] = mpesa_df['details'].str.replace('\r',' ')

In [24]:
print(mpesa_df.shape)
mpesa_df.head()

(1597, 7)


Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi mobiles ventures ltd Roysambu near Pemca Holdings Lumumba Drive.,Completed,,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc. 0711170904,Completed,,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO KINGOO,Completed,,-1528.0,2054.72


In [25]:
missing_values_table(mpesa_df)

Your selected dataframe has 7 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
paid_in,1295,81.1
withdrawn,302,18.9


In [26]:
#filling null values in paid_in and withdrawn columns
mpesa_df['paid_in'] = mpesa_df['paid_in'].fillna(0)
mpesa_df['withdrawn'] = mpesa_df['withdrawn'].fillna(0)
mpesa_df['balance'] = mpesa_df['balance'].fillna(0)

In [27]:
mpesa_df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,0,-28.0,285.72
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi mobiles ventures ltd Roysambu near Pemca Holdings Lumumba Drive.,Completed,0,-1200.0,313.72
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc. 0711170904,Completed,0,-500.0,1513.72
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,0,-41.0,2013.72
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO KINGOO,Completed,0,-1528.0,2054.72


In [28]:
#get string after dash
receiptient = []
for row in df.itertuples():
    new = row.details.split("-")
    receiptient.append(new[1] if 1 < len(new) else None)

In [29]:
mpesa_df['receiver_desc'] = receiptient

In [30]:
#clean the receiver_desc columns
mpesa_df['receiver_desc'] = mpesa_df['receiver_desc'].str.replace('\r',' ')

In [31]:
mpesa_df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance,receiver_desc
0,OLL9SJFI4Z,2020-12-21 16:27:39,Withdrawal Charge,Completed,0,-28.0,285.72,
1,OLL9SJFI4Z,2020-12-21 16:27:39,Customer Withdrawal At Agent Till 197334 - Malezi mobiles ventures ltd Roysambu near Pemca Holdings Lumumba Drive.,Completed,0,-1200.0,313.72,Malezi mobiles ventures ltd Roysambu near Pemca Holdings Lumumba Drive.
2,OLL6SCJSFO,2020-12-21 13:56:16,Pay Bill Online to 200200 - Safaricom Post Paid Acc. 0711170904,Completed,0,-500.0,1513.72,Safaricom Post Paid Acc. 0711170904
3,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer of Funds Charge,Completed,0,-41.0,2013.72,
4,OLL9S703PL,2020-12-21 11:53:08,Customer Transfer to 254711361054 - PATRICK MUUO KINGOO,Completed,0,-1528.0,2054.72,PATRICK MUUO KINGOO


In [32]:
mpesa_df.to_csv("../data/notebook_outputs/clean_mpesa_transactions.csv", index = False)

In [33]:
mpesa_df.dtypes

receipt_no         object
completion_time    object
details            object
status             object
paid_in            object
withdrawn          object
balance            object
receiver_desc      object
dtype: object

In [34]:
#cleaning the numerical columns
num_col = ['paid_in','withdrawn','balance']
for col in num_col:
    mpesa_df[col] = mpesa_df[col].replace(',', '',regex=True)
    mpesa_df[col] = pd.to_numeric(mpesa_df[col])
    
mpesa_df['completion_time']= pd.to_datetime(mpesa_df['completion_time'])
mpesa_df['details'] = mpesa_df['details'].astype(str)

In [36]:
#extract month transaction
mpesa_df['year'] = mpesa_df['completion_time'].dt.year
mpesa_df['month'] = mpesa_df['completion_time'].dt.month
mpesa_df['month'] = mpesa_df['month'].apply(lambda x: calendar.month_name[x])
mpesa_df['quarter'] = mpesa_df['completion_time'].dt.quarter

mpesa_df['transactions_cohort']= mpesa_df['year'].astype(str) + "_" + mpesa_df['month'].astype(str)

In [37]:
#sorting df by date
mpesa_df=mpesa_df.sort_values(by=['completion_time'],ascending =True)
mpesa_df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance,receiver_desc,year,month,quarter,transactions_cohort
1596,NLL6TDOVMM,2019-12-21 07:07:14,Airtime Purchase,Completed,0.0,-100.0,31000.46,,2019,December,4,2019_December
1595,NLL1TEOQH1,2019-12-21 08:05:26,Customer Withdrawal At Agent Till 233409 - Natsars Ltd Zimmerman Area Behind Co-op Bank,Completed,0.0,-15000.0,16000.46,Natsars Ltd Zimmerman Area Behind Co,2019,December,4,2019_December
1594,NLL1TEOQH1,2019-12-21 08:05:26,Withdrawal Charge,Completed,0.0,-162.0,15838.46,,2019,December,4,2019_December
1593,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer to 254724775068 - CALEB CHERUIYOT BIEGON,Completed,0.0,-2028.0,13810.46,CALEB CHERUIYOT BIEGON,2019,December,4,2019_December
1592,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer of Funds Charge,Completed,0.0,-41.0,13769.46,,2019,December,4,2019_December


In [38]:
#group sum
#mpesa_df['pay_bill_charges'] = np.where((mpesa_df['details'].str.contains('Pay Bill Charge')),'paybill', 'Non_paybill')

In [39]:
#mpesa_df['customer_transfers'] = np.where((mpesa_df['details'].str.contains('Customer Transfer to')),'customer_transfer', 'Non_customer_transfer')

In [40]:
mpesa_df['withdrawn'] = abs(mpesa_df['withdrawn'])

In [41]:
mpesa_df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance,receiver_desc,year,month,quarter,transactions_cohort
1596,NLL6TDOVMM,2019-12-21 07:07:14,Airtime Purchase,Completed,0.0,100.0,31000.46,,2019,December,4,2019_December
1595,NLL1TEOQH1,2019-12-21 08:05:26,Customer Withdrawal At Agent Till 233409 - Natsars Ltd Zimmerman Area Behind Co-op Bank,Completed,0.0,15000.0,16000.46,Natsars Ltd Zimmerman Area Behind Co,2019,December,4,2019_December
1594,NLL1TEOQH1,2019-12-21 08:05:26,Withdrawal Charge,Completed,0.0,162.0,15838.46,,2019,December,4,2019_December
1593,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer to 254724775068 - CALEB CHERUIYOT BIEGON,Completed,0.0,2028.0,13810.46,CALEB CHERUIYOT BIEGON,2019,December,4,2019_December
1592,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer of Funds Charge,Completed,0.0,41.0,13769.46,,2019,December,4,2019_December


In [43]:
text_group = []
for row in mpesa_df.itertuples():
    if 'Funds Charge' in row.details:
        text_group.append('Funds Charges')
    elif 'Business Payment from' in row.details:
        text_group.append('Business Payments')
    elif 'Loan Repayment' in row.details:
        text_group.append('Loan Repayment')
    elif 'Receive International Transfer From' in row.details:
        text_group.append('International Funds')
    elif 'Airtime' in row.details:
        text_group.append('Airtime')
    elif 'Customer Transfer to' in row.details:
        text_group.append('Customer Transfer')
    elif 'Customer Transfer Fuliza' in row.details:
        text_group.append('Fuliza')   
    elif 'Customer Withdrawal At' in row.details:
        text_group.append('Customer Withdrawals')
    elif 'Withdrawal Charge' in row.details: 
        text_group.append('Withdrawal Charges')
    elif 'Buy Bundles' in row.details: 
        text_group.append('Buy Bundles')
    elif 'Pay Bill' in row.details:
        text_group.append('Pay Bills')
    elif 'Pay Bill Charge' in row.details:
        text_group.append('Pay Bill Charges')
    elif 'Merchant Payment' in row.details: 
        text_group.append('Merchant Payments')
    elif 'Funds received from' in row.details: 
        text_group.append('Funds Received')
    elif 'OverDraft' in row.details: 
        text_group.append('Overdraft')
    elif 'Promotion Payment from' in row.details: 
        text_group.append('Promotion Payments')
    elif 'Deposit of Funds at ' in row.details: 
        text_group.append('Funds Deposits')
    elif 'M-Shwari Deposit' in row.details: 
        text_group.append('M-Shwari Deposit')
    elif 'M-Shwari Withdraw' in row.details: 
        text_group.append('M-Shwari Withdraws')
    elif 'Pay Merchant Charge' in row.details: 
        text_group.append('Mechant Pay Charges')
    elif 'Reversal' in row.details: 
        text_group.append('Reversals')
    elif 'M-Shwari Lock Deposit' in row.details: 
        text_group.append('M-Shwari Deposits')
    elif 'M-Shwari Loan Disburse' in row.details: 
        text_group.append('M-Shwari Loan')
    else :
        text_group.append('error')
        
mpesa_df['transactions_group'] = text_group
        

In [44]:
mpesa_df[mpesa_df['transactions_group']=='error']

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance,receiver_desc,year,month,quarter,transactions_cohort,transactions_group


In [45]:
mpesa_df.head()

Unnamed: 0,receipt_no,completion_time,details,status,paid_in,withdrawn,balance,receiver_desc,year,month,quarter,transactions_cohort,transactions_group
1596,NLL6TDOVMM,2019-12-21 07:07:14,Airtime Purchase,Completed,0.0,100.0,31000.46,,2019,December,4,2019_December,Airtime
1595,NLL1TEOQH1,2019-12-21 08:05:26,Customer Withdrawal At Agent Till 233409 - Natsars Ltd Zimmerman Area Behind Co-op Bank,Completed,0.0,15000.0,16000.46,Natsars Ltd Zimmerman Area Behind Co,2019,December,4,2019_December,Customer Withdrawals
1594,NLL1TEOQH1,2019-12-21 08:05:26,Withdrawal Charge,Completed,0.0,162.0,15838.46,,2019,December,4,2019_December,Withdrawal Charges
1593,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer to 254724775068 - CALEB CHERUIYOT BIEGON,Completed,0.0,2028.0,13810.46,CALEB CHERUIYOT BIEGON,2019,December,4,2019_December,Customer Transfer
1592,NLL7U6LEUV,2019-12-21 22:19:36,Customer Transfer of Funds Charge,Completed,0.0,41.0,13769.46,,2019,December,4,2019_December,Funds Charges


In [None]:
#undersanding the performance of various transactions groups over time
table_withdrawals = pd.pivot_table(mpesa_df,index=['transactions_group'],columns = ['transactions_cohort'],
                       values = ['withdrawn'],aggfunc={'withdrawn':np.sum},
                      margins=True,margins_name='Grand Total').reset_index()


table_withdrawals.columns = [' '.join(col).strip() for col in table_withdrawals.columns.values]
table_withdrawals = table_withdrawals.sort_values('transactions_group', ascending=False)
table_withdrawals.columns = table_withdrawals.columns.str.replace("withdrawn ", "")
table_withdrawals = table_withdrawals[table_withdrawals['Grand Total'] !=0].fillna(0)

# #columns ordering
#table_withdrawals = table_withdrawals[['transactions_group','March', 'April', 'May', 'June', 'July', 'August','Grand Total']]

table_withdrawals

In [None]:
#undersanding the performance of various transactions groups over time
table_deposits = pd.pivot_table(mpesa_df,index=['transactions_group'],columns = ['transactions_cohort'],
                       values = ['paid_in'],aggfunc={'paid_in':np.sum},
                      margins=True,margins_name='Grand Total').reset_index()


table_deposits.columns = [' '.join(col).strip() for col in table_deposits.columns.values]
table_deposits = table_deposits.sort_values('transactions_group', ascending=False)
table_deposits.columns = table_deposits.columns.str.replace("paid_in ", "")
table_deposits = table_deposits[table_deposits['Grand Total'] !=0].fillna(0)

#columns reordering
#table_deposits = table_deposits[['transactions_group','March', 'April', 'May', 'June', 'July', 'August','Grand Total']]


table_deposits

In [None]:
# Group the data frame by month and item and extract a number of stats from each group
mpesa_agg =mpesa_df.groupby(['transactions_cohort','transactions_group'], as_index= False).agg({
        # Find the min, max, and sum of the duration column
        'withdrawn': ["count", sum],
        # find the number of network type entries
        'paid_in': [sum]
    }
)

#mpesa_agg.set_index('transactions_group',inplace= True)
#mpesa_agg = mpesa_agg.reset_index()
mpesa_agg.columns = [' '.join(col).strip() for col in mpesa_agg.columns.values]
#mpesa_agg.loc['Total']= mpesa_agg.sum(numeric_only=True, axis=0)
mpesa_agg = mpesa_agg.where(pd.notnull(mpesa_agg), None)
mpesa_agg


In [None]:
fig = px.treemap(mpesa_agg, path=['transactions_cohort', 'transactions_group'], values='withdrawn count')
#fig = px.histogram(mpesa_agg, x='withdrawn count',color = "month", title='Rating distribution')
fig.show()

In [None]:

fig =px.treemap(mpesa_agg, path=['transactions_cohort', 'transactions_group'], values='withdrawn count',
                 color='withdrawn sum',color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(mpesa_agg['withdrawn sum'], weights=mpesa_agg['withdrawn count']))
# this is what I don't like, accessing traces like this
fig.data[0].textinfo = 'label+text+value'

#fig.layout.hovertamplate = '%{label}<br>%{value}'
fig.data[0].hovertemplate = '%{label}<br>%{value}'
fig.show()

In [None]:
fig =px.treemap(mpesa_agg, path=['transactions_cohort', 'transactions_group'], values='withdrawn sum')
fig.data[0].textinfo = 'label+text+value'

#fig.layout.hovermode = False
fig.data[0].hovertemplate = '%{label}<br>%{value}'
fig.show()

In [None]:
fig = px.sunburst(mpesa_agg, path=['transactions_cohort', 'transactions_group'], values='withdrawn count', title='Monthly usage')
#fig = px.histogram(mpesa_agg, x='withdrawn count',color = "month", title='Rating distribution')
fig.show()

In [None]:
mpesa_df.to_csv("../data/notebook_outputs/aggregated_mpesa_charges.csv",index= False)

In [None]:
mpesa_df.head()