### Task

1. Dump the given .sql files (each file represents a single table) into your MySQL database.
2. Explore the provided tables and perform some level of EDA (either using Python libraries or using SQL query) to get insights about the data.*
3. Perform monthly and weekly level product wise aggregations of the transactions for each user and maintain both in separate tables.
4. Calculate all the facts required for customer profile table using any convenient tool (Python or SQL or Spark) utilizing the aggregate table created in step 3.
5. Create an Airflow DAG that will execute the script or code for steps 3 and 4 sequentially which can be scheduled as per the requirement (daily or hourly).


In [1]:
import pandas as pd

from mysql_connector.mysql_connection import table_df

### Loading Tables

In [2]:
customer_profile = table_df('customer', 'customer_profile')

rw_transaction_data = table_df('customer', 'rw_transaction_data')
products = table_df('customer', 'products')
product_category = table_df('customer', 'product_category')
product_category_map = table_df('customer', 'product_category_map')

### Customer Profile

In [3]:
customer_profile.head()

Unnamed: 0,account_id,reward_point,total_inflow_amount,total_outflow_amount,total_valuechain_amount,total_inflow_count,total_outflow_count,total_valuechain_count,monthly_inflow_amount,monthly_outflow_amount,...,latest_used_product,latest_tran_date,this_month_revenue,monthly_average_lifetime_revenue,total_revenue,product_usage,most_used_product,second_most_used_product,third_most_used_product,run_date
0,1,1574.0,76385.0,70045.0,7650.0,39,31,4,76385.0,70045.0,...,Send Money,2081-01-26,31.0,31.0,31.0,14,Send Money,Fonepay Payment,eSewa to Laxmi Sunrise Bank Ltd.,2024-01-05
1,2,306.0,557075.0,184405.0,344798.0,753,142,400,16005.0,6180.0,...,Fonepay Payment,2081-01-31,4.0,62.0,2727.0,47,NT Prepaid Topup,Send Money,Ncell Topup,2024-01-05
2,3,10.0,18415.0,12400.0,5110.0,6,5,9,18415.0,12400.0,...,NT Prepaid Topup,2081-01-31,16.0,16.0,16.0,8,Ncell Topup,Send Money,Fonepay Payment,2024-01-05
3,4,324.0,6350.0,3050.0,7499.0,4,3,8,6350.0,3050.0,...,Fonepay Payment,2081-01-21,21.0,21.0,21.0,8,Fonepay Payment,Send Money,Electricity,2024-01-05
4,5,929.0,168576.0,100385.0,53911.0,69,24,95,106106.0,88660.0,...,Fonepay Payment,2081-01-31,70.0,28.0,1226.0,37,NT Prepaid Topup,NT Postpaid Topup,Send Money,2024-01-05


### Transaction Data

In [4]:
rw_transaction_data.head()

Unnamed: 0,txn_id,last_modified_date,last_modified_date_bs,created_date,amount,status,module_id,product_id,product_type_id,payer_account_id,receiver_account_id,reward_point,cash_back_amount,revenue_amount,transactor_module_id,time
0,660612529,2023-03-29,2079-12-15,2022-11-23,50.0,1,1,77,29,531,2,0.0,0.0,0.0,4,14:07:40
1,666435422,2022-12-01,2079-08-15,2022-12-01,750.0,1,1,76,29,531,81,0.0,0.0,33.75,4,00:03:41
2,666436001,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,2,0.0,1.38,0.0,4,00:06:20
3,666436068,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,647,0.0,1.38,0.0,4,00:06:32
4,666437220,2022-12-01,2079-08-15,2022-12-01,100.0,1,1,30,29,531,907,0.0,3.9,0.0,4,00:12:51


In [5]:
rw_transaction_data.shape

(193367, 16)

### Product and Product Category

In [6]:
products.head()

Unnamed: 0,module_id,product_id,product_type_id,product_name,product_type_name
0,1,77,29,Ncell Topup via Bank,Card Server Payment
1,1,76,29,SIM TV TOPUP VIA BANK,Card Server Payment
2,1,30,29,NT Topup via Bank,Card Server Payment
3,1,133,29,Wordlink Topup via Bank,Card Server Payment
4,1,89,29,Smart Cell Topup via Bank,Card Server Payment


In [7]:
product_category.head()

Unnamed: 0,id,category_name,description,parent_id,status,active_flag,created_on,updated_on
0,1,Telecommunications,Telecommunications,,0,1,2024-04-21 09:42:39,
1,2,Utility,Utility,,0,1,2024-04-21 09:42:39,
2,3,TV-Internet-Movies,TV-Internet-Movies,,0,1,2024-04-21 09:42:39,
3,4,Travel and Airline Services,Travel and Airline Services,,0,1,2024-04-21 09:42:39,
4,5,Ride Sharing,Ride Sharing,,0,1,2024-04-21 09:42:39,


In [8]:
product_category.drop('updated_on', inplace=True, axis=1)

In [9]:
product_category.rename(columns={'id': 'product_category_id'}, inplace=True)

In [10]:
product_category_map.head()

Unnamed: 0,module_id,product_id,product_type_id,product_name,product_category_id,txn_flow
0,1,670,11,"Siddhartha Sishu Sadan, Ln Chowk, Dhangadhi",33.0,Value Chain
1,2,2545,3,Multipurpose Finance Transfer,,OutFlow
2,2,1444,6,Century Corporate Transfer,,OutFlow
3,2,1008,7,Srijana Corporate Fund Receiver,,OutFlow
4,2,885,10,Pokhara_Convergence,,OutFlow


#### Cleaning product_category_map

In [11]:
product_category_map[product_category_map.duplicated(subset =['product_id', 'module_id', 'product_type_id'])]

Unnamed: 0,module_id,product_id,product_type_id,product_name,product_category_id,txn_flow
16,1,77,29,Ncell Topup via Bank,14.0,Value Chain
17,1,4,25,Ncell Topup,14.0,Value Chain
23,1,21,24,NT Postpaid Topup,14.0,Value Chain
25,1,19,35,NT Prepaid Topup,14.0,Value Chain
62,1,218,37,Ncell Pack,14.0,Value Chain
63,1,287,29,Ncell Data Via Bank,14.0,Value Chain
94,1,325,80,eScrow Service,92.0,OutFlow
95,1,8,12,Epay Donation,90.0,OutFlow
96,1,2,12,Send Money,90.0,OutFlow
97,1,19,12,OLDBALANCE,90.0,OutFlow


In [12]:
product_category_map[(product_category_map['module_id'] == 1) & 
                     (product_category_map['product_id'] == 19) & 
                     (product_category_map['product_type_id'] == 35)]

Unnamed: 0,module_id,product_id,product_type_id,product_name,product_category_id,txn_flow
7,1,19,35,Prepaid Topup,,Value Chain
25,1,19,35,NT Prepaid Topup,14.0,Value Chain


In [13]:
product_category_map[(product_category_map['module_id'] == 1) & 
                     (product_category_map['product_id'] == 325) & 
                     (product_category_map['product_type_id'] == 80)]

Unnamed: 0,module_id,product_id,product_type_id,product_name,product_category_id,txn_flow
87,1,325,80,eScrow Service,92.0,InFlow
94,1,325,80,eScrow Service,92.0,OutFlow


In [14]:
product_category_map = product_category_map.loc[:, ['product_id', 'module_id', 'product_type_id', 'txn_flow']]

In [15]:
product_category_map.duplicated().sum()

6

In [16]:
product_category_map.drop_duplicates(inplace=True)

### Complex Join 

- Join of all tables

In [17]:
merged_df_1 = pd.merge(rw_transaction_data, products, on = ['product_id', 'product_type_id', 'module_id'], how = 'inner')
merged_df_1.head()

Unnamed: 0,txn_id,last_modified_date,last_modified_date_bs,created_date,amount,status,module_id,product_id,product_type_id,payer_account_id,receiver_account_id,reward_point,cash_back_amount,revenue_amount,transactor_module_id,time,product_name,product_type_name
0,660612529,2023-03-29,2079-12-15,2022-11-23,50.0,1,1,77,29,531,2,0.0,0.0,0.0,4,14:07:40,Ncell Topup via Bank,Card Server Payment
1,666435422,2022-12-01,2079-08-15,2022-12-01,750.0,1,1,76,29,531,81,0.0,0.0,33.75,4,00:03:41,SIM TV TOPUP VIA BANK,Card Server Payment
2,666436001,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,2,0.0,1.38,0.0,4,00:06:20,Ncell Topup via Bank,Card Server Payment
3,666436068,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,647,0.0,1.38,0.0,4,00:06:32,Ncell Topup via Bank,Card Server Payment
4,666437220,2022-12-01,2079-08-15,2022-12-01,100.0,1,1,30,29,531,907,0.0,3.9,0.0,4,00:12:51,NT Topup via Bank,Card Server Payment


In [18]:
merged_df_1.shape

(193367, 18)

In [19]:
merged_df_1.nunique()

txn_id                   193367
last_modified_date          144
last_modified_date_bs       144
created_date                145
amount                     6343
status                        1
module_id                     2
product_id                  399
product_type_id              33
payer_account_id              9
receiver_account_id        1621
reward_point                 33
cash_back_amount            823
revenue_amount              467
transactor_module_id          2
time                      57786
product_name                386
product_type_name            35
dtype: int64

In [20]:
merged_df_2 = pd.merge(merged_df_1, product_category_map, on = ['product_id', 'product_type_id', 'module_id'], how='inner')

In [21]:
merged_df_2.head()

Unnamed: 0,txn_id,last_modified_date,last_modified_date_bs,created_date,amount,status,module_id,product_id,product_type_id,payer_account_id,receiver_account_id,reward_point,cash_back_amount,revenue_amount,transactor_module_id,time,product_name,product_type_name,txn_flow
0,660612529,2023-03-29,2079-12-15,2022-11-23,50.0,1,1,77,29,531,2,0.0,0.0,0.0,4,14:07:40,Ncell Topup via Bank,Card Server Payment,Value Chain
1,666435422,2022-12-01,2079-08-15,2022-12-01,750.0,1,1,76,29,531,81,0.0,0.0,33.75,4,00:03:41,SIM TV TOPUP VIA BANK,Card Server Payment,Value Chain
2,666436001,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,2,0.0,1.38,0.0,4,00:06:20,Ncell Topup via Bank,Card Server Payment,Value Chain
3,666436068,2022-12-01,2079-08-15,2022-12-01,50.0,1,1,77,29,531,647,0.0,1.38,0.0,4,00:06:32,Ncell Topup via Bank,Card Server Payment,Value Chain
4,666437220,2022-12-01,2079-08-15,2022-12-01,100.0,1,1,30,29,531,907,0.0,3.9,0.0,4,00:12:51,NT Topup via Bank,Card Server Payment,Value Chain


In [22]:
# merged_df_2.rename(columns={'product_name_x': 'product_name'}, inplace=True)

In [23]:
merged_df_2.shape

(151729, 19)

In [24]:
merged_df_2.nunique()

txn_id                   147309
last_modified_date          144
last_modified_date_bs       144
created_date                145
amount                     2545
status                        1
module_id                     2
product_id                  155
product_type_id              24
payer_account_id              9
receiver_account_id        1409
reward_point                 28
cash_back_amount            791
revenue_amount              439
transactor_module_id          2
time                      55637
product_name                155
product_type_name            26
txn_flow                      3
dtype: int64

In [26]:
# merged_df_3 = pd.merge(merged_df_2, product_category, on = ['product_category_id'], how='left')
# merged_df_3.shape

In [27]:
df = merged_df_2.copy()

In [28]:
df.shape

(151729, 19)

### ID to String

In [29]:
df[['txn_id', 'module_id', 'product_id', 'product_type_id', 'payer_account_id', 'receiver_account_id', 'transactor_module_id']] = \
    df[['txn_id', 'module_id', 'product_id', 'product_type_id', 'payer_account_id', 'receiver_account_id', 'transactor_module_id']].apply(lambda x: x.astype(str))

### Date 

In [30]:
df['last_modified_date'] = df['last_modified_date'].astype(str) 

In [31]:
df['dates'] = pd.to_datetime(df['last_modified_date'] + df['time'], format='%Y-%m-%d%H:%M:%S')

In [32]:
df[['last_modified_date', 'time', 'dates']]

Unnamed: 0,last_modified_date,time,dates
0,2023-03-29,14:07:40,2023-03-29 14:07:40
1,2022-12-01,00:03:41,2022-12-01 00:03:41
2,2022-12-01,00:06:20,2022-12-01 00:06:20
3,2022-12-01,00:06:32,2022-12-01 00:06:32
4,2022-12-01,00:12:51,2022-12-01 00:12:51
...,...,...,...
151724,2023-05-24,23:18:08,2023-05-24 23:18:08
151725,2023-05-24,23:21:16,2023-05-24 23:21:16
151726,2023-05-24,23:30:13,2023-05-24 23:30:13
151727,2023-05-24,23:47:34,2023-05-24 23:47:34


In [33]:

# df['last_modified_date'] = pd.to_datetime(df['last_modified_date'], errors = 'coerce') 
df[['last_modified_date', 'last_modified_date_bs', 'created_date']] = df[['last_modified_date', 'last_modified_date_bs', 'created_date']].apply(pd.to_datetime, errors='coerce')
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.time

# Creating monthly and weekly columns
df['monthly'] = df['last_modified_date'].dt.month
df['weekly'] = df['last_modified_date'].dt.isocalendar().week


### TXN Flow

In [34]:
pivot_monthly = pd.pivot_table(df, index = ['payer_account_id', 'monthly'], columns = 'txn_flow', values = 'amount', aggfunc = 'sum').reset_index()
pivot_monthly

txn_flow,payer_account_id,monthly,InFlow,OutFlow,Value Chain
0,1056,1,755.0,755.0,325.0
1,1056,2,5500.0,5500.0,674.0
2,1056,3,,,1039.0
3,1056,4,,3000.0,855.0
4,1056,5,,,1554.0
5,1056,12,,,200.0
6,1176,3,,,10.0
7,222,1,170.0,170.0,1045.0
8,222,2,250.0,250.0,1535.0
9,222,3,,,350.0


In [35]:
aggregrated_df_monthly = (
    pivot_monthly
    .groupby('payer_account_id')
    .agg(
        total_inflow_amount = ('InFlow', 'sum'),
        total_outflow_amount = ('OutFlow', 'sum'),
        total_valuechain_amount = ('Value Chain', 'sum'),
        monthly_inflow_amount = ('InFlow', 'mean'),
        monthly_outflow_amount = ('OutFlow', 'mean'),
        monthly_valuechain_amount = ('Value Chain', 'mean'),
    )
    .reset_index()
)
aggregrated_df_monthly

Unnamed: 0,payer_account_id,total_inflow_amount,total_outflow_amount,total_valuechain_amount,monthly_inflow_amount,monthly_outflow_amount,monthly_valuechain_amount
0,1056,6255.0,9255.0,4647.0,3127.5,3085.0,774.5
1,1176,0.0,0.0,10.0,,,10.0
2,222,420.0,3070.0,7635.0,210.0,767.5,1272.5
3,26,1437011.0,2997891.0,757008.0,287402.2,599578.2,126168.0
4,34,62469919.0,65480803.0,4086311.0,10411650.0,10913470.0,681051.8
5,471,54320.0,55320.0,272075.0,10864.0,11064.0,45345.83
6,531,0.0,0.0,31743311.0,,,5290552.0
7,538,0.0,0.0,10.0,,,10.0
8,56,590322.0,4009942.0,1637844.0,98387.0,668323.7,272974.0


In [36]:
pivot_monthly_count = pd.pivot_table(df, index = ['payer_account_id', 'monthly'], columns = 'txn_flow', values = 'amount', aggfunc = 'count').reset_index()
pivot_monthly_count 

txn_flow,payer_account_id,monthly,InFlow,OutFlow,Value Chain
0,1056,1,1.0,1.0,4.0
1,1056,2,1.0,1.0,7.0
2,1056,3,,,8.0
3,1056,4,,1.0,4.0
4,1056,5,,,10.0
5,1056,12,,,1.0
6,1176,3,,,1.0
7,222,1,1.0,1.0,10.0
8,222,2,1.0,1.0,11.0
9,222,3,,,3.0


In [37]:
aggregrated_df_monthly_count = (
    pivot_monthly_count
    .groupby('payer_account_id')
    .agg(
        total_inflow_count = ('InFlow', 'sum'),
        total_outflow_count = ('OutFlow', 'sum'),
        total_valuechain_count = ('Value Chain', 'sum'),
        monthly_inflow_count = ('InFlow', 'mean'),
        monthly_outflow_count = ('OutFlow', 'mean'),
        monthly_valuechain_count = ('Value Chain', 'mean'),
    )
    .reset_index()
)

In [38]:
aggregrated_df_monthly_count

Unnamed: 0,payer_account_id,total_inflow_count,total_outflow_count,total_valuechain_count,monthly_inflow_count,monthly_outflow_count,monthly_valuechain_count
0,1056,2.0,3.0,34.0,1.0,1.0,5.666667
1,1176,0.0,0.0,1.0,,,1.0
2,222,2.0,4.0,40.0,1.0,1.0,6.666667
3,26,469.0,571.0,961.0,93.8,114.2,160.166667
4,34,3477.0,3709.0,1713.0,579.5,618.166667,285.5
5,471,65.0,66.0,1255.0,13.0,13.2,209.166667
6,531,0.0,0.0,135305.0,,,22550.833333
7,538,0.0,0.0,1.0,,,1.0
8,56,405.0,554.0,3092.0,67.5,92.333333,515.333333


### Reward Points

In [39]:
reward_agg = df.groupby('payer_account_id').agg({'reward_point': 'sum'})
reward_agg

Unnamed: 0_level_0,reward_point
payer_account_id,Unnamed: 1_level_1
1056,27.0
1176,0.0
222,11.0
26,207.0
34,2639.0
471,180.0
531,0.0
538,0.0
56,805.0


### Lastest Transaction Date and Used Product

In [40]:
idx_max_date = df.groupby('payer_account_id')['dates'].idxmax()
result_df = df.loc[idx_max_date, ['payer_account_id', 'product_name', 'dates']]
result_df.rename(columns = {'product_name': 'latest_used_product', 'dates': 'latest_tran_date'}, inplace=True)
result_df


Unnamed: 0,payer_account_id,latest_used_product,latest_tran_date
145181,1056,NT Prepaid Topup,2023-05-21 19:34:59
38687,1176,Ncell Topup,2023-03-04 15:19:19
131529,222,NT Prepaid Topup,2023-05-16 14:58:00
151538,26,WorldLink,2023-05-24 20:46:04
151424,34,Send Money,2023-05-24 19:53:52
151230,471,Ncell Topup,2023-05-24 18:30:54
151728,531,Ncell Topup via Bank,2023-05-24 23:59:25
138639,538,NT Prepaid Topup,2023-05-18 18:48:37
151429,56,Ncell Topup,2023-05-24 19:57:46


### Revenue

In [41]:
monthly_revenue= pd.pivot_table(df, index = ['payer_account_id', 'monthly'], values = 'revenue_amount', aggfunc = 'sum').reset_index()
monthly_revenue

Unnamed: 0,payer_account_id,monthly,revenue_amount
0,1056,1,11.37
1,1056,2,13.3
2,1056,3,16.85
3,1056,4,18.19
4,1056,5,8.94
5,1056,12,7.0
6,1176,3,0.28
7,222,1,8.36
8,222,2,10.51
9,222,3,1.95


In [42]:
revenue = (
    monthly_revenue
    .groupby('payer_account_id')
    .agg(
        monthly_average_lifetime_revenue = ('revenue_amount', 'mean'),
        total_revenue = ('revenue_amount', 'sum'),
    )
    .reset_index()
)

revenue

Unnamed: 0,payer_account_id,monthly_average_lifetime_revenue,total_revenue
0,1056,12.608333,75.65
1,1176,0.28,0.28
2,222,10.756667,64.54
3,26,255.515,1533.09
4,34,2016.175,12097.05
5,471,161.788333,970.73
6,531,19670.311667,118021.87
7,538,0.04,0.04
8,56,1077.421667,6464.53


### This Month's Revenue

In [43]:
latest_date = df['dates'].max()
latest_month_df = df[df['dates'].dt.month == latest_date.month]

this_month_revenue = latest_month_df.groupby('payer_account_id')['revenue_amount'].sum().reset_index(name = 'this_month_revenue')
this_month_revenue

Unnamed: 0,payer_account_id,this_month_revenue
0,1056,8.94
1,222,19.65
2,26,292.27
3,34,1931.19
4,471,116.16
5,531,22591.33
6,538,0.04
7,56,1430.36


### Product Usage

In [44]:
product_usage = df.groupby('payer_account_id')['product_id'].count().reset_index().rename(columns={'product_id': 'product_usage'})
product_usage

Unnamed: 0,payer_account_id,product_usage
0,1056,39
1,1176,1
2,222,46
3,26,2001
4,34,8899
5,471,1386
6,531,135305
7,538,1
8,56,4051


### Nth Used Product

In [45]:
product_counts = df.groupby(['payer_account_id', 'product_name'])['product_name'].count().reset_index(name='count')

product_counts

Unnamed: 0,payer_account_id,product_name,count
0,1056,Dish Home Topup,1
1,1056,NT Prepaid Topup,27
2,1056,Ncell Topup,2
3,1056,Nepal Electricity Authority,4
4,1056,Send Money,4
...,...,...,...
219,56,eSewa to Garima Dev. Bank,1
220,56,eSewa to Global IME Bank,8
221,56,eSewa to NIC Asia Bank,7
222,56,eSewa to NMB Bank,6


In [46]:
sorted_product_counts = product_counts.sort_values(by=['payer_account_id', 'count'], ascending=[True, False])

most_used_product = sorted_product_counts.groupby('payer_account_id').nth(0).reset_index()
second_most_used_product = sorted_product_counts.groupby('payer_account_id').nth(1).reset_index()
third_most_used_product = sorted_product_counts.groupby('payer_account_id').nth(2).reset_index()

In [47]:
most_used_product 

Unnamed: 0,index,payer_account_id,product_name,count
0,1,1056,NT Prepaid Topup,27
1,6,1176,Ncell Topup,1
2,8,222,NT Postpaid Topup,16
3,27,26,Send Money,622
4,61,34,Send Money,6926
5,85,471,NT Prepaid Topup,638
6,150,531,NT Topup via Bank,59806
7,194,538,NT Prepaid Topup,1
8,210,56,Nepal Electricity Authority,1233


In [48]:
result = pd.merge(most_used_product, second_most_used_product, on='payer_account_id', suffixes=('_most', '_second'), how='left')
result = pd.merge(result, third_most_used_product, on='payer_account_id', how='left')

In [49]:
result = result[['payer_account_id', 'product_name_most', 'product_name_second', 'product_name']]
result.columns = ['payer_account_id', 'most_used_product', 'second_most_used_product', 'third_most_used_product']
result

Unnamed: 0,payer_account_id,most_used_product,second_most_used_product,third_most_used_product
0,1056,NT Prepaid Topup,Nepal Electricity Authority,Send Money
1,1176,Ncell Topup,,
2,222,NT Postpaid Topup,Ncell Topup,NT Prepaid Topup
3,26,Send Money,NT Prepaid Topup,Cash In
4,34,Send Money,Nepal Electricity Authority,NT Prepaid Topup
5,471,NT Prepaid Topup,Ncell Topup,Send Money
6,531,NT Topup via Bank,Ncell Topup via Bank,Wordlink Topup via Bank
7,538,NT Prepaid Topup,,
8,56,Nepal Electricity Authority,Send Money,Ncell Topup
