#### Feature Engineering  

Dataset: 
- _calls_clean.csv_
- _internet_clean.csv_
- _messages_clean.csv_
- _plans_clean.csv_
- _users_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-05-14

# Feature engineering – Purchasing Activity Dataset

## __1. Libraries__

In [1]:
from IPython.display import display, HTML
from functools import partial
import numpy as np
import os
import pandas as pd
from pathlib import Path
import sys

# Define project root dynamically, gets the current directory from whick the notebook belongs and moves one level upper
project_root = Path.cwd().parent

# Add src to sys.path if it is not already
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import function directly (more controlled than import *)
from src import *

## __2. Path to Data file__

In [2]:
# Build route to data file and upload
data_file_path = project_root / "data" / "processed" / "clean"
df_calls = load_dataset_from_csv(data_file_path, "calls_clean.csv", sep=',', header='infer')
df_internet = load_dataset_from_csv(data_file_path, "internet_clean.csv", sep=',', header='infer')
df_messages = load_dataset_from_csv(data_file_path, "messages_clean.csv", sep=',', header='infer')
df_plans = load_dataset_from_csv(data_file_path, "plans_clean.csv", sep=',', header='infer')
df_users = load_dataset_from_csv(data_file_path, "users_clean.csv", sep=',', header='infer')


In [3]:
# Format notebook output
format_notebook()

## 3 __Functions__

In [4]:
# Function for calculating the revenue per user
def revenue(row, revenue_type, column):
    # Plan parameters
    plans = {
        'surf': {
            'base_fee': 20, 'minutes_limit': 500, 'messages_limit': 50, 'mb_limit': 15360,
            'minute_over': 0.03, 'message_over': 0.03, 'mb_over': 0.01
        },
        'ultimate': {
            'base_fee': 70, 'minutes_limit': 3000, 'messages_limit': 1000, 'mb_limit': 30720,
            'minute_over': 0.01, 'message_over': 0.01, 'mb_over': 0.007
        }
    }
    
    # Basic validations

    if revenue_type not in {'total', 'duration', 'messages', 'traffic'}:
        display(HTML(f"> <b>Invalid input</b>: <i>{revenue_type}</i>, correct options: (<b>'total', 'duration', 'messages', 'traffic'</b>)"))

    plan = plans[row['plan']]
    if not plan:
        display(HTML(f"> Invalid plan: {row['plan']}, correct options: ('surf', 'ultimate')"))
    
    # Calculate overages
    if revenue_type == 'duration':
        income_duration = max(0, row[column] - plan['minutes_limit']) * plan['minute_over']
        return plan['base_fee'] + income_duration
    elif revenue_type == 'messages':
        income_messages = max(0, row[column] - plan['messages_limit']) * plan['message_over']
        return plan['base_fee'] + income_messages
    elif revenue_type == 'traffic':
        income_traffic = max(0, row[column] - plan['mb_limit']) * plan['mb_over']
        return plan['base_fee'] + income_traffic
    elif revenue_type == 'total':
        income_duration = max(0, row[column[0]] - plan['minutes_limit']) * plan['minute_over']
        income_messages = max(0, row[column[1]] - plan['messages_limit']) * plan['message_over']
        income_traffic = max(0, row[column[2]] - plan['mb_limit']) * plan['mb_over']
        return plan['base_fee'] + income_duration + income_messages + income_traffic
    else:
        raise ValueError(f"Invalid income_type: {revenue_type}")



## 4 __Casting to data types__

### 3.1 Casting to string data type

In [5]:
# df_users 'first_name' to string
df_users['first_name'] = df_users['first_name'].astype('string')
df_users['first_name'].dtypes

string[python]

In [6]:
# df_users 'last_name' to string
df_users['last_name'] = df_users['last_name'].astype('string')
df_users['last_name'].dtypes

string[python]

In [7]:
# df_users 'city to string
df_users['city'] = df_users['city'].astype('string')
df_users['city'].dtypes

string[python]

### 3.2 Casting to category data type

In [8]:
# df_plans 'plan_name' to category
df_plans['plan_name'] = df_plans['plan_name'].astype('category')
df_plans['plan_name'].dtypes


CategoricalDtype(categories=['surf', 'ultimate'], ordered=False, categories_dtype=object)

In [9]:
# df_users 'plan' to category
df_users['plan'] = df_users['plan'].astype('category')
df_users['plan'].dtypes

CategoricalDtype(categories=['surf', 'ultimate'], ordered=False, categories_dtype=object)

### 3.3 Casting to datetime data type

In [10]:
# df_calls 'call_date' to datetime
df_calls = normalize_datetime(df_calls, include=['call_date'], frmt='%Y-%m-%d')
df_calls.dtypes

call_id                   object
user_id                    int64
call_date    datetime64[ns, UTC]
duration                 float64
dtype: object

In [11]:
# df_internet 'session_date' to datetime
df_internet = normalize_datetime(df_internet, include=['session_date'], frmt='%Y-%m-%d')
df_internet.dtypes

session_id                   object
user_id                       int64
session_date    datetime64[ns, UTC]
mb_used                     float64
dtype: object

In [12]:
# df_messages 'messages_date' to datetime
df_messages = normalize_datetime(df_messages, include=['message_date'], frmt='%Y-%m-%d')
df_messages.dtypes

message_id                   object
user_id                       int64
message_date    datetime64[ns, UTC]
dtype: object

In [13]:
# df_users 'reg_date' to datetime
df_users = normalize_datetime(df_users, include=['reg_date'], frmt='%Y-%m-%d')


In [14]:
# df_users 'churn_date' to datetime
df_users = normalize_datetime(df_users, include=['churn_date'], frmt='%Y-%m-%d')
df_users.dtypes

user_id                     int64
first_name         string[python]
last_name          string[python]
age                         int64
city               string[python]
reg_date      datetime64[ns, UTC]
plan                     category
churn_date    datetime64[ns, UTC]
dtype: object

## 4. Feature Engineering

### 4.1 Datasets

#### 4.1.1 Calls

In [15]:
df_calls

Unnamed: 0,call_id,user_id,call_date,duration
0,1000_93,1000,2018-12-27 00:00:00+00:00,8.52
1,1000_145,1000,2018-12-27 00:00:00+00:00,13.66
2,1000_247,1000,2018-12-27 00:00:00+00:00,14.48
3,1000_309,1000,2018-12-28 00:00:00+00:00,5.76
4,1000_380,1000,2018-12-30 00:00:00+00:00,4.22
...,...,...,...,...
137730,1499_199,1499,2018-11-21 00:00:00+00:00,8.72
137731,1499_200,1499,2018-10-20 00:00:00+00:00,10.89
137732,1499_201,1499,2018-09-21 00:00:00+00:00,8.12
137733,1499_202,1499,2018-10-10 00:00:00+00:00,0.37


In [16]:
# Each individual call is rounded up: even if the call lasted only one second
df_calls['duration'] = np.ceil(df_calls['duration'])

In [17]:
# Add Month
df_calls['month'] = df_calls['call_date'].dt.month

In [18]:
# Add Day
df_calls['day'] = df_calls['call_date'].dt.day

In [19]:
df_calls

Unnamed: 0,call_id,user_id,call_date,duration,month,day
0,1000_93,1000,2018-12-27 00:00:00+00:00,9.0,12,27
1,1000_145,1000,2018-12-27 00:00:00+00:00,14.0,12,27
2,1000_247,1000,2018-12-27 00:00:00+00:00,15.0,12,27
3,1000_309,1000,2018-12-28 00:00:00+00:00,6.0,12,28
4,1000_380,1000,2018-12-30 00:00:00+00:00,5.0,12,30
...,...,...,...,...,...,...
137730,1499_199,1499,2018-11-21 00:00:00+00:00,9.0,11,21
137731,1499_200,1499,2018-10-20 00:00:00+00:00,11.0,10,20
137732,1499_201,1499,2018-09-21 00:00:00+00:00,9.0,9,21
137733,1499_202,1499,2018-10-10 00:00:00+00:00,1.0,10,10


In [20]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "calls_feature.csv"

df_calls.to_csv(processed_path, index=False)

#### 4.1.2 Internet

In [21]:
df_internet

Unnamed: 0,session_id,user_id,session_date,mb_used
0,1000_13,1000,2018-12-29 00:00:00+00:00,89.86
1,1000_204,1000,2018-12-31 00:00:00+00:00,0.00
2,1000_379,1000,2018-12-28 00:00:00+00:00,660.40
3,1000_413,1000,2018-12-26 00:00:00+00:00,270.99
4,1000_442,1000,2018-12-27 00:00:00+00:00,880.22
...,...,...,...,...
104820,1499_215,1499,2018-10-20 00:00:00+00:00,218.06
104821,1499_216,1499,2018-12-30 00:00:00+00:00,304.72
104822,1499_217,1499,2018-09-22 00:00:00+00:00,292.75
104823,1499_218,1499,2018-12-07 00:00:00+00:00,0.00


In [22]:
# Add Month
df_internet['month'] = df_internet['session_date'].dt.month

In [23]:
# Add Day
df_internet['day'] = df_internet['session_date'].dt.day

In [24]:
df_internet

Unnamed: 0,session_id,user_id,session_date,mb_used,month,day
0,1000_13,1000,2018-12-29 00:00:00+00:00,89.86,12,29
1,1000_204,1000,2018-12-31 00:00:00+00:00,0.00,12,31
2,1000_379,1000,2018-12-28 00:00:00+00:00,660.40,12,28
3,1000_413,1000,2018-12-26 00:00:00+00:00,270.99,12,26
4,1000_442,1000,2018-12-27 00:00:00+00:00,880.22,12,27
...,...,...,...,...,...,...
104820,1499_215,1499,2018-10-20 00:00:00+00:00,218.06,10,20
104821,1499_216,1499,2018-12-30 00:00:00+00:00,304.72,12,30
104822,1499_217,1499,2018-09-22 00:00:00+00:00,292.75,9,22
104823,1499_218,1499,2018-12-07 00:00:00+00:00,0.00,12,7


In [25]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "internet_feature.csv"

df_internet.to_csv(processed_path, index=False)

#### 4.1.3 Messages

In [26]:
df_messages

Unnamed: 0,message_id,user_id,message_date
0,1000_125,1000,2018-12-27 00:00:00+00:00
1,1000_160,1000,2018-12-31 00:00:00+00:00
2,1000_223,1000,2018-12-31 00:00:00+00:00
3,1000_251,1000,2018-12-27 00:00:00+00:00
4,1000_255,1000,2018-12-26 00:00:00+00:00
...,...,...,...
76046,1497_526,1497,2018-12-24 00:00:00+00:00
76047,1497_536,1497,2018-12-24 00:00:00+00:00
76048,1497_547,1497,2018-12-31 00:00:00+00:00
76049,1497_558,1497,2018-12-24 00:00:00+00:00


In [27]:
# Add Month
df_messages['month'] = df_messages['message_date'].dt.month

In [28]:
# Add Day
df_messages['day'] = df_messages['message_date'].dt.day

In [29]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "messages_feature.csv"

df_messages.to_csv(processed_path, index=False)

#### 4.1.4 Users

In [30]:
df_users

Unnamed: 0,user_id,first_name,last_name,age,city,reg_date,plan,churn_date
0,1000,anamaria,bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",2018-12-24 00:00:00+00:00,ultimate,NaT
1,1001,mickey,wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",2018-08-13 00:00:00+00:00,surf,NaT
2,1002,carlee,hoffman,36,"las_vegas_henderson_paradise,_nv_msa",2018-10-21 00:00:00+00:00,surf,NaT
3,1003,reynaldo,jenkins,52,"tulsa,_ok_msa",2018-01-28 00:00:00+00:00,surf,NaT
4,1004,leonila,thompson,40,"seattle_tacoma_bellevue,_wa_msa",2018-05-23 00:00:00+00:00,surf,NaT
...,...,...,...,...,...,...,...,...
495,1495,fidel,sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",2018-09-04 00:00:00+00:00,surf,NaT
496,1496,ariel,shepherd,49,"new_orleans_metairie,_la_msa",2018-02-20 00:00:00+00:00,surf,NaT
497,1497,donte,barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",2018-12-10 00:00:00+00:00,ultimate,NaT
498,1498,scot,williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",2018-02-04 00:00:00+00:00,surf,NaT


In [31]:
# Full Name
df_users['full_name'] = df_users['first_name'] + "_" + df_users['last_name']

In [32]:
# Add Month 'reg_date'
df_users['reg_month'] = df_users['reg_date'].dt.month

In [33]:
# Add Day 'reg_date'
df_users['reg_day'] = df_users['reg_date'].dt.day

In [34]:
# Add Month 'churn_date'
df_users['churn_month'] = df_users['churn_date'].dt.month

In [35]:
# Add Day 'churn_date'
df_users['churn_day'] = df_users['churn_date'].dt.day

In [36]:
df_users

Unnamed: 0,user_id,first_name,last_name,age,city,reg_date,plan,churn_date,full_name,reg_month,reg_day,churn_month,churn_day
0,1000,anamaria,bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",2018-12-24 00:00:00+00:00,ultimate,NaT,anamaria_bauer,12,24,,
1,1001,mickey,wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",2018-08-13 00:00:00+00:00,surf,NaT,mickey_wilkerson,8,13,,
2,1002,carlee,hoffman,36,"las_vegas_henderson_paradise,_nv_msa",2018-10-21 00:00:00+00:00,surf,NaT,carlee_hoffman,10,21,,
3,1003,reynaldo,jenkins,52,"tulsa,_ok_msa",2018-01-28 00:00:00+00:00,surf,NaT,reynaldo_jenkins,1,28,,
4,1004,leonila,thompson,40,"seattle_tacoma_bellevue,_wa_msa",2018-05-23 00:00:00+00:00,surf,NaT,leonila_thompson,5,23,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1495,fidel,sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",2018-09-04 00:00:00+00:00,surf,NaT,fidel_sharpe,9,4,,
496,1496,ariel,shepherd,49,"new_orleans_metairie,_la_msa",2018-02-20 00:00:00+00:00,surf,NaT,ariel_shepherd,2,20,,
497,1497,donte,barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",2018-12-10 00:00:00+00:00,ultimate,NaT,donte_barrera,12,10,,
498,1498,scot,williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",2018-02-04 00:00:00+00:00,surf,NaT,scot_williamson,2,4,,


In [37]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "users_feature.csv"

df_users.to_csv(processed_path, index=False)

### 4.2 Calls activity

In [38]:
df_calls

Unnamed: 0,call_id,user_id,call_date,duration,month,day
0,1000_93,1000,2018-12-27 00:00:00+00:00,9.0,12,27
1,1000_145,1000,2018-12-27 00:00:00+00:00,14.0,12,27
2,1000_247,1000,2018-12-27 00:00:00+00:00,15.0,12,27
3,1000_309,1000,2018-12-28 00:00:00+00:00,6.0,12,28
4,1000_380,1000,2018-12-30 00:00:00+00:00,5.0,12,30
...,...,...,...,...,...,...
137730,1499_199,1499,2018-11-21 00:00:00+00:00,9.0,11,21
137731,1499_200,1499,2018-10-20 00:00:00+00:00,11.0,10,20
137732,1499_201,1499,2018-09-21 00:00:00+00:00,9.0,9,21
137733,1499_202,1499,2018-10-10 00:00:00+00:00,1.0,10,10


#### 4.2.1 Calls amount and duration per user

In [39]:
df_calls_user_total = df_calls.groupby('user_id')['call_id'].count().reset_index()
df_calls_user_total = df_calls_user_total.rename(columns={'call_id': 'total_calls'})
df_calls_user_total

Unnamed: 0,user_id,total_calls
0,1000,16
1,1001,261
2,1002,113
3,1003,149
4,1004,370
...,...,...
476,1495,253
477,1496,195
478,1497,54
479,1498,451


In [40]:
df_duration_user_month = df_calls.groupby(['user_id', 'month'])['duration'].sum().reset_index()
df_duration_user_month = df_duration_user_month.rename(columns={'duration': 'total_duration_month'})
df_duration_user_month

Unnamed: 0,user_id,month,total_duration_month
0,1000,12,124.0
1,1001,8,182.0
2,1001,9,315.0
3,1001,10,393.0
4,1001,11,426.0
...,...,...,...
2253,1498,12,339.0
2254,1499,9,346.0
2255,1499,10,385.0
2256,1499,11,308.0


In [41]:
df_duration_user_month = df_duration_user_month.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_duration_user_month

Unnamed: 0,user_id,month,total_duration_month,plan
0,1000,12,124.0,ultimate
1,1001,8,182.0,surf
2,1001,9,315.0,surf
3,1001,10,393.0,surf
4,1001,11,426.0,surf
...,...,...,...,...
2253,1498,12,339.0,surf
2254,1499,9,346.0,surf
2255,1499,10,385.0,surf
2256,1499,11,308.0,surf


In [42]:
df_duration_user_month['total_duration_revenue_month'] = np.round(df_duration_user_month.apply(lambda row: revenue(row, 'duration', 'total_duration_month'), axis=1), 3)
df_duration_user_month

Unnamed: 0,user_id,month,total_duration_month,plan,total_duration_revenue_month
0,1000,12,124.0,ultimate,70.0
1,1001,8,182.0,surf,20.0
2,1001,9,315.0,surf,20.0
3,1001,10,393.0,surf,20.0
4,1001,11,426.0,surf,20.0
...,...,...,...,...,...
2253,1498,12,339.0,surf,20.0
2254,1499,9,346.0,surf,20.0
2255,1499,10,385.0,surf,20.0
2256,1499,11,308.0,surf,20.0


In [43]:
df_duration_user_total = df_duration_user_month.groupby('user_id').agg({'total_duration_month': 'sum', 'total_duration_revenue_month': 'sum'}).reset_index()
df_duration_user_total

Unnamed: 0,user_id,total_duration_month,total_duration_revenue_month
0,1000,124.0,70.00
1,1001,1728.0,100.00
2,1002,829.0,60.00
3,1003,1104.0,38.12
4,1004,2772.0,160.00
...,...,...,...
476,1495,1765.0,83.30
477,1496,1455.0,100.00
478,1497,300.0,70.00
479,1498,3211.0,220.00


In [44]:
df_calls_duration_total = pd.merge(df_calls_user_total, df_duration_user_total, how='outer', on='user_id')
df_calls_duration_total = df_calls_duration_total.rename(columns={'total_duration_month': 'total_duration', 'total_duration_revenue_month': 'total_duration_revenue'})
df_calls_duration_total

Unnamed: 0,user_id,total_calls,total_duration,total_duration_revenue
0,1000,16,124.0,70.00
1,1001,261,1728.0,100.00
2,1002,113,829.0,60.00
3,1003,149,1104.0,38.12
4,1004,370,2772.0,160.00
...,...,...,...,...
476,1495,253,1765.0,83.30
477,1496,195,1455.0,100.00
478,1497,54,300.0,70.00
479,1498,451,3211.0,220.00


In [45]:
df_calls_duration_total = df_calls_duration_total.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_calls_duration_total

Unnamed: 0,user_id,total_calls,total_duration,total_duration_revenue,plan
0,1000,16,124.0,70.00,ultimate
1,1001,261,1728.0,100.00,surf
2,1002,113,829.0,60.00,surf
3,1003,149,1104.0,38.12,surf
4,1004,370,2772.0,160.00,surf
...,...,...,...,...,...
476,1495,253,1765.0,83.30,surf
477,1496,195,1455.0,100.00,surf
478,1497,54,300.0,70.00,ultimate
479,1498,451,3211.0,220.00,surf


In [46]:
df_calls_duration_total = df_calls_duration_total.merge(df_users[['user_id', 'city']], on='user_id', how='left')
df_calls_duration_total

Unnamed: 0,user_id,total_calls,total_duration,total_duration_revenue,plan,city
0,1000,16,124.0,70.00,ultimate,"atlanta_sandy_springs_roswell,_ga_msa"
1,1001,261,1728.0,100.00,surf,"seattle_tacoma_bellevue,_wa_msa"
2,1002,113,829.0,60.00,surf,"las_vegas_henderson_paradise,_nv_msa"
3,1003,149,1104.0,38.12,surf,"tulsa,_ok_msa"
4,1004,370,2772.0,160.00,surf,"seattle_tacoma_bellevue,_wa_msa"
...,...,...,...,...,...,...
476,1495,253,1765.0,83.30,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"
477,1496,195,1455.0,100.00,surf,"new_orleans_metairie,_la_msa"
478,1497,54,300.0,70.00,ultimate,"los_angeles_long_beach_anaheim,_ca_msa"
479,1498,451,3211.0,220.00,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"


In [47]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "total" / "calls_duration_total.csv"

df_calls_duration_total.to_csv(processed_path, index=False)

### 4.3 Messages activity

In [48]:
df_messages

Unnamed: 0,message_id,user_id,message_date,month,day
0,1000_125,1000,2018-12-27 00:00:00+00:00,12,27
1,1000_160,1000,2018-12-31 00:00:00+00:00,12,31
2,1000_223,1000,2018-12-31 00:00:00+00:00,12,31
3,1000_251,1000,2018-12-27 00:00:00+00:00,12,27
4,1000_255,1000,2018-12-26 00:00:00+00:00,12,26
...,...,...,...,...,...
76046,1497_526,1497,2018-12-24 00:00:00+00:00,12,24
76047,1497_536,1497,2018-12-24 00:00:00+00:00,12,24
76048,1497_547,1497,2018-12-31 00:00:00+00:00,12,31
76049,1497_558,1497,2018-12-24 00:00:00+00:00,12,24


#### 4.3.1 Messages amount per user

In [49]:
df_messages_user_month = df_messages.groupby(['user_id', 'month'])['message_id'].count().reset_index()
df_messages_user_month = df_messages_user_month.rename(columns={'message_id': 'total_messages_month'})
df_messages_user_month

Unnamed: 0,user_id,month,total_messages_month
0,1000,12,11
1,1001,8,30
2,1001,9,44
3,1001,10,53
4,1001,11,36
...,...,...,...
1801,1496,9,21
1802,1496,10,18
1803,1496,11,13
1804,1496,12,11


In [50]:
df_messages_user_month = df_messages_user_month.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_messages_user_month

Unnamed: 0,user_id,month,total_messages_month,plan
0,1000,12,11,ultimate
1,1001,8,30,surf
2,1001,9,44,surf
3,1001,10,53,surf
4,1001,11,36,surf
...,...,...,...,...
1801,1496,9,21,surf
1802,1496,10,18,surf
1803,1496,11,13,surf
1804,1496,12,11,surf


In [51]:
df_messages_user_month['total_messages_revenue_month'] = np.round(df_messages_user_month.apply(lambda row: revenue(row, 'messages', 'total_messages_month'), axis=1), 3)
df_messages_user_month

Unnamed: 0,user_id,month,total_messages_month,plan,total_messages_revenue_month
0,1000,12,11,ultimate,70.00
1,1001,8,30,surf,20.00
2,1001,9,44,surf,20.00
3,1001,10,53,surf,20.09
4,1001,11,36,surf,20.00
...,...,...,...,...,...
1801,1496,9,21,surf,20.00
1802,1496,10,18,surf,20.00
1803,1496,11,13,surf,20.00
1804,1496,12,11,surf,20.00


In [52]:
df_messages_user_total = df_messages_user_month.groupby('user_id').agg({'total_messages_month': 'sum', 'total_messages_revenue_month': 'sum'}).reset_index()
df_messages_user_total = df_messages_user_total.rename(columns={'total_messages_month': 'total_messages', 'total_messages_revenue_month': 'total_messages_revenue'} )
df_messages_user_total

Unnamed: 0,user_id,total_messages,total_messages_revenue
0,1000,11,70.00
1,1001,207,100.09
2,1002,88,60.00
3,1003,50,20.00
4,1004,177,160.00
...,...,...,...
397,1491,409,180.57
398,1492,108,80.00
399,1494,174,120.00
400,1496,65,100.00


In [53]:
df_messages_user_total = df_messages_user_total.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_messages_user_total

Unnamed: 0,user_id,total_messages,total_messages_revenue,plan
0,1000,11,70.00,ultimate
1,1001,207,100.09,surf
2,1002,88,60.00,surf
3,1003,50,20.00,surf
4,1004,177,160.00,surf
...,...,...,...,...
397,1491,409,180.57,surf
398,1492,108,80.00,surf
399,1494,174,120.00,surf
400,1496,65,100.00,surf


In [54]:
df_messages_user_total = df_messages_user_total.merge(df_users[['user_id', 'city']], on='user_id', how='left')
df_messages_user_total

Unnamed: 0,user_id,total_messages,total_messages_revenue,plan,city
0,1000,11,70.00,ultimate,"atlanta_sandy_springs_roswell,_ga_msa"
1,1001,207,100.09,surf,"seattle_tacoma_bellevue,_wa_msa"
2,1002,88,60.00,surf,"las_vegas_henderson_paradise,_nv_msa"
3,1003,50,20.00,surf,"tulsa,_ok_msa"
4,1004,177,160.00,surf,"seattle_tacoma_bellevue,_wa_msa"
...,...,...,...,...,...
397,1491,409,180.57,surf,"grand_rapids_kentwood,_mi_msa"
398,1492,108,80.00,surf,"portland_vancouver_hillsboro,_or_wa_msa"
399,1494,174,120.00,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"
400,1496,65,100.00,surf,"new_orleans_metairie,_la_msa"


In [55]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "total" / "messages_total.csv"

df_messages_user_total.to_csv(processed_path, index=False)

### 4.4 Internet activity

In [56]:
df_internet

Unnamed: 0,session_id,user_id,session_date,mb_used,month,day
0,1000_13,1000,2018-12-29 00:00:00+00:00,89.86,12,29
1,1000_204,1000,2018-12-31 00:00:00+00:00,0.00,12,31
2,1000_379,1000,2018-12-28 00:00:00+00:00,660.40,12,28
3,1000_413,1000,2018-12-26 00:00:00+00:00,270.99,12,26
4,1000_442,1000,2018-12-27 00:00:00+00:00,880.22,12,27
...,...,...,...,...,...,...
104820,1499_215,1499,2018-10-20 00:00:00+00:00,218.06,10,20
104821,1499_216,1499,2018-12-30 00:00:00+00:00,304.72,12,30
104822,1499_217,1499,2018-09-22 00:00:00+00:00,292.75,9,22
104823,1499_218,1499,2018-12-07 00:00:00+00:00,0.00,12,7


#### 4.4.1 Internet sessions and traffic per user

In [57]:
df_sessions_user_total = df_internet.groupby('user_id')['session_id'].count().reset_index()
df_sessions_user_total = df_sessions_user_total.rename(columns={'session_id': 'total_sessions'})
df_sessions_user_total

Unnamed: 0,user_id,total_sessions
0,1000,5
1,1001,245
2,1002,124
3,1003,52
4,1004,460
...,...,...
484,1495,290
485,1496,225
486,1497,31
487,1498,657


In [58]:
# Individual web sessions are not rounded up. Instead, the monthly total is rounded up
df_traffic_user_month = df_internet.groupby(['user_id', 'month'])['mb_used'].sum().reset_index()
df_traffic_user_month = df_traffic_user_month.rename(columns={'mb_used': 'total_traffic_month'})
df_traffic_user_month['total_traffic_month'] = np.ceil(df_traffic_user_month['total_traffic_month'] / 1000) * 1000
df_traffic_user_month

Unnamed: 0,user_id,month,total_traffic_month
0,1000,12,2000.0
1,1001,8,7000.0
2,1001,9,14000.0
3,1001,10,23000.0
4,1001,11,19000.0
...,...,...,...
2272,1498,12,24000.0
2273,1499,9,13000.0
2274,1499,10,20000.0
2275,1499,11,17000.0


In [59]:
df_traffic_user_month = df_traffic_user_month.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_traffic_user_month

Unnamed: 0,user_id,month,total_traffic_month,plan
0,1000,12,2000.0,ultimate
1,1001,8,7000.0,surf
2,1001,9,14000.0,surf
3,1001,10,23000.0,surf
4,1001,11,19000.0,surf
...,...,...,...,...
2272,1498,12,24000.0,surf
2273,1499,9,13000.0,surf
2274,1499,10,20000.0,surf
2275,1499,11,17000.0,surf


In [60]:
df_traffic_user_month['total_traffic_revenue_month'] = np.round(df_traffic_user_month.apply(lambda row: revenue(row, 'traffic', 'total_traffic_month'), axis=1), 3)
df_traffic_user_month

Unnamed: 0,user_id,month,total_traffic_month,plan,total_traffic_revenue_month
0,1000,12,2000.0,ultimate,70.0
1,1001,8,7000.0,surf,20.0
2,1001,9,14000.0,surf,20.0
3,1001,10,23000.0,surf,96.4
4,1001,11,19000.0,surf,56.4
...,...,...,...,...,...
2272,1498,12,24000.0,surf,106.4
2273,1499,9,13000.0,surf,20.0
2274,1499,10,20000.0,surf,66.4
2275,1499,11,17000.0,surf,36.4


In [61]:
df_traffic_user_total = np.round(df_traffic_user_month.groupby('user_id').agg({'total_traffic_month': 'sum', 'total_traffic_revenue_month': 'sum'}).reset_index(), 3)
df_traffic_user_total = df_traffic_user_total.rename(columns={'total_traffic_month': 'total_traffic', 'total_traffic_revenue_month': 'total_traffic_revenue'})
df_traffic_user_total

Unnamed: 0,user_id,total_traffic,total_traffic_revenue
0,1000,2000.0,70.0
1,1001,83000.0,259.2
2,1002,42000.0,106.4
3,1003,28000.0,146.4
4,1004,159000.0,608.4
...,...,...,...
484,1495,100000.0,465.6
485,1496,67000.0,132.8
486,1497,12000.0,70.0
487,1498,232000.0,854.0


In [62]:
df_sessions_traffic_total = pd.merge(df_sessions_user_total, df_traffic_user_total, how='outer', on='user_id')
df_sessions_traffic_total

Unnamed: 0,user_id,total_sessions,total_traffic,total_traffic_revenue
0,1000,5,2000.0,70.0
1,1001,245,83000.0,259.2
2,1002,124,42000.0,106.4
3,1003,52,28000.0,146.4
4,1004,460,159000.0,608.4
...,...,...,...,...
484,1495,290,100000.0,465.6
485,1496,225,67000.0,132.8
486,1497,31,12000.0,70.0
487,1498,657,232000.0,854.0


In [63]:
df_sessions_traffic_total = df_sessions_traffic_total.merge(df_users[['user_id', 'plan']], on='user_id', how='left')
df_sessions_traffic_total

Unnamed: 0,user_id,total_sessions,total_traffic,total_traffic_revenue,plan
0,1000,5,2000.0,70.0,ultimate
1,1001,245,83000.0,259.2,surf
2,1002,124,42000.0,106.4,surf
3,1003,52,28000.0,146.4,surf
4,1004,460,159000.0,608.4,surf
...,...,...,...,...,...
484,1495,290,100000.0,465.6,surf
485,1496,225,67000.0,132.8,surf
486,1497,31,12000.0,70.0,ultimate
487,1498,657,232000.0,854.0,surf


In [64]:
df_sessions_traffic_total = df_sessions_traffic_total.merge(df_users[['user_id', 'city']], on='user_id', how='left')
df_sessions_traffic_total

Unnamed: 0,user_id,total_sessions,total_traffic,total_traffic_revenue,plan,city
0,1000,5,2000.0,70.0,ultimate,"atlanta_sandy_springs_roswell,_ga_msa"
1,1001,245,83000.0,259.2,surf,"seattle_tacoma_bellevue,_wa_msa"
2,1002,124,42000.0,106.4,surf,"las_vegas_henderson_paradise,_nv_msa"
3,1003,52,28000.0,146.4,surf,"tulsa,_ok_msa"
4,1004,460,159000.0,608.4,surf,"seattle_tacoma_bellevue,_wa_msa"
...,...,...,...,...,...,...
484,1495,290,100000.0,465.6,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"
485,1496,225,67000.0,132.8,surf,"new_orleans_metairie,_la_msa"
486,1497,31,12000.0,70.0,ultimate,"los_angeles_long_beach_anaheim,_ca_msa"
487,1498,657,232000.0,854.0,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"


In [65]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "total" / "internet_total.csv"

df_sessions_traffic_total.to_csv(processed_path, index=False)

### 4.5 Revenue

#### 4.5.1 Total revenue per user

In [66]:
df_revenue_total = df_users.loc[:, ['user_id', 'full_name', 'age', 'city', 'reg_day', 'reg_month', 'churn_day', 'churn_month', 'plan']]
df_revenue_total

Unnamed: 0,user_id,full_name,age,city,reg_day,reg_month,churn_day,churn_month,plan
0,1000,anamaria_bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",24,12,,,ultimate
1,1001,mickey_wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",13,8,,,surf
2,1002,carlee_hoffman,36,"las_vegas_henderson_paradise,_nv_msa",21,10,,,surf
3,1003,reynaldo_jenkins,52,"tulsa,_ok_msa",28,1,,,surf
4,1004,leonila_thompson,40,"seattle_tacoma_bellevue,_wa_msa",23,5,,,surf
...,...,...,...,...,...,...,...,...,...
495,1495,fidel_sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,9,,,surf
496,1496,ariel_shepherd,49,"new_orleans_metairie,_la_msa",20,2,,,surf
497,1497,donte_barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",10,12,,,ultimate
498,1498,scot_williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,2,,,surf


In [67]:
df_revenue_total = df_revenue_total.merge(df_calls_duration_total[['user_id', 'total_duration', 'total_duration_revenue']], on='user_id', how='left')
df_revenue_total = df_revenue_total.rename(columns={'total_duration': 'total_calls_duration', 'total_duration_revenue': 'total_calls_duration_revenue'})
df_revenue_total

Unnamed: 0,user_id,full_name,age,city,reg_day,reg_month,churn_day,churn_month,plan,total_calls_duration,total_calls_duration_revenue
0,1000,anamaria_bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",24,12,,,ultimate,124.0,70.00
1,1001,mickey_wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",13,8,,,surf,1728.0,100.00
2,1002,carlee_hoffman,36,"las_vegas_henderson_paradise,_nv_msa",21,10,,,surf,829.0,60.00
3,1003,reynaldo_jenkins,52,"tulsa,_ok_msa",28,1,,,surf,1104.0,38.12
4,1004,leonila_thompson,40,"seattle_tacoma_bellevue,_wa_msa",23,5,,,surf,2772.0,160.00
...,...,...,...,...,...,...,...,...,...,...,...
495,1495,fidel_sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,9,,,surf,1765.0,83.30
496,1496,ariel_shepherd,49,"new_orleans_metairie,_la_msa",20,2,,,surf,1455.0,100.00
497,1497,donte_barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",10,12,,,ultimate,300.0,70.00
498,1498,scot_williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,2,,,surf,3211.0,220.00


In [68]:
df_revenue_total = df_revenue_total.merge(df_messages_user_total[['user_id', 'total_messages', 'total_messages_revenue']], on='user_id', how='left')
df_revenue_total = df_revenue_total.rename(columns={'total_messages': 'total_messages_amount', 'total_messages_revenue': 'total_messages_amount_revenue'})
df_revenue_total

Unnamed: 0,user_id,full_name,age,city,reg_day,reg_month,churn_day,churn_month,plan,total_calls_duration,total_calls_duration_revenue,total_messages_amount,total_messages_amount_revenue
0,1000,anamaria_bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",24,12,,,ultimate,124.0,70.00,11.0,70.00
1,1001,mickey_wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",13,8,,,surf,1728.0,100.00,207.0,100.09
2,1002,carlee_hoffman,36,"las_vegas_henderson_paradise,_nv_msa",21,10,,,surf,829.0,60.00,88.0,60.00
3,1003,reynaldo_jenkins,52,"tulsa,_ok_msa",28,1,,,surf,1104.0,38.12,50.0,20.00
4,1004,leonila_thompson,40,"seattle_tacoma_bellevue,_wa_msa",23,5,,,surf,2772.0,160.00,177.0,160.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1495,fidel_sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,9,,,surf,1765.0,83.30,,
496,1496,ariel_shepherd,49,"new_orleans_metairie,_la_msa",20,2,,,surf,1455.0,100.00,65.0,100.00
497,1497,donte_barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",10,12,,,ultimate,300.0,70.00,50.0,70.00
498,1498,scot_williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,2,,,surf,3211.0,220.00,,


In [69]:
df_sessions_traffic_total

Unnamed: 0,user_id,total_sessions,total_traffic,total_traffic_revenue,plan,city
0,1000,5,2000.0,70.0,ultimate,"atlanta_sandy_springs_roswell,_ga_msa"
1,1001,245,83000.0,259.2,surf,"seattle_tacoma_bellevue,_wa_msa"
2,1002,124,42000.0,106.4,surf,"las_vegas_henderson_paradise,_nv_msa"
3,1003,52,28000.0,146.4,surf,"tulsa,_ok_msa"
4,1004,460,159000.0,608.4,surf,"seattle_tacoma_bellevue,_wa_msa"
...,...,...,...,...,...,...
484,1495,290,100000.0,465.6,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"
485,1496,225,67000.0,132.8,surf,"new_orleans_metairie,_la_msa"
486,1497,31,12000.0,70.0,ultimate,"los_angeles_long_beach_anaheim,_ca_msa"
487,1498,657,232000.0,854.0,surf,"new_york_newark_jersey_city,_ny_nj_pa_msa"


In [70]:
df_revenue_total = df_revenue_total.merge(df_sessions_traffic_total[['user_id', 'total_traffic', 'total_traffic_revenue']], on='user_id', how='left')
df_revenue_total = df_revenue_total.rename(columns={'total_traffic': 'total_traffic_usage', 'total_traffic_revenue': 'total_traffic_usage_revenue'})
df_revenue_total

Unnamed: 0,user_id,full_name,age,city,reg_day,reg_month,churn_day,churn_month,plan,total_calls_duration,total_calls_duration_revenue,total_messages_amount,total_messages_amount_revenue,total_traffic_usage,total_traffic_usage_revenue
0,1000,anamaria_bauer,45,"atlanta_sandy_springs_roswell,_ga_msa",24,12,,,ultimate,124.0,70.00,11.0,70.00,2000.0,70.0
1,1001,mickey_wilkerson,28,"seattle_tacoma_bellevue,_wa_msa",13,8,,,surf,1728.0,100.00,207.0,100.09,83000.0,259.2
2,1002,carlee_hoffman,36,"las_vegas_henderson_paradise,_nv_msa",21,10,,,surf,829.0,60.00,88.0,60.00,42000.0,106.4
3,1003,reynaldo_jenkins,52,"tulsa,_ok_msa",28,1,,,surf,1104.0,38.12,50.0,20.00,28000.0,146.4
4,1004,leonila_thompson,40,"seattle_tacoma_bellevue,_wa_msa",23,5,,,surf,2772.0,160.00,177.0,160.00,159000.0,608.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1495,fidel_sharpe,67,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,9,,,surf,1765.0,83.30,,,100000.0,465.6
496,1496,ariel_shepherd,49,"new_orleans_metairie,_la_msa",20,2,,,surf,1455.0,100.00,65.0,100.00,67000.0,132.8
497,1497,donte_barrera,49,"los_angeles_long_beach_anaheim,_ca_msa",10,12,,,ultimate,300.0,70.00,50.0,70.00,12000.0,70.0
498,1498,scot_williamson,51,"new_york_newark_jersey_city,_ny_nj_pa_msa",4,2,,,surf,3211.0,220.00,,,232000.0,854.0


In [71]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "total" / "revenue_total.csv"

df_revenue_total.to_csv(processed_path, index=False)

#### 4.5.2 Periodic activity per user

In [72]:
df_calls_duration_periodic_day = df_calls.groupby(['user_id', 'month', 'day'])['duration'].sum().reset_index()
df_calls_duration_periodic_day = df_calls_duration_periodic_day.rename(columns={'duration': 'calls_duration'})
df_calls_duration_periodic_day

Unnamed: 0,user_id,month,day,calls_duration
0,1000,12,26,15.0
1,1000,12,27,50.0
2,1000,12,28,31.0
3,1000,12,29,2.0
4,1000,12,30,5.0
...,...,...,...,...
51954,1499,12,27,9.0
51955,1499,12,28,9.0
51956,1499,12,29,22.0
51957,1499,12,30,5.0


In [73]:
ds_calls_duration_periodic_day = df_calls_duration_periodic_day.groupby(['month', 'day'])['calls_duration'].sum()
ds_calls_duration_periodic_day

month  day
1      15       32.0
       16       73.0
       17       29.0
       18       53.0
       19       76.0
               ...  
12     27     7906.0
       28     7970.0
       29     7770.0
       30     7641.0
       31     7734.0
Name: calls_duration, Length: 351, dtype: float64

In [74]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "periodic" / "calls_duration_periodic_day.csv"

ds_calls_duration_periodic_day.to_csv(processed_path, index=True)

In [75]:
df_messages_periodic_day = df_messages.groupby(['user_id', 'month', 'day'])['message_id'].count().reset_index()
df_messages_periodic_day = df_messages_periodic_day.rename(columns={'message_id': 'message_count'})
df_messages_periodic_day

Unnamed: 0,user_id,month,day,message_count
0,1000,12,25,1
1,1000,12,26,1
2,1000,12,27,3
3,1000,12,28,1
4,1000,12,29,1
...,...,...,...,...
33238,1497,12,27,4
33239,1497,12,28,2
33240,1497,12,29,5
33241,1497,12,30,1


In [76]:
ds_messages_periodic_day = df_messages_periodic_day.groupby(['month', 'day'])['message_count'].sum()
ds_messages_periodic_day

month  day
1      15       2
       16       4
       17       6
       18       5
       19       1
             ... 
12     27     701
       28     702
       29     680
       30     683
       31     699
Name: message_count, Length: 351, dtype: int64

In [77]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "periodic" / "messages_periodic_day.csv"

ds_messages_periodic_day.to_csv(processed_path, index=True)

In [78]:
df_internet_periodic_day = df_internet.groupby(['user_id', 'month', 'day'])['mb_used'].sum().reset_index()
df_messages_periodic_day = df_messages_periodic_day.rename(columns={'mb_used': 'traffic_usage'})
df_internet_periodic_day

Unnamed: 0,user_id,month,day,mb_used
0,1000,12,26,270.99
1,1000,12,27,880.22
2,1000,12,28,660.40
3,1000,12,29,89.86
4,1000,12,31,0.00
...,...,...,...,...
49028,1499,12,25,924.14
49029,1499,12,26,823.03
49030,1499,12,29,2385.78
49031,1499,12,30,304.72


In [82]:
ds_internet_periodic_day = np.round(df_internet_periodic_day.groupby(['month', 'day'])['mb_used'].sum(), 3)
ds_internet_periodic_day

month  day
1      15       1193.90
       16       1776.09
       17       1202.33
       18        931.02
       19       2684.85
                ...    
12     27     291749.27
       28     291703.03
       29     288687.21
       30     324013.99
       31     304138.13
Name: mb_used, Length: 351, dtype: float64

In [83]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "periodic" / "internet_periodic_day.csv"

ds_internet_periodic_day.to_csv(processed_path, index=True)

#### 4.5.3 Periodic revenue per user

In [None]:
df_revenue_periodic = df_users.loc[:, ['user_id', 'full_name', 'age', 'city', 'reg_day', 'reg_month', 'churn_day', 'churn_month', 'plan']]
df_revenue_periodic

In [None]:
df_revenue_periodic = df_revenue_periodic.merge(df_calls_duration_periodic[['user_id', 'month', 'day', 'duration']], on='user_id', how='outer')
df_revenue_periodic = df_revenue_periodic.rename(columns={'duration': 'calls_duration'})
df_revenue_periodic

In [None]:
df_revenue_periodic = df_revenue_periodic.merge(df_messages_periodic[['user_id', 'month', 'day', 'message_count']], on=['user_id', 'month', 'day'], how='outer')
df_revenue_periodic = df_revenue_periodic.rename(columns={'message_count': 'messages_amount'})
df_revenue_periodic

In [None]:
dict_user_to_name = df_revenue_periodic.dropna(subset=['full_name']).drop_duplicates('user_id').set_index('user_id')['full_name'].to_dict()
df_revenue_periodic['full_name'] = df_revenue_periodic['full_name'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_name))


In [None]:
dict_user_to_age = df_revenue_periodic.dropna(subset=['age']).drop_duplicates('user_id').set_index('user_id')['age'].to_dict()
df_revenue_periodic['age'] = df_revenue_periodic['age'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_age))


In [None]:
dict_user_to_city = df_revenue_periodic.dropna(subset=['city']).drop_duplicates('user_id').set_index('user_id')['city'].to_dict()
df_revenue_periodic['city'] = df_revenue_periodic['city'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_city))


In [None]:
dict_user_to_regday = df_revenue_periodic.dropna(subset=['reg_day']).drop_duplicates('user_id').set_index('user_id')['reg_day'].to_dict()
df_revenue_periodic['reg_day'] = df_revenue_periodic['reg_day'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_regday))


In [None]:
dict_user_to_regmonth = df_revenue_periodic.dropna(subset=['reg_month']).drop_duplicates('user_id').set_index('user_id')['reg_month'].to_dict()
df_revenue_periodic['reg_month'] = df_revenue_periodic['reg_month'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_regmonth))


In [None]:
dict_user_to_churnday = df_revenue_periodic.dropna(subset=['churn_day']).drop_duplicates('user_id').set_index('user_id')['churn_day'].to_dict()
df_revenue_periodic['churn_day'] = df_revenue_periodic['churn_day'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_churnday))


In [None]:
dict_user_to_churnmonth = df_revenue_periodic.dropna(subset=['churn_month']).drop_duplicates('user_id').set_index('user_id')['churn_month'].to_dict()
df_revenue_periodic['churn_month'] = df_revenue_periodic['churn_month'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_churnmonth))


In [None]:
dict_user_to_plan = df_revenue_periodic.dropna(subset=['plan']).drop_duplicates('user_id').set_index('user_id')['plan'].to_dict()
df_revenue_periodic['plan'] = df_revenue_periodic['plan'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_plan))
df_revenue_periodic

In [None]:
df_revenue_periodic = df_revenue_periodic.merge(df_internet_periodic[['user_id', 'month', 'day', 'mb_used']], on=['user_id', 'month', 'day'], how='outer')
df_revenue_periodic = df_revenue_periodic.rename(columns={'mb_used': 'traffic_usage'})
df_revenue_periodic

In [None]:
dict_user_to_name = df_revenue_periodic.dropna(subset=['full_name']).drop_duplicates('user_id').set_index('user_id')['full_name'].to_dict()
df_revenue_periodic['full_name'] = df_revenue_periodic['full_name'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_name))


In [None]:
dict_user_to_age = df_revenue_periodic.dropna(subset=['age']).drop_duplicates('user_id').set_index('user_id')['age'].to_dict()
df_revenue_periodic['age'] = df_revenue_periodic['age'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_age))


In [None]:
dict_user_to_city = df_revenue_periodic.dropna(subset=['city']).drop_duplicates('user_id').set_index('user_id')['city'].to_dict()
df_revenue_periodic['city'] = df_revenue_periodic['city'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_city))


In [None]:
dict_user_to_regday = df_revenue_periodic.dropna(subset=['reg_day']).drop_duplicates('user_id').set_index('user_id')['reg_day'].to_dict()
df_revenue_periodic['reg_day'] = df_revenue_periodic['reg_day'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_regday))


In [None]:
dict_user_to_regmonth = df_revenue_periodic.dropna(subset=['reg_month']).drop_duplicates('user_id').set_index('user_id')['reg_month'].to_dict()
df_revenue_periodic['reg_month'] = df_revenue_periodic['reg_month'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_regmonth))


In [None]:
dict_user_to_churnday = df_revenue_periodic.dropna(subset=['churn_day']).drop_duplicates('user_id').set_index('user_id')['churn_day'].to_dict()
df_revenue_periodic['churn_day'] = df_revenue_periodic['churn_day'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_churnday))


In [None]:
dict_user_to_churnmonth = df_revenue_periodic.dropna(subset=['churn_month']).drop_duplicates('user_id').set_index('user_id')['churn_month'].to_dict()
df_revenue_periodic['churn_month'] = df_revenue_periodic['churn_month'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_churnmonth))


In [None]:
dict_user_to_plan = df_revenue_periodic.dropna(subset=['plan']).drop_duplicates('user_id').set_index('user_id')['plan'].to_dict()
df_revenue_periodic['plan'] = df_revenue_periodic['plan'].fillna(df_revenue_periodic['user_id'].map(dict_user_to_plan))
df_revenue_periodic

In [None]:
df_revenue_periodic[['calls_duration', 'messages_amount', 'traffic_usage']] = df_revenue_periodic[['calls_duration', 'messages_amount', 'traffic_usage']].fillna(0)
df_revenue_periodic

In [None]:
#
print(df_revenue_periodic[['plan', 'calls_duration', 'messages_amount', 'traffic_usage']].isna().sum())
print(df_revenue_periodic[['plan', 'calls_duration', 'messages_amount', 'traffic_usage']].dtypes)
print(df_revenue_periodic[['plan', 'calls_duration', 'messages_amount', 'traffic_usage']].head(10))


In [None]:
calls_revenue_fn = partial(revenue, revenue_type='duration')
traffic_revenue_fn = partial(revenue, revenue_type='traffic')
messages_revenue_fn = partial(revenue, revenue_type='messages')
total_revenue_fn = partial(revenue, revenue_type='total')


In [None]:
df_revenue_periodic['calls_revenue'] = df_revenue_periodic.apply(calls_revenue_fn, axis=1)
df_revenue_periodic['messages_revenue'] = df_revenue_periodic.apply(messages_revenue_fn, axis=1)
df_revenue_periodic['traffic_revenue'] = df_revenue_periodic.apply(traffic_revenue_fn, axis=1)
df_revenue_periodic['total_revenue'] = df_revenue_periodic.apply(total_revenue_fn, axis=1)


In [None]:
df_revenue_monthly = (
    df_revenue_periodic.groupby('month', observed=False)[
        ['calls_revenue', 'messages_revenue', 'traffic_revenue', 'total_revenue']
    ]
    .sum()
    .reset_index()
)

df_revenue_monthly

In [None]:
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "periodic" / "revenue_monthly.csv"

df_revenue_monthly.to_csv(processed_path, index=False)