In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Imports sklearn and, on the same line, sets seaborn's default figure size to 12x10.
import sklearn; sns.set(rc={'figure.figsize':(12,10)})
# NOTE(review): the "Deep Learning" cell further down references `tf` and
# `keras`; while these two imports stay commented out that cell raises NameError.
# import tensorflow as tf
# import keras

# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Data loading

In [None]:
# Load the raw subscriptions CSV (path is relative to the working directory).
subscriptions_rds = pd.read_csv("subscriptions_mix.csv")
# Guard flag read by the dtype-cleaning cell so that cleaning runs only once
# per load; re-running this cell resets it together with the raw frame.
subscriptions_dtypes_modified = False

In [None]:
# Load the raw transactions CSV (path is relative to the working directory).
transactions_rds = pd.read_csv("transactions_mix.csv")

In [None]:
# Display the raw subscriptions frame as the cell output.
subscriptions_rds

# Subscription

In [None]:
# Clean and type the raw subscriptions frame. The `subscriptions_dtypes_modified`
# flag (reset by the loading cell) ensures this runs only once per load, so the
# cell can be re-executed safely.
if not subscriptions_dtypes_modified:

    # Discard rows in which every column is missing.
    subscriptions_rds = subscriptions_rds.dropna(how='all')

    # Keep only rows whose revenue and service price consist solely of digits.
    # NOTE(review): `str.isnumeric()` is False for values with a decimal point
    # or a sign (e.g. "12.5", "-3"), so such rows are removed — confirm intended.
    revenue_is_numeric = subscriptions_rds['revenue'].astype(str).str.isnumeric()
    price_is_numeric = subscriptions_rds['service_price'].astype(str).str.isnumeric()

    # `.copy()` gives an independent frame, so the column assignments below do
    # not raise SettingWithCopyWarning on a filtered slice.
    subscriptions_rds = subscriptions_rds.loc[revenue_is_numeric & price_is_numeric].copy()

    # Normalise text columns to upper case.
    subscriptions_rds['country'] = subscriptions_rds['country'].str.upper()
    subscriptions_rds['source'] = subscriptions_rds['source'].str.upper()
    # A missing currency is filled with 'XOF' before upper-casing.
    subscriptions_rds['currency'] = subscriptions_rds['currency'].fillna('XOF').str.upper()

    # Numeric casts.
    subscriptions_rds['status'] = subscriptions_rds['status'].astype(int)
    subscriptions_rds['revenue'] = subscriptions_rds['revenue'].astype(float)
    subscriptions_rds['service_price'] = subscriptions_rds['service_price'].astype(float).astype(int)
    subscriptions_rds['attempt_charging'] = subscriptions_rds['attempt_charging'].astype(int)
    subscriptions_rds['success_billing'] = subscriptions_rds['success_billing'].astype(int)

    # Parse the date columns; unparseable values become NaT.
    column_to_datetime = ['created_at', 'subs_date', 'renewal_date', 'freemium_end_date', 'unsubs_date']

    for column in column_to_datetime:
        subscriptions_rds[column] = pd.to_datetime(subscriptions_rds[column], errors='coerce')

    subscriptions_dtypes_modified = True

subscriptions_rds.columns

In [None]:
def ToIDRMultiplier(str):
    """Return the conversion multiplier for a currency code.

    Presumably converts to Indonesian rupiah, judging by the function name —
    confirm. 'BATH' -> 440, 'RIAL' -> 3900, 'XOF' -> 23; any other value
    (including unknown codes) -> 0.8.
    """
    # (currency code, multiplier) pairs, checked in order with `==`.
    known_multipliers = (('BATH', 440), ('RIAL', 3900), ('XOF', 23))
    for currency_code, multiplier in known_multipliers:
        if str == currency_code:
            return multiplier
    # Fallback for every code not listed above.
    return 0.8

def ToLocalHour(str):
    """Return the hour offset that is added to `subs_date_hour` for a country code.

    'TH' and 'LA' -> 0, 'SN' -> -7; any other value -> -3.
    """
    # (country code, hour offset) pairs, checked in order with `==`.
    known_offsets = (('TH', 0), ('LA', 0), ('SN', -7))
    for country_code, hour_offset in known_offsets:
        if str == country_code:
            return hour_offset
    # Fallback for every country not listed above.
    return -3
    

# Data processing

In [None]:
# Build the modelling frame `subscriptions_mds` from the cleaned raw frame:
# drop unused columns, remove inconsistent rows and derive date, currency,
# billing, status and label-encoded features.

# Columns that are not used downstream (each label listed once).
columns_to_drop = [
    'id',
    'created_at',
    'trxid',
    'adnet',
    'browser',
    'handset',
    'pixel',
    'publisher',
    'service',
    'profile_status',
    'unsubs_from',
    'status',  # re-derived below from unsubs_date / renewal_date
]
subscriptions_mds = subscriptions_rds.drop(columns=columns_to_drop)

# Rows to discard: cycle equal to the string '1', a subscription date after the
# unsubscription date, or more successful billings than charging attempts.
drop_row = ((subscriptions_mds['cycle'] == '1') |
            (subscriptions_mds['subs_date'] > subscriptions_mds['unsubs_date']) |
            (subscriptions_mds['attempt_charging'] < subscriptions_mds['success_billing']))

# Filter into a fresh copy instead of dropping in place, so re-running the cell
# always starts from `subscriptions_rds`.
subscriptions_mds = subscriptions_mds.loc[~drop_row].copy()

# Calendar features of the subscription timestamp.
subscriptions_mds['subs_date_month'] = subscriptions_mds['subs_date'].dt.month
subscriptions_mds['subs_date_hour'] = subscriptions_mds['subs_date'].dt.hour

# Shift the hour by the per-country offset; negative results wrap into the
# previous day by adding 24.
shifted_hour = subscriptions_mds['subs_date_hour'] + subscriptions_mds['country'].apply(ToLocalHour)
subscriptions_mds['subs_date_hour_local'] = shifted_hour.where(~(shifted_hour < 0), shifted_hour + 24)

# Subscription length; NaT wherever either date is missing (e.g. not yet unsubscribed).
subscriptions_mds['delta_date'] = subscriptions_mds['unsubs_date'] - subscriptions_mds['subs_date']

# Apply the per-currency multiplier once and reuse it for both money columns.
currency_multiplier = subscriptions_mds['currency'].apply(ToIDRMultiplier)
subscriptions_mds['revenue_converted'] = currency_multiplier * subscriptions_mds['revenue']
subscriptions_mds['service_price_converted'] = currency_multiplier * subscriptions_mds['service_price']

# 0/0 (no charging attempts) yields NaN, which is reported as a rate of 0.
subscriptions_mds['success_billing_rate'] = (subscriptions_mds['success_billing'] / subscriptions_mds['attempt_charging']).fillna(0)

# Still-active subscriptions are measured up to a fixed reference date.
# NOTE(review): 2023-10-01 is presumably the data snapshot date — confirm.
reference_date = pd.Timestamp(year=2023, month=10, day=1)
subscriptions_mds['years_subscribed'] = (subscriptions_mds['unsubs_date'].fillna(reference_date) - subscriptions_mds['subs_date']) / pd.Timedelta(days=365)

# status: -1 when an unsubscription date exists; otherwise 1 when a renewal date
# exists, else 0. Computed directly from the masks rather than via a
# bool -> str -> replace(None) -> int round trip, whose `replace(x, None)` step
# forward-fills instead of inserting None on pandas < 1.4.
subscriptions_mds['status'] = np.select(
    [subscriptions_mds['unsubs_date'].notna(), subscriptions_mds['renewal_date'].notna()],
    [-1, 1],
    default=0,
)

# Integer-encode the categorical columns. One encoder is re-fitted per column,
# so after this cell `encoder` holds the classes of 'cycle' only.
encoder = LabelEncoder()
for categorical_column in ('source', 'country', 'operator', 'cycle'):
    subscriptions_mds[f'{categorical_column}_encoded'] = encoder.fit_transform(subscriptions_mds[categorical_column])

subscriptions_mds.columns

In [None]:
# Summary statistics of the subscription length in days, over the rows that
# have a `delta_date`.
has_delta_date = subscriptions_mds['delta_date'].notna()
delta_days = subscriptions_mds.loc[has_delta_date, 'delta_date'].dt.total_seconds() / 60 / 60 / 24
delta_days.describe()

In [None]:
# Correlation heatmap over the numeric columns only. The frame also contains
# string columns (e.g. `country`, `source`), on which `DataFrame.corr()` raises
# in pandas >= 2.0 unless `numeric_only=True` is passed.
sns.heatmap(subscriptions_mds.corr(numeric_only=True))

# Learning Model

## Random Forest Regressor

In [None]:
# Evaluate a RandomForestRegressor that predicts subscription length in days,
# averaged over several random 70/30 train/test splits.
n_runs = 10

columns_to_encode = ['country', 'source', 'cycle']

# Only rows with an unsubscription date have a known duration to learn from.
data_sample = subscriptions_mds[subscriptions_mds['unsubs_date'].notna()]

# Features and target are identical for every run, so they are built once.
# Categories come from the full frame so the dummy columns do not depend on the
# sample; NaN is removed because CategoricalDtype rejects null categories.
encoding_df = pd.concat(
    [pd.get_dummies(data_sample[column].astype(
        pd.CategoricalDtype(categories=subscriptions_mds[column].dropna().unique())))
     for column in columns_to_encode],
    axis=1)

X = pd.concat([encoding_df, data_sample[['revenue_converted', 'service_price_converted']]], axis=1)
X.columns = X.columns.astype(str)  # scikit-learn requires string feature names

y = data_sample['delta_date'].dt.total_seconds()/60/60/24

rmean_sqr_err = []
mean_abs_err = []
for i in range(n_runs):
    # Seeded with the run index: reproducible, yet a different split per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

    rfr = RandomForestRegressor(max_depth=10, random_state=i)
    rfr.fit(X_train, y_train)

    y_pred = rfr.predict(X_test)

    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn removed.
    rmean_sqr_err.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    mean_abs_err.append(mean_absolute_error(y_test, y_pred))
print(f'RMSE: {sum(rmean_sqr_err)/len(rmean_sqr_err)}')
print(f'MAE: {sum(mean_abs_err)/len(mean_abs_err)}')

In [None]:
# Feature importances of `rfr` (the forest fitted in the last evaluation run),
# sorted ascending by importance.
feature_importances = rfr.feature_importances_
feature_names = X.columns.tolist()
feature_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance')
print(feature_df)

# Horizontal bar plot of the importances.
importance_ax = feature_df.plot.barh(x='feature', y='importance', color='blue', edgecolor='black')
importance_ax.set_title('Feature Importance')
importance_ax.set_ylabel('Feature')
plt.show()

## AdaBoost

In [None]:
# Evaluate an AdaBoostRegressor (boosting random forests of depth 10) that
# predicts subscription length in days, averaged over several random 70/30
# train/test splits.
n_runs = 10

columns_to_encode = ['country', 'source', 'cycle']

# Only rows with an unsubscription date have a known duration to learn from.
data_sample = subscriptions_mds[subscriptions_mds['unsubs_date'].notna()]

# Features and target are identical for every run, so they are built once.
# Categories come from the full frame so the dummy columns do not depend on the
# sample; NaN is removed because CategoricalDtype rejects null categories.
encoding_df = pd.concat(
    [pd.get_dummies(data_sample[column].astype(
        pd.CategoricalDtype(categories=subscriptions_mds[column].dropna().unique())))
     for column in columns_to_encode],
    axis=1)

X = pd.concat([encoding_df, data_sample[['revenue_converted', 'service_price_converted']]], axis=1)
X.columns = X.columns.astype(str)  # scikit-learn requires string feature names

y = data_sample['delta_date'].dt.total_seconds()/60/60/24

rmean_sqr_err = []
mean_abs_err = []
for i in range(n_runs):
    # Seeded with the run index: reproducible, yet a different split per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

    ada = AdaBoostRegressor(RandomForestRegressor(max_depth=10), random_state=i)
    ada.fit(X_train, y_train)

    y_pred = ada.predict(X_test)

    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn removed.
    rmean_sqr_err.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    mean_abs_err.append(mean_absolute_error(y_test, y_pred))
print(f'RMSE: {sum(rmean_sqr_err)/len(rmean_sqr_err)}')
print(f'MAE: {sum(mean_abs_err)/len(mean_abs_err)}')

In [None]:
# Feature importances of `ada` (the ensemble fitted in the last evaluation
# run), sorted ascending by importance.
feature_importances = ada.feature_importances_
feature_names = X.columns.tolist()
feature_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance')
print(feature_df)

# Horizontal bar plot of the importances.
importance_ax = feature_df.plot.barh(x='feature', y='importance', color='blue', edgecolor='black')
importance_ax.set_title('Feature Importance')
importance_ax.set_ylabel('Feature')
plt.show()

## Deep Learning

In [None]:
# Train a fully connected Keras regressor that predicts subscription length in
# days from one-hot encoded country/source/cycle plus the two converted money
# columns.
# NOTE(review): `keras` and `tf` are used below, but their imports are commented
# out in the first cell, so this cell raises NameError as the notebook stands.
columns_to_encode = ['country', 'source', 'cycle']

# Only rows with an unsubscription date have a known duration to learn from.
data_sample = subscriptions_mds[subscriptions_mds['unsubs_date'].notna()]

# One-hot encode each categorical column; categories are taken from the full
# frame so the dummy columns do not depend on the sample.
encoding_df = pd.DataFrame(index=data_sample.index)
for column in columns_to_encode:
    encoding_df = pd.concat([encoding_df,
                             pd.get_dummies(data_sample[column].astype(pd.CategoricalDtype(categories=subscriptions_mds[column].unique())))],
                            axis=1)

X = data_sample[['revenue_converted', 'service_price_converted']]

X = pd.concat([encoding_df, X], axis=1)

# Target: subscription length in days.
y = data_sample['delta_date'].dt.total_seconds()/60/60/24
# Target standardisation, currently disabled:
# y = StandardScaler().fit_transform(y.to_numpy().reshape(-1, 1))
# y = y.ravel()
# NOTE(review): the integer cast truncates any fractional part of the converted
# money columns — confirm this is acceptable.
numeric_features = X.astype(int)

# Normalization layer whose statistics are learned from the feature matrix.
normalizer = keras.layers.Normalization()
normalizer.adapt(numeric_features)

# Five sigmoid hidden layers (100, 100, 50, 50, 50), one ReLU layer (10) and a
# single linear output unit.
model = keras.Sequential([
normalizer,
keras.layers.Dense(100, activation='sigmoid'),
keras.layers.Dense(100, activation='sigmoid'),
keras.layers.Dense(50, activation='sigmoid'),
keras.layers.Dense(50, activation='sigmoid'),
keras.layers.Dense(50, activation='sigmoid'),
keras.layers.Dense(10, activation='relu'),
keras.layers.Dense(1)
])

# MSE loss; RMSE and MAE are tracked as metrics.
model.compile(optimizer='rmsprop',
            loss=keras.losses.MeanSquaredError(),
            metrics=[keras.metrics.RootMeanSquaredError(), keras.metrics.MeanAbsoluteError()])

# Shuffle with a 7000-element buffer and train in batches of 10 for 10 epochs.
# No validation data is passed to `fit`, so the logged metrics are training metrics.
numeric_dataset = tf.data.Dataset.from_tensor_slices((numeric_features, y))
numeric_batches = numeric_dataset.shuffle(7000).batch(10)
model.fit(numeric_batches, epochs=10)

In [None]:
# Display the tf.data dataset object built in the previous cell.
numeric_dataset

## Decision Tree Regressor

In [None]:
# Evaluate a DecisionTreeRegressor that predicts subscription length in days,
# averaged over several random 70/30 train/test splits.
n_runs = 10

columns_to_encode = ['country', 'source', 'cycle']

# Only rows with an unsubscription date have a known duration to learn from.
data_sample = subscriptions_mds[subscriptions_mds['unsubs_date'].notna()]

# Features and target are identical for every run, so they are built once.
# Categories come from the full frame so the dummy columns do not depend on the
# sample; NaN is removed because CategoricalDtype rejects null categories.
encoding_df = pd.concat(
    [pd.get_dummies(data_sample[column].astype(
        pd.CategoricalDtype(categories=subscriptions_mds[column].dropna().unique())))
     for column in columns_to_encode],
    axis=1)

X = pd.concat([encoding_df, data_sample[['revenue_converted', 'service_price_converted']]], axis=1)
X.columns = X.columns.astype(str)  # scikit-learn requires string feature names

y = data_sample['delta_date'].dt.total_seconds()/60/60/24

rmean_sqr_err = []
mean_abs_err = []
for i in range(n_runs):
    # Seeded with the run index: reproducible, yet a different split per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

    dtr = DecisionTreeRegressor(max_depth=15, random_state=i)
    dtr.fit(X_train, y_train)

    y_pred = dtr.predict(X_test)

    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn removed.
    rmean_sqr_err.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    mean_abs_err.append(mean_absolute_error(y_test, y_pred))
print(f'RMSE: {sum(rmean_sqr_err)/len(rmean_sqr_err)}')
print(f'MAE: {sum(mean_abs_err)/len(mean_abs_err)}')

In [None]:
# Feature importances of `dtr` (the tree fitted in the last evaluation run),
# sorted ascending by importance.
feature_importances = dtr.feature_importances_
feature_names = X.columns.tolist()
feature_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance')
print(feature_df)

# Horizontal bar plot of the importances.
importance_ax = feature_df.plot.barh(x='feature', y='importance', color='blue', edgecolor='black')
importance_ax.set_title('Feature Importance')
importance_ax.set_ylabel('Feature')
plt.show()

## SVR

In [None]:
# Evaluate a default SVR that predicts subscription length in days, averaged
# over several random 70/30 train/test splits.
# NOTE(review): the features are passed to SVR unscaled — consider
# standardising them; left unchanged here.
n_runs = 10

columns_to_encode = ['country', 'source', 'cycle']

# Only rows with an unsubscription date have a known duration to learn from.
data_sample = subscriptions_mds[subscriptions_mds['unsubs_date'].notna()]

# Features and target are identical for every run, so they are built once.
# Categories come from the full frame so the dummy columns do not depend on the
# sample; NaN is removed because CategoricalDtype rejects null categories.
encoding_df = pd.concat(
    [pd.get_dummies(data_sample[column].astype(
        pd.CategoricalDtype(categories=subscriptions_mds[column].dropna().unique())))
     for column in columns_to_encode],
    axis=1)

X = pd.concat([encoding_df, data_sample[['revenue_converted', 'service_price_converted']]], axis=1)
X.columns = X.columns.astype(str)  # scikit-learn requires string feature names

y = data_sample['delta_date'].dt.total_seconds()/60/60/24

rmean_sqr_err = []
mean_abs_err = []
for i in range(n_runs):
    # Seeded with the run index: reproducible, yet a different split per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

    svr = SVR()
    svr.fit(X_train, y_train)

    y_pred = svr.predict(X_test)

    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn removed.
    rmean_sqr_err.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    mean_abs_err.append(mean_absolute_error(y_test, y_pred))
print(f'RMSE: {sum(rmean_sqr_err)/len(rmean_sqr_err)}')
print(f'MAE: {sum(mean_abs_err)/len(mean_abs_err)}')

In [None]:
from sklearn.inspection import permutation_importance

# The SVR fitted above (default kernel) has no `feature_importances_`; this cell
# previously re-plotted the decision tree's importances (`dtr`) under the SVR
# heading. Permutation importance is model-agnostic: it measures how much the
# model's score on the held-out split drops when one feature is shuffled.
# `svr`, `X_test` and `y_test` come from the last run of the SVR evaluation cell.
# This calls `svr.predict` n_repeats * n_features times, so it can take a while.
svr_permutation = permutation_importance(svr, X_test, y_test, n_repeats=5, random_state=0)

feature_importances = svr_permutation.importances_mean
feature_names = X.columns.tolist()
feature_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances}).sort_values('importance')
print(feature_df)

# Feature importance horizontal bar plot
feature_df.plot(x='feature', y='importance', kind='barh', color='blue', edgecolor='black')
plt.title('Permutation Feature Importance (SVR)')
plt.ylabel('Feature')
plt.show()

# Visualization

In [None]:
# Rows subscribed during one calendar month that had not unsubscribed before
# the start of the following month. `month` is zero-based (5 -> June).
year = 2020
month = 5

# Start of the selected month and of the month after it (rolls into the next
# year when the selected month is December).
following_year = year + (1 if month + 1 == 12 else 0)
following_month = (month+1)%12 + 1
period_start = pd.Timestamp(f'{year}-{month+1}')
period_end = pd.Timestamp(f'{following_year}-{following_month}')

subscribed_in_period = (subscriptions_mds['subs_date'] >= period_start) & (subscriptions_mds['subs_date'] < period_end)
not_yet_unsubscribed = subscriptions_mds['unsubs_date'].isnull() | (subscriptions_mds['unsubs_date'] >= period_end)

subscriptions_mds[subscribed_in_period & not_yet_unsubscribed]

In [None]:
# For every month of 2020-2023, count the distinct `msisdn` values that
# subscribed in that month and had not unsubscribed before the next month began.
monthly_rows = []
for year in range(2020, 2024):
    for month in range(12):
        # `month` is zero-based; the end bound rolls into the next year after December.
        month_start = pd.Timestamp(year=year, month=month+1, day=1)
        next_month_start = pd.Timestamp(year=year+(1 if month + 1 == 12 else 0), month=(month+1)%12 + 1, day=1)

        subscribed_this_month = (subscriptions_mds['subs_date'] >= month_start) & (subscriptions_mds['subs_date'] < next_month_start)
        not_yet_unsubscribed = subscriptions_mds['unsubs_date'].isnull() | (subscriptions_mds['unsubs_date'] >= next_month_start)

        user_count = subscriptions_mds.loc[subscribed_this_month & not_yet_unsubscribed, 'msisdn'].unique().size
        monthly_rows.append([year, month+1, user_count])

# Keep list entries 2..44, i.e. March 2020 through September 2023.
active_user_count_ds = pd.DataFrame(data=monthly_rows[2:45], columns=['year', 'month', 'active_user_count'])
# Label of the form '<year>-<month>' used as the plot axis.
active_user_count_ds['time'] = active_user_count_ds['year'].astype(str) + '-' + active_user_count_ds['month'].astype(str)

In [None]:
# For every month of 2020-2023 and every hour of the day, count the distinct
# `msisdn` values that subscribed in that month at that hour and had not
# unsubscribed before the next month began.
hourly_rows = []
for year in range(2020, 2024):
    for month in range(12):
        # `month` is zero-based; the end bound rolls into the next year after December.
        month_start = pd.Timestamp(year=year, month=month+1, day=1)
        next_month_start = pd.Timestamp(year=year+(1 if month + 1 == 12 else 0), month=(month+1)%12 + 1, day=1)

        # The month-level filter does not depend on the hour, so it is
        # evaluated once per month instead of once per (month, hour) pair.
        subscribed_this_month = (subscriptions_mds['subs_date'] >= month_start) & (subscriptions_mds['subs_date'] < next_month_start)
        not_yet_unsubscribed = subscriptions_mds['unsubs_date'].isnull() | (subscriptions_mds['unsubs_date'] >= next_month_start)
        month_rows = subscriptions_mds[subscribed_this_month & not_yet_unsubscribed]
        subscription_hour = month_rows['subs_date'].dt.hour

        for hour in range(24):
            user_count = month_rows.loc[subscription_hour == hour, 'msisdn'].unique().size
            hourly_rows.append([year, month+1, hour, user_count])

# The third column holds the hour; it was previously labelled 'month' a second
# time, which produced two columns with the same name.
active_user_count_ds_h = pd.DataFrame(data=hourly_rows, columns=['year', 'month', 'hour', 'active_user_count'])

In [None]:
# Rows subscribed during one calendar month that had not unsubscribed before
# the start of the following month. `month` is zero-based (10 -> November).
year=2021
month=10

# Start of the selected month and of the month after it (rolls into the next
# year when the selected month is December).
period_start = pd.Timestamp(year=year, month=month+1, day=1)
period_end = pd.Timestamp(year=year+(1 if month + 1 == 12 else 0), month=(month+1)%12 + 1, day=1)

subscribed_in_period = (subscriptions_mds['subs_date'] >= period_start) & (subscriptions_mds['subs_date'] < period_end)
not_yet_unsubscribed = subscriptions_mds['unsubs_date'].isnull() | (subscriptions_mds['unsubs_date'] >= period_end)

subscriptions_mds[subscribed_in_period & not_yet_unsubscribed]

In [None]:
# Horizontal bar chart of the monthly counts: one bar per 'time' label, values
# aggregated with a sum and no error bars.
sns.barplot(x = 'active_user_count', y = 'time', data=active_user_count_ds, orient='h', estimator="sum", errorbar=None)

In [None]:
# Mean `revenue` per subscription year. Adds a `subs_year` column to the frame.
subscriptions_mds['subs_year'] = subscriptions_mds['subs_date'].dt.year
mean_revenue_by_year = subscriptions_mds.groupby('subs_year')['revenue'].mean().reset_index()
sns.barplot(x='subs_year', y='revenue', data=mean_revenue_by_year)
plt.show()

In [None]:
# Mean `revenue` per subscription month. Adds a `subs_month` column to the frame.
subscriptions_mds['subs_month'] = subscriptions_mds['subs_date'].dt.month
mean_revenue_by_month = subscriptions_mds.groupby('subs_month')['revenue'].mean().reset_index()
sns.barplot(x='subs_month', y='revenue', data=mean_revenue_by_month)
plt.show()

In [None]:
# Display the processed subscriptions frame as the cell output.
subscriptions_mds

In [None]:
# Split the subscription date into year / month / day columns
# (subs_date_year, subs_date_month, subs_date_day).
for date_part in ('year', 'month', 'day'):
    subscriptions_mds[f'subs_date_{date_part}'] = getattr(subscriptions_mds['subs_date'].dt, date_part)

In [None]:
# Distribution of `success_billing` grouped by the month of the subscription
# date. The title and y-axis label previously misspelled "Successful".
sns.boxplot(x='subs_date_month',y='success_billing',data=subscriptions_mds[['subs_date_month', 'success_billing']])
plt.title('Successful billing per month')
plt.xlabel('Month')
plt.ylabel('Successful Billing')
plt.show()

In [None]:
# sns.countplot(x='years_subscribed',data=subscriptions_mds.drop(subscriptions_mds[subscriptions_mds['years_subscribed'] == -1].index))
# plt.title('Duration of subscription')
# plt.xlabel('Years subscribed')

# plt.show()

In [None]:

# seaborn's barplot aggregates with the mean by default, so each bar is the
# mean `revenue` of the rows for that country (with an error bar), not a total.
# The title is corrected to say so and to fix the grammar.
# NOTE(review): `revenue` is the unconverted amount; if countries use different
# currencies, `revenue_converted` may be the comparable quantity — confirm.
sns.barplot(x='country',y='revenue',data=subscriptions_mds)

plt.title('Mean revenue by country')

plt.show()

# Transaction

In [None]:
# List the column names of the raw transactions frame.
transactions_rds.columns

In [None]:
# Show the `sms_content` column for the rows where it is present.
has_sms_content = transactions_rds['sms_content'].notnull()
transactions_rds.loc[has_sms_content, ['sms_content']]

In [None]:

# Show the `telco_api_url` column for the rows where it is present.
has_telco_api_url = transactions_rds['telco_api_url'].notnull()
transactions_rds.loc[has_telco_api_url, ['telco_api_url']]

In [None]:
# Working copy of the transactions without the columns that are not analysed.
unused_transaction_columns = ['currency', 'browser', 'handset', 'sms_content']
transactions_mds = transactions_rds.drop(columns=unused_transaction_columns)

In [None]:
# Parse the date columns; unparseable values become NaT.
for date_column in ('event_date', 'charge_date', 'created_at'):
    transactions_mds[date_column] = pd.to_datetime(transactions_mds[date_column], errors='coerce')

transactions_mds['id'] = transactions_mds['id'].astype(int)
# Boolean flag: True where status_sms equals 1.
transactions_mds['status_sms'] = transactions_mds['status_sms'].eq(1)

# Coerce revenue to numbers, drop the rows that could not be parsed, then store
# the column as int32.
transactions_mds['revenue'] = pd.to_numeric(transactions_mds['revenue'], errors='coerce', downcast='integer')
transactions_mds = transactions_mds.dropna(subset=['revenue']).astype({'revenue' : 'int32'})

In [None]:
# Check the column dtypes after the conversions above.
transactions_mds.dtypes

In [None]:
# Print the schema / non-null summary of the raw transactions, then display the
# rows that carry an SMS content value.
transactions_rds.info()
rows_with_sms_content = transactions_rds['sms_content'].notnull()
transactions_rds.loc[rows_with_sms_content]

In [None]:
# Print the column names of the raw transactions frame.
print(transactions_rds.columns)

In [None]:
# Print the column names of the processed transactions frame.
print(transactions_mds.columns)


In [None]:
# Distinct values found in the transactions' `country` column.
transactions_mds['country'].unique()

In [None]:
# Mean transaction `revenue` per charge month. Adds a `charge_month` column.
transactions_mds['charge_month'] = transactions_mds['charge_date'].dt.month
mean_revenue_by_charge_month = transactions_mds.groupby('charge_month')['revenue'].mean().reset_index()
sns.lineplot(x='charge_month', y='revenue', data=mean_revenue_by_charge_month)
plt.show()

In [None]:
# countplot counts the rows of `transactions_mds` per country, so the bars are
# transaction counts; the title previously described them as numbers of users.
sns.countplot(x='country',data=transactions_mds)
plt.title('Number of transactions for each country')
plt.show()


In [None]:
# countplot counts the rows of `transactions_mds` per operator, so the bars are
# transaction counts; the title previously described them as numbers of users.
sns.countplot(x='operator',data=transactions_mds)
plt.title('Number of transactions for each operator')
plt.show()

In [None]:
# Mean transaction `revenue` per country.
mean_revenue_by_country = transactions_mds.groupby('country')['revenue'].mean().reset_index()
sns.barplot(x='country', y='revenue', data=mean_revenue_by_country)
plt.show()

In [None]:

# Box plot of transaction `revenue` grouped by country, drawn without grid lines.
transactions_mds.boxplot(by ='country', column =['revenue'], grid = False) 

In [None]:
# Bar per operator; seaborn's barplot shows the mean `revenue` of each group by
# default, together with an error bar.
sns.barplot(x='operator',y='revenue',data=transactions_mds)
plt.show()