# Import required modules 

In [29]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read Data

In [30]:
# get user engagement data
tellco_engagement_df = pd.read_csv("../data/user_engagement_data.csv")
tellco_engagement_df.head(5)

Unnamed: 0,MSISDN/Number,Cluster,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes)
0,33601008617,0,1,18555323.0,871832580.0
1,33601011634,1,1,64180392.0,199050991.0
2,33601021217,0,1,38416201.0,630092434.0
3,33601031129,0,1,55730653.0,637053075.0
4,33601034530,0,1,56313873.0,775900974.0


In [31]:
# get user exprience data
tellco_experience_df = pd.read_csv("../data/user_experience_data.csv")
tellco_experience_df.head(5)

Unnamed: 0,MSISDN/Number,Cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes)
0,33601008617,1,91.0,52995.0,9370832.0
1,33601011634,0,39.0,42416.0,110232.0
2,33601021217,1,160.0,19256.0,13171894.0
3,33601031129,0,60.0,38190.0,2325497.0
4,33601034530,2,217.5,8539.0,2006261.0


In [32]:
with open("../models/user_engagement.pkl", "rb") as f:
    kmeans1 = pickle.load(f)

In [33]:
with open("../models/user_experience.pkl", "rb") as f:
    kmeans2 = pickle.load(f)

# K-means Clustering

In [34]:
# choose relevant columns and scale the data
engagement_df = tellco_engagement_df.set_index('MSISDN/Number')[['number of xDR Sessions', 'Dur (ms)', 'Total Data Volume (Bytes)']]
scaler = StandardScaler()
scale_data = scaler.fit_transform(engagement_df)
scale_data

array([[-0.35857882, -0.82603254,  0.51950116],
       [-0.35857882, -0.31341552, -0.92897736],
       [-0.35857882, -0.60288714, -0.0009581 ],
       ...,
       [-0.35857882, -0.62090252, -0.94003019],
       [-0.35857882, -0.67650075, -0.55736473],
       [-0.35857882, -0.93551765,  0.1810738 ]])

In [35]:
normalized_data = normalize(scale_data)
normalized_data

array([[-0.3449165 , -0.79455964,  0.4997075 ],
       [-0.34348663, -0.30022421, -0.88987772],
       [-0.5111859 , -0.85946908, -0.00136586],
       ...,
       [-0.30329764, -0.52517957, -0.79510814],
       [-0.37863055, -0.71433068, -0.58853257],
       [-0.35219846, -0.91887156,  0.17785187]])

In [36]:
less_engaged_cluster = 3
distance = kmeans1.fit_transform(normalized_data)
distance_from_less_engagement = list(
    map(lambda x: x[less_engaged_cluster], distance))
tellco_engagement_df['engagement_score'] = distance_from_less_engagement
tellco_engagement_df.head(5)

Unnamed: 0,MSISDN/Number,Cluster,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes),engagement_score
0,33601008617,0,1,18555323.0,871832580.0,0.73762
1,33601011634,1,1,64180392.0,199050991.0,1.587919
2,33601021217,0,1,38416201.0,630092434.0,0.973625
3,33601031129,0,1,55730653.0,637053075.0,0.861556
4,33601034530,0,1,56313873.0,775900974.0,0.519427


In [37]:
experience_df = tellco_experience_df.set_index('MSISDN/Number')[['Total Avg RTT (ms)', 'Total Avg Bearer TP (kbps)', 'Total TCP Retrans. Vol (Bytes)']]
scaler = StandardScaler()
scale_data = scaler.fit_transform(experience_df)
scale_data

array([[-0.37139792,  0.09535923,  0.59768534],
       [-0.78096612, -0.11760067, -0.56884   ],
       [ 0.17206758, -0.58382161,  1.07649181],
       ...,
       [-0.67857407,  1.24364974, -0.22951498],
       [-0.54467678,  1.12858416, -0.58145378],
       [-0.67857407, -0.76280134, -0.55525918]])

In [38]:
normalized_data = normalize(scale_data)
normalized_data

array([[-0.52301393,  0.13428779,  0.84167881],
       [-0.80238821, -0.12082648, -0.58444342],
       [ 0.1391407 , -0.47210142,  0.87049418],
       ...,
       [-0.47280739,  0.86653294, -0.15991825],
       [-0.3942729 ,  0.8169435 , -0.42089452],
       [-0.58388488, -0.65635896, -0.47777752]])

In [39]:
worst_experience_cluster = 1
distance = kmeans2.fit_transform(normalized_data)
distance_from_worst_experience_cluster = list(
    map(lambda x: x[worst_experience_cluster], distance))
tellco_experience_df['experience_score'] = distance_from_worst_experience_cluster
tellco_experience_df.head()

Unnamed: 0,MSISDN/Number,Cluster,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score
0,33601008617,1,91.0,52995.0,9370832.0,0.737084
1,33601011634,0,39.0,42416.0,110232.0,1.515212
2,33601021217,1,160.0,19256.0,13171894.0,0.843273
3,33601031129,0,60.0,38190.0,2325497.0,1.490895
4,33601034530,2,217.5,8539.0,2006261.0,1.424646


In [40]:
user_satisfaction_df = pd.merge(tellco_engagement_df, tellco_experience_df, on='MSISDN/Number')
user_satisfaction_df['satisfaction_score'] = (
    user_satisfaction_df['engagement_score'] + user_satisfaction_df['experience_score'])/2
user_satisfaction_df.head()

Unnamed: 0,MSISDN/Number,Cluster_x,number of xDR Sessions,Dur (ms),Total Data Volume (Bytes),engagement_score,Cluster_y,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),experience_score,satisfaction_score
0,33601008617,0,1,18555323.0,871832580.0,0.73762,1,91.0,52995.0,9370832.0,0.737084,0.737352
1,33601011634,1,1,64180392.0,199050991.0,1.587919,0,39.0,42416.0,110232.0,1.515212,1.551566
2,33601021217,0,1,38416201.0,630092434.0,0.973625,1,160.0,19256.0,13171894.0,0.843273,0.908449
3,33601031129,0,1,55730653.0,637053075.0,0.861556,0,60.0,38190.0,2325497.0,1.490895,1.176226
4,33601034530,0,1,56313873.0,775900974.0,0.519427,2,217.5,8539.0,2006261.0,1.424646,0.972036


In [41]:
user_satisfaction_df = user_satisfaction_df[['MSISDN/Number', 'engagement_score',
                        'experience_score', 'satisfaction_score']]
user_satisfaction_df.set_index('MSISDN/Number', inplace=True)
user_satisfaction_df.head()

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601008617,0.73762,0.737084,0.737352
33601011634,1.587919,1.515212,1.551566
33601021217,0.973625,0.843273,0.908449
33601031129,0.861556,1.490895,1.176226
33601034530,0.519427,1.424646,0.972036


In [42]:
user_satisfaction_df.sort_values('satisfaction_score', ascending=False).head(10)

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33650049821,1.959929,1.608358,1.784144
33646061230,1.933759,1.593327,1.763543
33611485547,1.894206,1.623605,1.758906
33645246256,1.903445,1.60058,1.752012
33612805125,1.911776,1.575788,1.743782
33648410823,1.890987,1.591575,1.741281
33661631023,1.887045,1.595408,1.741227
33650925754,1.879797,1.598225,1.739011
33650612274,1.846202,1.622923,1.734562
33669462927,1.899341,1.54314,1.721241


In [43]:
X = user_satisfaction_df[['engagement_score', 'experience_score']]
y = user_satisfaction_df[['satisfaction_score']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [44]:
y_pred = model.predict(X_test)

In [45]:
user_satisfaction_df1 = user_satisfaction_df[['engagement_score', 'experience_score']]
user_satisfaction_df1

Unnamed: 0_level_0,engagement_score,experience_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1
33601008617,0.737620,0.737084
33601011634,1.587919,1.515212
33601021217,0.973625,0.843273
33601031129,0.861556,1.490895
33601034530,0.519427,1.424646
...,...,...
33789914536,1.505793,1.502046
33789922012,1.039937,1.612655
33789942399,1.542885,1.077884
33789980299,1.394739,1.204802


In [46]:
scaler = StandardScaler()
scale_data = scaler.fit_transform(user_satisfaction_df1)
scale_data

array([[-0.92640394, -1.47149362],
       [ 0.8804897 ,  0.69341672],
       [-0.4248911 , -1.17605372],
       ...,
       [ 0.78479212, -0.52331731],
       [ 0.4699793 , -0.17020837],
       [-0.46187173,  0.83226226]])

In [47]:
normalized_data = normalize(scale_data)
normalized_data

array([[-0.53277536, -0.84625671],
       [ 0.78562283,  0.61870572],
       [-0.33978951, -0.94050151],
       ...,
       [ 0.83199032, -0.55479015],
       [ 0.94023802, -0.34051794],
       [-0.48524452,  0.8743785 ]])

In [48]:
kmeans = KMeans(n_clusters = 3, random_state = 42)
y_kmeans = kmeans.fit_predict(normalized_data)
X = np.array(normalized_data)
y_kmeans

array([2, 1, 2, ..., 2, 2, 0])

In [49]:
clustered_tellco_satisfaction_df = user_satisfaction_df.copy()
clustered_tellco_satisfaction_df.insert(0, 'Cluster', y_kmeans)
clustered_tellco_satisfaction_df

Unnamed: 0_level_0,Cluster,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601008617,2,0.737620,0.737084,0.737352
33601011634,1,1.587919,1.515212,1.551566
33601021217,2,0.973625,0.843273,0.908449
33601031129,0,0.861556,1.490895,1.176226
33601034530,0,0.519427,1.424646,0.972036
...,...,...,...,...
33789914536,1,1.505793,1.502046,1.503919
33789922012,0,1.039937,1.612655,1.326296
33789942399,2,1.542885,1.077884,1.310385
33789980299,2,1.394739,1.204802,1.299770


In [55]:
clustered_tellco_satisfaction_df.groupby('Cluster').agg(
    {'satisfaction_score': 'mean', 'experience_score': 'mean'})

Unnamed: 0_level_0,satisfaction_score,experience_score
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.00325,1.402999
1,1.480483,1.499866
2,1.115258,0.804499


In [56]:
engine = create_engine('mysql+pymysql://root:@localhost/tellco')

In [57]:
# writing to database
try:
    print('writing to the database')
    frame = user_satisfaction_df.to_sql(
        "tellco_analysis", con=engine, if_exists='replace')
except Exception as e:
  print("Error writing to database: ", e)

# reading from database
pd.read_sql("select * from tellco.tellco_analysis", engine)

writing to the database


Unnamed: 0,MSISDN/Number,engagement_score,experience_score,satisfaction_score
0,33601008617,0.737620,0.737084,0.737352
1,33601011634,1.587919,1.515212,1.551566
2,33601021217,0.973625,0.843273,0.908449
3,33601031129,0.861556,1.490895,1.176226
4,33601034530,0.519427,1.424646,0.972036
...,...,...,...,...
38906,33789914536,1.505793,1.502046,1.503919
38907,33789922012,1.039937,1.612655,1.326296
38908,33789942399,1.542885,1.077884,1.310385
38909,33789980299,1.394739,1.204802,1.299770


In [58]:
user_satisfaction_df.to_csv('../data/user_satisfaction_data.csv')