In [16]:
import os, sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import sqldf
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

import scripts.read_data_from_db as rd
import scripts.data_cleaning as dc
import scripts.utils as util
from sklearn.metrics import euclidean_distances

In [2]:
experience_df = rd.read_data(table_name='user_experience')
engagement_df = rd.read_data(table_name='user_engagement')

INFO:scripts.read_data_from_db:Data fetched succesfully
INFO:scripts.read_data_from_db:Data fetched succesfully


In [70]:
pysqldf = lambda q: sqldf(q, globals())

In [3]:
experience_df.head()

Unnamed: 0,CustomerID,AvgTCP,AvgRTT,AvgThroughput
0,33601000000.0,1.0,0.163896,0.000268
1,33601000000.0,1.0,0.028926,0.000349
2,33601000000.0,1.0,0.427515,0.000342
3,33601010000.0,1.0,0.247934,0.000873
4,33601010000.0,0.717138,0.183188,0.100102


In [4]:
engagement_df.head()

Unnamed: 0,MSISDN/Number,SessionDuration,SessionFrequency,TotalTraffic
0,33601000000.0,0.001349,0.0,0.001568
1,33601000000.0,0.002143,0.0,0.000229
2,33601000000.0,0.001574,0.0,0.001044
3,33601010000.0,0.000526,0.0,0.000722
4,33601010000.0,0.000369,0.001927,0.002641


In [50]:
merged_df = engagement_df.merge(experience_df, left_on='MSISDN/Number', right_on="CustomerID", how='inner')
merged_df.head()

Unnamed: 0,MSISDN/Number,SessionDuration,SessionFrequency,TotalTraffic,EngagementScore,CustomerID,AvgTCP,AvgRTT,AvgThroughput
0,33601000000.0,0.001349,0.0,0.001568,0.002365,33601000000.0,1.0,0.163896,0.000268
1,33601000000.0,0.002143,0.0,0.000229,0.002696,33601000000.0,1.0,0.028926,0.000349
2,33601000000.0,0.001574,0.0,0.001044,0.002399,33601000000.0,1.0,0.427515,0.000342
3,33601010000.0,0.000526,0.0,0.000722,0.002912,33601010000.0,1.0,0.247934,0.000873
4,33601010000.0,0.000369,0.001927,0.002641,0.001972,33601010000.0,0.717138,0.183188,0.100102


### Handle Outliers

In [5]:
experience_columns = ["AvgTCP","AvgRTT","AvgThroughput"]
engagement_columns = ["SessionDuration","SessionFrequency","TotalTraffic"]

In [51]:
merged_df = dc.handle_outliers(merged_df, engagement_columns)
merged_df = dc.handle_outliers(merged_df, experience_columns)

### Assigning scores

In [65]:
k = 3
X_eng = merged_df[engagement_columns]
X_exp = merged_df[experience_columns]
df = pd.DataFrame({})

##### Assign engagement score

In [66]:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_eng)

less_engaged_cluster = pd.Series(cluster_labels).value_counts().idxmin()
less_engaged_cluster_center = kmeans.cluster_centers_[less_engaged_cluster]

distances = euclidean_distances(X_eng, [less_engaged_cluster_center]).flatten()
df['UserID'] = merged_df['MSISDN/Number']
df['EngagementScore'] =  distances
df.head()


Unnamed: 0,UserID,EngagementScore
0,33601000000.0,0.001966
1,33601000000.0,0.001997
2,33601000000.0,0.001974
3,33601010000.0,0.002381
4,33601010000.0,0.001386


##### Assign Experience score

In [67]:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_exp)

worst_experience_cluster = pd.Series(cluster_labels).value_counts().idxmin()
worst_experience_cluster_cluster_center = kmeans.cluster_centers_[worst_experience_cluster]

distances = euclidean_distances(X_eng, [worst_experience_cluster_cluster_center]).flatten()
df['ExperienceScore'] =  distances
df.head()

Unnamed: 0,UserID,EngagementScore,ExperienceScore
0,33601000000.0,0.001966,0.283794
1,33601000000.0,0.001997,0.283513
2,33601000000.0,0.001974,0.283802
3,33601010000.0,0.002381,0.284367
4,33601010000.0,0.001386,0.282708


#### Satisfaction Score

In [69]:
df['SatisfactionScore'] = (df['EngagementScore'] + df['ExperienceScore']) / 2
df.head()

Unnamed: 0,UserID,EngagementScore,ExperienceScore,SatisfactionScore
0,33601000000.0,0.001966,0.283794,0.14288
1,33601000000.0,0.001997,0.283513,0.142755
2,33601000000.0,0.001974,0.283802,0.142888
3,33601010000.0,0.002381,0.284367,0.143374
4,33601010000.0,0.001386,0.282708,0.142047


### Top satisfied users

In [72]:
query = ''' 
    SELECT 
        UserID,
        SatisfactionScore
    FROM df
    ORDER BY SatisfactionScore DESC
    LIMIT 10
'''

pysqldf(query)

Unnamed: 0,UserID,SatisfactionScore
0,33699040000.0,0.143621
1,33664840000.0,0.14362
2,33763110000.0,0.14362
3,33664900000.0,0.14362
4,33658520000.0,0.143619
5,33632140000.0,0.143619
6,33626710000.0,0.143619
7,33630830000.0,0.143619
8,33698210000.0,0.143618
9,33611990000.0,0.143617
