<h1>User Satisfaction analysis</h1>

In [2]:
# Importing the necessary libraries and packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [4]:
# Load the user-experience metrics and preview the first rows.
# Floats are rendered with a single decimal place in every displayed frame.

pd.options.display.float_format = lambda value: '%.1f' % value
data = pd.read_csv('../data/user_experience.csv')
data.head()

Unnamed: 0,Avg_TCP_transmition,Avg_RTT,Avg_TP,MSISDN/Number
0,641157423284.7,3033113.2,832626680.0,19727
1,191475493090.4,735741.7,73815132.0,9395
2,186582302387.6,1043816.5,89018073.0,8904
3,171563885706.9,917579.4,68145840.0,8993
4,112152550871.2,702387.9,76077278.0,6279


In [5]:
# Select the metric columns and standardise them with StandardScaler
# (fit and transform in a single call on the raw values).

features = ['Avg_TCP_transmition', 'Avg_RTT', 'Avg_TP']
features_array = StandardScaler().fit_transform(data[features].to_numpy())
features_array

array([[30.55762509, 29.54037565, 35.93237887],
       [ 9.04820187,  7.06765394,  3.12217528],
       [ 8.81414819, 10.08121916,  3.77953426],
       ...,
       [-0.11055483, -0.11655312, -0.06950519],
       [-0.11055484, -0.12826992, -0.06951224],
       [-0.11055487, -0.12788059, -0.06496324]])

In [6]:
# Sanity-check the standardisation.
# The features were scaled column by column, so every feature on its own
# should have mean 0 and standard deviation 1. The array-wide figures
# displayed at the end cannot reveal a single badly scaled column, hence the
# explicit per-feature assertions.

np.testing.assert_allclose(features_array.mean(axis=0), 0.0, atol=1e-8)
np.testing.assert_allclose(features_array.std(axis=0), 1.0, atol=1e-8)

np.mean(features_array), np.std(features_array), features_array.shape

(-1.0216286639253777e-17, 0.9999999999999999, (1391, 3))

In [7]:
# Wrap the standardised array in a DataFrame, labelling its columns with the feature names

normalized_data = pd.DataFrame(data=features_array, columns=features)
normalized_data

Unnamed: 0,Avg_TCP_transmition,Avg_RTT,Avg_TP
0,30.6,29.5,35.9
1,9.0,7.1,3.1
2,8.8,10.1,3.8
3,8.1,8.8,2.9
4,5.3,6.7,3.2
...,...,...,...
1386,-0.1,-0.1,-0.1
1387,-0.1,-0.1,-0.1
1388,-0.1,-0.1,-0.1
1389,-0.1,-0.1,-0.1


In [8]:
# Cluster the standardised features into k=3 groups with k-means.
# - random_state is pinned so the cluster labels are reproducible across runs;
#   later cells and the written conclusion refer to clusters by label number.
# - n_init is set explicitly because the library default has changed across
#   scikit-learn releases.
# - fit_predict() both fits the model and returns the labels, so a separate
#   fit() call beforehand would only train the model a second time.

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_array)
clusters

array([1, 2, 2, ..., 0, 0, 0])

In [9]:
# Attach each row's cluster label to the standardised frame as 'cluster_num'

normalized_data = normalized_data.assign(cluster_num=clusters)
normalized_data

Unnamed: 0,Avg_TCP_transmition,Avg_RTT,Avg_TP,cluster_num
0,30.6,29.5,35.9,1
1,9.0,7.1,3.1,2
2,8.8,10.1,3.8,2
3,8.1,8.8,2.9,2
4,5.3,6.7,3.2,2
...,...,...,...,...
1386,-0.1,-0.1,-0.1,0
1387,-0.1,-0.1,-0.1,0
1388,-0.1,-0.1,-0.1,0
1389,-0.1,-0.1,-0.1,0


In [10]:
# Build a frame of the original (un-scaled) metrics plus each row's cluster label.
# assign() returns a new DataFrame, so the label column is never written onto
# a selection of `data`; that avoids pandas' SettingWithCopyWarning and
# leaves `data` itself untouched.

non_nomalized_data = data[features].assign(cluster=clusters)
non_nomalized_data.head(10)

Unnamed: 0,Avg_TCP_transmition,Avg_RTT,Avg_TP,cluster
0,641157423284.7,3033113.2,832626680.0,1
1,191475493090.4,735741.7,73815132.0,2
2,186582302387.6,1043816.5,89018073.0,2
3,171563885706.9,917579.4,68145840.0,2
4,112152550871.2,702387.9,76077278.0,2
5,104013271153.4,504845.8,57611989.0,2
6,94983823115.8,552962.6,61721810.0,2
7,93168536205.2,438956.8,31972066.0,2
8,92610116020.4,311853.2,62149259.0,2
9,81428128730.0,422667.9,17090119.0,2


In [12]:
# Average each un-scaled metric within every cluster
metric_columns = ['Avg_TCP_transmition', 'Avg_RTT', 'Avg_TP']
satisfaction_data = non_nomalized_data.groupby('cluster')[metric_columns].agg('mean')
satisfaction_data.head()

Unnamed: 0_level_0,Avg_TCP_transmition,Avg_RTT,Avg_TP
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,697273327.2,5053.4,417254.7
1,641157423284.7,3033113.2,832626680.0
2,95087765605.3,495164.7,48864927.9


##### From the scores above, we can see that cluster 2 is the less engaged cluster, since it has higher TCP retransmission and RTT but a smaller TP.