<h1>User Experience Analysis</h1>

In [2]:
# Importing the necessary libraries and packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [14]:
# Load the two CSV inputs: the cleaned telecom data that the analysis works on
# (`data`) and the Week1 challenge source file (`primary_data`), then list the
# columns available in the cleaned frame.

primary_data = pd.read_csv('../data/Week1_challenge_data_source.csv')
data = pd.read_csv('../data/clean_telecom_data.csv')
list(data.columns)

['Bearer Id',
 'Start',
 'Start ms',
 'End',
 'End ms',
 'Dur. (ms)',
 'IMSI',
 'MSISDN/Number',
 'IMEI',
 'Last Location Name',
 'Avg RTT DL (ms)',
 'Avg RTT UL (ms)',
 'Avg Bearer TP DL (kbps)',
 'Avg Bearer TP UL (kbps)',
 'DL TP < 50 Kbps (%)',
 '50 Kbps < DL TP < 250 Kbps (%)',
 '250 Kbps < DL TP < 1 Mbps (%)',
 'DL TP > 1 Mbps (%)',
 'UL TP < 10 Kbps (%)',
 '10 Kbps < UL TP < 50 Kbps (%)',
 '50 Kbps < UL TP < 300 Kbps (%)',
 'UL TP > 300 Kbps (%)',
 'Activity Duration DL (ms)',
 'Activity Duration UL (ms)',
 'Dur. (ms).1',
 'Handset Manufacturer',
 'Handset Type',
 'Nb of sec with Vol DL < 6250B',
 'Nb of sec with Vol UL < 1250B',
 'Social Media DL (Bytes)',
 'Social Media UL (Bytes)',
 'Google DL (Bytes)',
 'Google UL (Bytes)',
 'Email DL (Bytes)',
 'Email UL (Bytes)',
 'Youtube DL (Bytes)',
 'Youtube UL (Bytes)',
 'Netflix DL (Bytes)',
 'Netflix UL (Bytes)',
 'Gaming DL (Bytes)',
 'Gaming UL (Bytes)',
 'Other DL (Bytes)',
 'Other UL (Bytes)',
 'Total UL (Bytes)',
 'Total DL (By

In [22]:
# Copy the two TCP retransmission-volume columns from `primary_data` into `data`,
# then count the missing values in every column.
# NOTE(review): assigning a Series to a DataFrame column aligns on the index, so
# this presumes both frames carry the same row labels for the same records —
# confirm the cleaning step did not drop or reorder rows.

for tcp_column in ('TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)'):
    data[tcp_column] = primary_data[tcp_column]
data.isna().sum()

Bearer Id                             0
Start                                 0
Start ms                              0
End                                   0
End ms                                0
Dur. (ms)                             0
IMSI                                  0
MSISDN/Number                         0
IMEI                                  0
Last Location Name                    0
Avg RTT DL (ms)                       0
Avg RTT UL (ms)                       0
Avg Bearer TP DL (kbps)               0
Avg Bearer TP UL (kbps)               0
DL TP < 50 Kbps (%)                   0
50 Kbps < DL TP < 250 Kbps (%)        0
250 Kbps < DL TP < 1 Mbps (%)         0
DL TP > 1 Mbps (%)                    0
UL TP < 10 Kbps (%)                   0
10 Kbps < UL TP < 50 Kbps (%)         0
50 Kbps < UL TP < 300 Kbps (%)        0
UL TP > 300 Kbps (%)                  0
Activity Duration DL (ms)             0
Activity Duration UL (ms)             0
Dur. (ms).1                           0


In [23]:
# Replace the missing entries in each TCP retransmission column with that
# column's own mean, then recount the missing values per column.

tcp_dl, tcp_ul = 'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)'
dl_mean = data[tcp_dl].mean()
ul_mean = data[tcp_ul].mean()
data[tcp_dl] = data[tcp_dl].fillna(dl_mean)
data[tcp_ul] = data[tcp_ul].fillna(ul_mean)
data.isna().sum()

Bearer Id                         0
Start                             0
Start ms                          0
End                               0
End ms                            0
Dur. (ms)                         0
IMSI                              0
MSISDN/Number                     0
IMEI                              0
Last Location Name                0
Avg RTT DL (ms)                   0
Avg RTT UL (ms)                   0
Avg Bearer TP DL (kbps)           0
Avg Bearer TP UL (kbps)           0
DL TP < 50 Kbps (%)               0
50 Kbps < DL TP < 250 Kbps (%)    0
250 Kbps < DL TP < 1 Mbps (%)     0
DL TP > 1 Mbps (%)                0
UL TP < 10 Kbps (%)               0
10 Kbps < UL TP < 50 Kbps (%)     0
50 Kbps < UL TP < 300 Kbps (%)    0
UL TP > 300 Kbps (%)              0
Activity Duration DL (ms)         0
Activity Duration UL (ms)         0
Dur. (ms).1                       0
Handset Manufacturer              0
Handset Type                      0
Nb of sec with Vol DL < 6250

In [27]:
# Add three combined per-row metrics, each the sum of the downlink and uplink
# columns (despite the 'Avg_' prefix, nothing is averaged in this cell):
#   Avg_TCP_transmition : TCP DL + UL retransmission volume (Bytes)
#   Avg_RTT             : Avg RTT DL + Avg RTT UL (ms)
#   Avg_TP              : Avg Bearer TP DL + Avg Bearer TP UL (kbps)
combined_metrics = {
    'Avg_TCP_transmition': ('TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)'),
    'Avg_RTT': ('Avg RTT DL (ms)', 'Avg RTT UL (ms)'),
    'Avg_TP': ('Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)'),
}
for metric_name, (dl_col, ul_col) in combined_metrics.items():
    data[metric_name] = data[dl_col] + data[ul_col]
list(data.columns)

['Bearer Id',
 'Start',
 'Start ms',
 'End',
 'End ms',
 'Dur. (ms)',
 'IMSI',
 'MSISDN/Number',
 'IMEI',
 'Last Location Name',
 'Avg RTT DL (ms)',
 'Avg RTT UL (ms)',
 'Avg Bearer TP DL (kbps)',
 'Avg Bearer TP UL (kbps)',
 'DL TP < 50 Kbps (%)',
 '50 Kbps < DL TP < 250 Kbps (%)',
 '250 Kbps < DL TP < 1 Mbps (%)',
 'DL TP > 1 Mbps (%)',
 'UL TP < 10 Kbps (%)',
 '10 Kbps < UL TP < 50 Kbps (%)',
 '50 Kbps < UL TP < 300 Kbps (%)',
 'UL TP > 300 Kbps (%)',
 'Activity Duration DL (ms)',
 'Activity Duration UL (ms)',
 'Dur. (ms).1',
 'Handset Manufacturer',
 'Handset Type',
 'Nb of sec with Vol DL < 6250B',
 'Nb of sec with Vol UL < 1250B',
 'Social Media DL (Bytes)',
 'Social Media UL (Bytes)',
 'Google DL (Bytes)',
 'Google UL (Bytes)',
 'Email DL (Bytes)',
 'Email UL (Bytes)',
 'Youtube DL (Bytes)',
 'Youtube UL (Bytes)',
 'Netflix DL (Bytes)',
 'Netflix UL (Bytes)',
 'Gaming DL (Bytes)',
 'Gaming UL (Bytes)',
 'Other DL (Bytes)',
 'Other UL (Bytes)',
 'Total UL (Bytes)',
 'Total DL (By

In [30]:
# Narrow the frame down to the columns this analysis uses: the three combined
# metrics, the handset type and the MSISDN/Number used to identify customers.

useful_columns = ['Avg_TCP_transmition', 'Avg_RTT', 'Avg_TP', 'Handset Type', 'MSISDN/Number']

analysis_data = data.loc[:, useful_columns]
list(analysis_data.columns)

['Avg_TCP_transmition', 'Avg_RTT', 'Avg_TP', 'Handset Type', 'MSISDN/Number']

In [35]:
# Per-customer totals: group rows by MSISDN/Number and sum each metric, then
# convert the TCP retransmission total from bytes to MB (divide by 10**6).
# NOTE(review): the RTT and throughput columns are summed too, not averaged —
# confirm that totals are what is wanted for them.

pd.options.display.float_format = lambda value: '%.3f' % value  # render floats with 3 decimals
metric_columns = ['Avg_TCP_transmition', 'Avg_RTT', 'Avg_TP']
grouped_data_total = analysis_data.groupby('MSISDN/Number')[metric_columns].sum()
grouped_data_total['Avg_TCP_transmition'] = grouped_data_total['Avg_TCP_transmition'].div(10**6)

In [41]:
# Top 10 customers by total TCP retransmission volume (MB), largest first.
# `grouped_data_total` is rebound to the sorted frame, so it stays in this order.

grouped_data_total = grouped_data_total.sort_values('Avg_TCP_transmition', ascending=False)
grouped_data_total.iloc[:10]

Unnamed: 0_level_0,Avg_TCP_transmition,Avg_RTT,Avg_TP
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33763554982.0,6131.138,354.656,3775.0
33662577723.0,4419.671,121.0,44939.0
33660325702.0,4357.575,652.0,178695.0
33763887232.0,4352.475,141.0,274399.0
33664402022.0,4297.079,1735.0,182269.0
33763023112.0,4294.432,26.0,91.0
33650384835.0,4292.242,63.0,59254.0
33763875786.0,4288.121,66.0,38330.0
33662969744.0,4275.492,123.828,17.0
33650287330.0,4268.647,30.0,63.0


In [42]:
# Top 10 customers by total RTT (ms), largest first.
# `grouped_data_total` is rebound to the sorted frame, so it stays in this order.

grouped_data_total = grouped_data_total.sort_values('Avg_RTT', ascending=False)
grouped_data_total.iloc[:10]

Unnamed: 0_level_0,Avg_TCP_transmition,Avg_RTT,Avg_TP
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33662317023.0,21.649,96924.0,3.0
33660874265.0,43.298,64670.0,194.0
33683692867.0,21.649,54848.0,21.0
33698551167.0,2.034,46021.0,303.0
33761813523.0,54.801,37084.0,21200.0
33668791629.0,25.017,36304.0,108.0
33760941100.0,0.062,27278.0,5148.0
33606788933.0,0.003,26300.0,144.0
33671816754.0,21.649,25715.0,71.0
33781865588.0,21.649,25388.0,1608.0
