<h1>User Engagement Analysis</h1>

In [42]:
# Importing the necessary libraries and packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [32]:
# Render every float in displayed output with exactly three decimal places.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Load the user overview CSV that the following cells rename and aggregate.
engagement_data = pd.read_csv('../data/user_overview_data.csv')

In [33]:
# Rename the raw columns to the engagement-metric names used from here on.
# rename() returns a new frame, so the result is reassigned rather than
# mutating the loaded frame with inplace=True (which brings no benefit here
# and prevents method chaining).
engagement_data = engagement_data.rename(
    columns={
        'Bearer Id': 'Session Frequency',
        'Dur. (ms)': 'Total Duration',
        'Total_UL_and_DL': 'Total traffic in MBs',
    }
)

In [34]:
# Collapse the rows into one row per 'MSISDN/Number':
#   - 'Session Frequency'    -> count of non-null entries per user
#   - 'Total Duration'       -> sum over the user's rows
#   - 'Total traffic in MBs' -> sum over the user's rows
engagement_data = (
    engagement_data
    .groupby('MSISDN/Number')
    .agg({
        'Session Frequency': 'count',
        'Total Duration': 'sum',
        'Total traffic in MBs': 'sum',
    })
)

In [35]:
# Keep only the three engagement metrics for the analysis below.
engagement_cols = ['Session Frequency', 'Total Duration', 'Total traffic in MBs']
engagement_agg = engagement_data.loc[:, engagement_cols]

In [36]:
# Rank users from most to fewest sessions and show the ten at the top.
# The sorted frame is stored back in engagement_agg, so later cells see
# the rows in this order.
engagement_agg = engagement_agg.sort_values('Session Frequency', ascending=False)
engagement_agg.iloc[:10]

Unnamed: 0_level_0,Session Frequency,Total Duration,Total traffic in MBs
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33626320676.0,18,1785419.0,7971.167
33614892860.0,17,1678397.0,7602.058
33625779332.0,17,1599052.0,7708.276
33659725664.0,16,1905478.0,6932.267
33760536639.0,15,1651084.0,8514.774
33675877202.0,15,1745131.0,7891.111
33667163239.0,13,1209703.0,5618.394
33604515716.0,12,1193033.0,5081.583
33760413819.0,12,1248477.0,5902.169
33603127838.0,12,1109736.0,4976.195


In [37]:
# Rank users from longest to shortest total duration and show the ten at
# the top. The sorted frame is stored back in engagement_agg, so later
# cells see the rows in this order.
engagement_agg = engagement_agg.sort_values('Total Duration', ascending=False)
engagement_agg.iloc[:10]

Unnamed: 0_level_0,Session Frequency,Total Duration,Total traffic in MBs
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33659725664.0,16,1905478.0,6932.267
33626320676.0,18,1785419.0,7971.167
33659359429.0,11,1747994.0,4658.437
33675877202.0,15,1745131.0,7891.111
33614892860.0,17,1678397.0,7602.058
33760536639.0,15,1651084.0,8514.774
33625779332.0,17,1599052.0,7708.276
33786323068.0,12,1427807.0,5622.232
33664712899.0,11,1334982.0,6472.786
33606582008.0,9,1322251.0,4352.745


In [38]:
# Rank users from highest to lowest total traffic and show the ten at the
# top. The sorted frame is stored back in engagement_agg, so later cells
# see the rows in this order.
engagement_agg = engagement_agg.sort_values('Total traffic in MBs', ascending=False)
engagement_agg.iloc[:10]

Unnamed: 0_level_0,Session Frequency,Total Duration,Total traffic in MBs
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33760536639.0,15,1651084.0,8514.774
33626320676.0,18,1785419.0,7971.167
33675877202.0,15,1745131.0,7891.111
33625779332.0,17,1599052.0,7708.276
33614892860.0,17,1678397.0,7602.058
33659725664.0,16,1905478.0,6932.267
33666464084.0,11,1168703.0,6530.983
33664712899.0,11,1334982.0,6472.786
33698792269.0,11,1052303.0,6149.754
33658361927.0,10,1173433.0,6051.52


In [39]:
# Pick the engagement metrics and standardize them column by column
# (StandardScaler centers each column and scales it to unit variance).
features = ['Session Frequency', 'Total Duration', 'Total traffic in MBs']
features_array = StandardScaler().fit_transform(engagement_agg[features].values)
features_array

array([[16.85918142, 15.12169104, 16.85321169],
       [20.57669774, 16.45580955, 15.68633385],
       [16.85918142, 16.05569814, 15.51448841],
       ...,
       [-0.48922805,  0.61443461, -1.35117693],
       [-0.48922805,  0.37483255, -1.35208661],
       [-0.48922805, -0.85007167, -1.35278816]])

In [40]:
# Sanity-check the standardization.
# The scaler works per column, so every feature (not just the pooled array)
# should end up with mean ~0 and standard deviation ~1. A pooled check alone
# can pass even when individual columns are badly scaled, so each column is
# asserted separately first.
column_means = features_array.mean(axis=0)
column_stds = features_array.std(axis=0)
assert np.allclose(column_means, 0.0, atol=1e-8), f"feature means are not ~0: {column_means}"
assert np.allclose(column_stds, 1.0, atol=1e-8), f"feature stds are not ~1: {column_stds}"

# Displayed summary: pooled mean, pooled std, and the array shape (rows, features).
np.mean(features_array), np.std(features_array), features_array.shape

(1.138957652660259e-17, 1.0, (106471, 3))

In [41]:
# Wrap the standardized array in a DataFrame whose columns carry the metric
# names. Rows get a fresh 0..n-1 index; the MSISDN index of engagement_agg
# is not carried over.
normalized_data = pd.DataFrame(data=features_array, columns=features)
normalized_data

Unnamed: 0,Session Frequency,Total Duration,Total traffic in MBs
0,16.859,15.122,16.853
1,20.577,16.456,15.686
2,16.859,16.056,15.514
3,19.338,14.605,15.122
4,19.338,15.393,14.894
...,...,...,...
106466,-0.489,-0.895,-1.343
106467,-0.489,-0.256,-1.343
106468,-0.489,0.614,-1.351
106469,-0.489,0.375,-1.352


In [48]:
# Cluster the standardized engagement metrics into k=3 groups.
#
# - random_state pins the centroid initialization so the cluster labels are
#   reproducible across kernel restarts.
# - n_init is set explicitly because its default differs between
#   scikit-learn releases.
# - fit_predict() already fits the model, so a separate fit() call beforehand
#   would only repeat the same work.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_array)
clusters

array([2, 2, 2, ..., 1, 1, 1])