engagement score to each user. Consider the engagement score as the Euclidean distance between the user data point & the less engaged cluster (use the first clustering for this)

In [17]:
import pandas as pd
import sys  
sys.path.insert(0, '../scripts')
sys.path.insert(0, '../models')
import clean_data
import utilities
import norm_scaling
import loading_data
import pickle
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

In [18]:
#loaded file
df= loading_data.load_csv('../data/data.csv')
cleaned_data = clean_data.Handle_missing_values(df,drop_cols=False,drop_rows=False)



  
['Start', 'End', 'Last Location Name', 'Handset Manufacturer', 'Handset Type']


In [19]:
#load our trained kmeans model for user engagement and user experience
with open("../models/user_engag_model.pkl", 'rb') as file:  
    Pickled_userEngagement_Model = pickle.load(file)
with open("../models/user_exper_model.pkl", 'rb') as file:  
    Pickled_userExperience_Model = pickle.load(file)   


To calculate the euclidean distance between each datapoint and the less engaged user cluster we will use the centroid of cluster 0 and each normalized user data points

In [20]:
#the center for the first cluster
engagement_cluster0_center = Pickled_userEngagement_Model.cluster_centers_[0,:]
experience_cluster0_center = Pickled_userExperience_Model.cluster_centers_[1,:]
engagement_cluster0_center,experience_cluster0_center

(array([0.11979774, 0.01099528, 0.11897878, 0.12120409]),
 array([0.00089951, 0.00530876, 0.00801262, 0.01002238, 0.27762946,
        0.00103293]))

In [21]:
# now we take the datapoints for the appropriate user engagement metrics  
cleaned_copy = cleaned_data.copy()
df_user = cleaned_copy.groupby("MSISDN/Number")


df_session_freq = pd.DataFrame(df_user["Bearer Id"].count())
df_session_dur = pd.DataFrame(df_user["Dur. (ms)"].sum())
df_session_UL = pd.DataFrame(df_user["Total UL (Bytes)"].sum())
df_session_DL = pd.DataFrame(df_user["Total DL (Bytes)"].sum())

df_RTT_DL = pd.DataFrame(df_user["Avg RTT DL (ms)"].sum())
df_RTT_UL = pd.DataFrame(df_user["Avg RTT UL (ms)"].sum())
df_TP_DL = pd.DataFrame(df_user["Avg Bearer TP DL (kbps)"].sum())
df_TP_UL = pd.DataFrame(df_user["Avg Bearer TP UL (kbps)"].sum())
df_TCP_DL = pd.DataFrame(df_user["TCP DL Retrans. Vol (Bytes)"].sum())
df_TCP_UL = pd.DataFrame(df_user["TCP UL Retrans. Vol (Bytes)"].sum())

In [22]:
# normalize them 
normalized_data_freq = norm_scaling.normalize(df_session_freq)
normalized_data_dur = norm_scaling.normalize(df_session_dur)
normalized_data_UL = norm_scaling.normalize(df_session_DL)
normalized_data_DL = norm_scaling.normalize(df_session_UL)

normalized_RTT_DL = norm_scaling.normalize(df_RTT_DL)
normalized_RTT_UL = norm_scaling.normalize(df_RTT_UL)
normalized_TP_DL = norm_scaling.normalize(df_TP_DL)
normalized_TP_UL = norm_scaling.normalize(df_TP_UL)
normalized_TCP_DL = norm_scaling.normalize(df_TCP_DL)
normalized_TCP_UL = norm_scaling.normalize(df_TCP_UL)


X_engag = np.hstack((normalized_data_freq,normalized_data_dur,normalized_data_DL,normalized_data_UL))
X_exper = np.hstack((normalized_RTT_DL,normalized_RTT_UL,normalized_TP_DL,normalized_TP_UL,normalized_TCP_DL,normalized_TCP_UL))

In [23]:
# calculate satisfaction score 
engagement_score= euclidean_distances(X_engag,engagement_cluster0_center.reshape(1, -1))
experience_score= euclidean_distances(X_exper,experience_cluster0_center.reshape(1, -1))
satisfaction_score = ((engagement_score+experience_score)/2).flatten()
df_user_score = pd.DataFrame({"user_id":df_user["MSISDN/Number"].unique(),"satisfaction_score":satisfaction_score}).sort_values(by="satisfaction_score",ascending=False).head(10)

df_user_score,satisfaction_score



(                           user_id  satisfaction_score
 MSISDN/Number                                         
 33,663,706,799.00  [33663706799.0]                1.97
 33,669,054,076.00  [33669054076.0]                0.62
 33,658,249,284.00  [33658249284.0]                0.39
 33,763,588,772.00  [33763588772.0]                0.38
 33,667,725,464.00  [33667725464.0]                0.36
 33,659,084,281.00  [33659084281.0]                0.34
 33,659,778,586.00  [33659778586.0]                0.33
 33,662,317,023.00  [33662317023.0]                0.32
 33,665,090,461.00  [33665090461.0]                0.32
 33,664,698,321.00  [33664698321.0]                0.31,
 array([0.24228107, 0.24269073, 0.24242186, ..., 0.24254936, 0.24228781,
        0.24247019]))

Build a regression model of your choice to predict the satisfaction score of a customer. To build satisfaction score we will take all engagement and experience analysis metrics. So since we have a continous type y, we will use a simple linear regression.

In [24]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

In [25]:
X = np.hstack((X_engag,X_exper))

In [26]:
clf = LinearRegression().fit(X, satisfaction_score)

In [27]:
clf.coef_

array([ 2.53073228, -0.30941127, -0.1129131 , -0.21050975,  0.27175037,
        0.08032218, -0.29057944, -0.07060393, -0.29852672,  0.30356674])

Kmeans(k=2) on engagement and experience scores

In [28]:
X_scores = np.hstack((engagement_score,experience_score))
X_scores.shape

(2,)

In [29]:
kmeans = KMeans(n_clusters=2).fit(X_scores)
pickle.dump(kmeans, open("../models/scores_model.pkl", 'wb'))
kmeans_pred = kmeans.predict(X_scores)

Aggregate the average satisfaction & experience score per cluster.

In [15]:
#collect each score values in each cluster
cluster_no = 2
clusters = {}
container_eng = []
container_exp = []
for i in range(0,cluster_no):
    for x_eng,x_exp,y in zip(X_scores[:,0],X_scores[:,1],kmeans_pred):    
        if y == i:
            container_eng.append(x_eng)
            container_exp.append(x_exp)
    clusters["cluster_"+str(i)] = {"engagement_scores":np.array(container_eng),"experience_scores":np.array(container_exp)}
    container_eng = []
    container_exp = []

In [17]:
#calculate the average scores of each cluster
score_names = ["engagement_scores","experience_scores"]
scores = 2
print()
for i in range(2):
    for score in range(0,scores):
        data_cluster = clusters["cluster_"+str(i)][score_names[score]]
        print(len(data_cluster))
        print("cluster "+str(i)+ ",avg of "+score_names[score],data_cluster.mean())



106567
cluster 0,avg of engagement_scores 0.20641571458435432
106567
cluster 0,avg of experience_scores 0.2774985875307438
289
cluster 1,avg of engagement_scores 0.20491321865085432
289
cluster 1,avg of experience_scores 0.09849401879379184


Export your final table containing all user id + engagement, experience & satisfaction scores in your local MySQL database. Report a screenshot of a select query output on the exported table. 



In [201]:
# prepare a dataframe with user_id, engagement_score, experience_score and satisfaction_score
df_scores = pd.DataFrame({"user_id":df_user["MSISDN/Number"].groups.keys(),"engagement_score":engagement_score.flatten().astype(np.float64),"experience_score":experience_score.flatten(),"satisfaction_score":satisfaction_score})
print(df_scores.head(10))

# save this dataframe to the database
df_scores.to_csv('../data/user_data_scores.csv', index=False)
print('File Successfully Saved.!!!')

            user_id  engagement_score  experience_score  satisfaction_score
0 33,601,001,722.00              0.09              0.28                0.19
1 33,601,001,754.00              0.09              0.28                0.19
2 33,601,002,511.00              0.09              0.28                0.19
3 33,601,007,832.00              0.09              0.28                0.19
4 33,601,008,617.00              0.09              0.28                0.18
5 33,601,010,682.00              0.09              0.28                0.19
6 33,601,011,634.00              0.09              0.28                0.19
7 33,601,011,959.00              0.09              0.28                0.19
8 33,601,014,694.00              0.09              0.28                0.19
9 33,601,020,306.00              0.09              0.28                0.19
File Successfully Saved.!!!


Task 4.7 Model deployment tracking- deploy the model and monitor your model. Here you can use Docker or other MlOps tools which can help you to track your model’s change.  Your model tracking report includes code version, start and end time, source, parameters, metrics (loss convergence) and artifacts or any output file regarding each specific run. (CSV file, screenshot)