In [51]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# PROBLEM STATEMENT
ABC is an online content sharing platform that enables users to create, upload and share the content in the form of videos. It includes videos from different genres like entertainment, education, sports, technology and so on. The maximum duration of video is 10 minutes.

Users can like, comment and share the videos on the platform. 

Based on the user’s interaction with the videos, engagement score is assigned to the video with respect to each user. Engagement score defines how engaging the content of the video is. 

Understanding the engagement score of the video improves the user’s interaction with the platform. It defines the type of content that is appealing to the user and engages the larger audience.

In [52]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Inspecting the Data Set to look for missing values or anomalies
Dataset.describe() gives summary statistics for data set.
We have to look out for minimum and maximum values to look for anomalies, what we are looking for is sudden change or something more than 2 standdard deviation away from mean, especially when its unexpected.



In [53]:
train.describe() 

Unnamed: 0,row_id,user_id,category_id,video_id,age,followers,views,engagement_score
count,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0
mean,44599.0,13881.909806,18.323733,77.715383,24.848616,252.460172,502.980268,3.487797
std,25749.100318,8005.582771,11.675154,48.469656,8.955535,46.094468,268.569482,0.863498
min,1.0,1.0,1.0,1.0,10.0,160.0,30.0,0.0
25%,22300.0,6945.0,8.0,34.0,18.0,230.0,229.0,2.9
50%,44599.0,13892.0,16.0,76.0,23.0,240.0,467.0,3.71
75%,66898.0,20819.0,26.0,120.0,32.0,280.0,714.0,4.15
max,89197.0,27734.0,47.0,175.0,68.0,360.0,1000.0,5.0


In [54]:
test.describe()

Unnamed: 0,row_id,user_id,category_id,video_id,age,followers,views
count,11121.0,11121.0,11121.0,11121.0,11121.0,11121.0,11121.0
mean,94758.0,13825.7074,15.666487,79.742559,24.85415,249.691574,454.959986
std,3210.500506,8001.221954,10.313259,48.702501,8.953976,38.511469,249.759038
min,89198.0,2.0,1.0,1.0,10.0,160.0,44.0
25%,91978.0,6881.0,8.0,39.0,18.0,230.0,229.0
50%,94758.0,13857.0,12.0,80.0,23.0,240.0,369.0
75%,97538.0,20763.0,25.0,121.0,32.0,270.0,662.0
max,100318.0,27734.0,46.0,174.0,68.0,360.0,990.0


After seeing different quartiles and max as well as min values no anomally detected.
Also dataset is clean and doesn't require much cleaning.

In [55]:
train.isnull().sum()

row_id              0
user_id             0
category_id         0
video_id            0
age                 0
gender              0
profession          0
followers           0
views               0
engagement_score    0
dtype: int64

# DEFINING THE PROBLEM:-
Engagement with  video or any content is a joint product of both User and the content.
Engagement score is calculated based on users interaction with the content.
Certain users engage more with the content in comparison to other users.
Also certain content get more engagement in comparison to other content. Example:controverisial videos attract greater attention.
So lets try to figure out key determinant of engagement score:
1)Past engagement score of users(mapped with help of USER_ID)
2)Past engagement with a particular video(Mapped with Video_ID)
Infact user_Id and video_Id cover all data aboutnan individual and a video.
We use other demographic/video data to refer to or categorise an user, while user_id directly captures it.

Also The values with which we encode capture mean of past behaviour

So why use other variables?
Unless we use other demogrpahic data our model would be fitted only for a particular set of users and won't accomdate new users,s o when user ID or video id is absent our model won't be cluless.

So we are going to treat user id , video_id,video_category,gender and Profession as categorical variables.




In [56]:
uniq_users= len(pd.unique(train['user_id']))
uniq_videos = len(pd.unique(train['video_id']))
uniq_categories = len(pd.unique(train['category_id']))
print('The number of unique users are {0} ,number of unique videos are {1} and unique video categories are \t{2}'.format(uniq_users,uniq_videos,uniq_categories))

The number of unique users are 27734 ,number of unique videos are 175 and unique video categories are 	47


The test set also asks us to predict for same users, videos and video categories.

Since the number of categorical values are large we are going to do target encoding otherwise number of categorical variables would become unnecessary large.
It's better to encode in new columns as we have to use same values in test file.


In [57]:
df_user = train.groupby('user_id')['engagement_score'].mean()
df_user = df_user.reset_index()   
df_user_std = train.groupby(['user_id','category_id'])['engagement_score'].mean()
df_user_std = df_user_std.reset_index()
df_video_id =  train.groupby('video_id')['engagement_score'].mean() 
df_video_id = df_video_id.reset_index()
df_video_categories =  train.groupby('category_id')['engagement_score'].mean() 
df_video_categories = df_video_categories.reset_index()
    

We should add white noise to data to avoid the problem of overfitting,however adding noise decreases model efficiency as seen from the result of test dataset.

So the approach is avoided. Nevertheless it's a best practice, in case of targeted encoding.

Also we need to encode training and test set in the same manner, so that fitting the model and predicting results becomes easy.


In [58]:
train =train.merge(df_user, on='user_id',how = 'left')
test =test.merge(df_user, on='user_id',how = 'left')

In [59]:
test.rename(columns = {'engagement_score':'user_id_encoded'}, inplace = True)
train.rename(columns = {'engagement_score':'user_id_encoded'}, inplace = True)

In [60]:
train= train.merge(df_video_id, on='video_id',how = 'left')


train.rename(columns = {'engagement_score':'video_id_encoded'}, inplace = True)

In [61]:
test= test.merge(df_video_id, on='video_id',how = 'left')

test.rename(columns = {'engagement_score':'video_id_encoded'}, inplace = True)

In [62]:
train= train.merge(df_video_categories, on='category_id',how = 'left')
test= test.merge(df_video_categories, on='category_id',how = 'left')

In [63]:
train.rename(columns = {'engagement_score_y':'video_category_encoded'}, inplace = True)
test.rename(columns = {'engagement_score':'video_category_encoded'}, inplace = True)

In [64]:
df_user_std.rename(columns = {'engagement_score':'user_std_dev'}, inplace = True)

In [65]:
df_user_std['user_std_dev'] = (df_user_std['user_std_dev'] - df_user_std['user_std_dev'].mean())/(df_user_std['user_std_dev'].std())
df_user_std

Unnamed: 0,user_id,category_id,user_std_dev
0,1,1,-0.243207
1,1,2,-0.708722
2,2,3,0.120850
3,2,4,-0.859915
4,3,5,1.010103
...,...,...,...
49885,27731,46,-1.281664
49886,27732,29,0.461034
49887,27733,8,-0.726627
49888,27733,13,-1.818797


We are Done with target encoding and now coming to categorical variable encoding keeping in mind the dummy variable trap

In [66]:
train.loc[train['gender'] == 'Male', 'gender_encoded'] = 1
train.loc[train['gender'] == 'Female', 'gender_encoded'] = 0
test.loc[test['gender'] == 'Male', 'gender_encoded'] = 1
test.loc[test['gender'] == 'Female', 'gender_encoded'] = 0

In [67]:
train.loc[train['profession'] == 'Student', 'Student'] = 1
train.loc[train['profession'] != 'Student', 'Student'] = 0
test.loc[test['profession'] == 'Student', 'Student'] = 1
test.loc[test['profession'] != 'Student', 'Student'] = 0

In [68]:
train.loc[train['profession'] == 'Working Professional', 'Working_professional'] = 1
train.loc[train['profession'] != 'Working Professional', 'Working_professional'] = 0
test.loc[test['profession'] == 'Working Professional', 'Working_professional'] = 1
test.loc[test['profession'] != 'Working Professional', 'Working_professional'] = 0

In [69]:
train =train.merge(df_user_std, on='user_id',how = 'left')
test =test.merge(df_user_std, on='user_id',how = 'left')
train.rename(columns = {'engagement_score':'user_id_std'}, inplace = True)
test.rename(columns = {'engagement_score':'user_id_std'}, inplace = True)

In [70]:
train.columns

Index(['row_id', 'user_id', 'category_id_x', 'video_id', 'age', 'gender',
       'profession', 'followers', 'views', 'engagement_score_x',
       'video_category_encoded', 'video_id_encoded', 'user_id_std',
       'gender_encoded', 'Student', 'Working_professional', 'category_id_y',
       'user_std_dev'],
      dtype='object')

NOW THAT DATA OF TRINING SET IS CLEANED AND ENCODED, ITS BETTER TO SAVE IT IN A NEW CSV FILE FOR FURTHER MODEL FITTING

In [71]:
train_new = pd.read_csv('train_new.csv') # better to save one copy in a csv file, easy to experiment
test_new = pd.read_csv('test_new.csv')

Scale the non_categorical values to ensure better convergence.

In [72]:
Standard_scaler = StandardScaler() 


In [73]:
train_new['age'] = Standard_scaler.fit_transform(train_new[['age']])
train_new['followers'] =  Standard_scaler.fit_transform(train_new[['followers']])
train_new['views'] =  Standard_scaler.fit_transform(train_new[['views']])
train_new['user_id_encoded'] =  Standard_scaler.fit_transform(train_new[['user_id_encoded']])
train_new['video_id_encoded'] =  Standard_scaler.fit_transform(train_new[['video_id_encoded']])
train_new['video_category_encoded'] =  Standard_scaler.fit_transform(train_new[['video_category_encoded']])

In [74]:
test_new['age'] = Standard_scaler.fit_transform(test_new[['age']])
test_new['followers'] =  Standard_scaler.fit_transform(test_new[['followers']])
test_new['views'] =  Standard_scaler.fit_transform(test_new[['views']])
test_new['user_id_encoded'] =  Standard_scaler.fit_transform(test_new[['user_id_encoded']])
test_new['video_id_encoded'] =  Standard_scaler.fit_transform(test_new[['video_id_encoded']])
test_new['video_category_encoded'] =  Standard_scaler.fit_transform(test_new[['video_category_encoded']])

In [75]:
train_new= train_new.merge(df_user_std, on='user_id',how = 'left')
test_new= test_new.merge(df_user_std, on='user_id',how = 'left')

DATAFRAME IS NOW SCALED
TIME TO FIT THE MODELS AMD STUDY RELATIONSHIPS BETWEEN DATASETS

In [76]:
train_new.corr().loc['engagement_score'].sort_values(ascending = False)

engagement_score          1.000000
user_id_encoded           0.786260
user_std_dev              0.634308
gender_encoded            0.392167
video_id_encoded          0.320661
video_category_encoded    0.299665
Student                   0.287719
followers                 0.008562
user_id                  -0.002606
row_id                   -0.002870
video_id                 -0.030906
category_id_y            -0.050357
views                    -0.093545
category_id_x            -0.096454
Working_professional     -0.222198
age                      -0.231069
Name: engagement_score, dtype: float64

No surprises here as user id and video id show maximum correlation with engagement.
Note:- gender is already covered by user id

In [77]:
mu, sigma = 0, 0.2 # mean and standard deviation
s = np.random.normal(mu, sigma, len(train_new))
train_new['Modified engagement_Score'] = train_new['engagement_score'].values + s
#

In [None]:
#Features = ['user_id_encoded','age','gender_encoded','Student','Working_professional','followers','video_category_encoded','video_id_encoded','views','followers','user_std_dev']
Features = ['user_std_dev']
x = df_user_std[Features].values
y = df_user_std[['engagement_score']].values

# Feature selection:-
User_id alone gave R2 score of .37 in test variables.

User_id + video_id explained 40% of variance

Rest of the features account for very less improvement if any at all.


# Model Slection:-
After Going through a number of regression models from linear to neural networks. Three models which gave best performance were shortlisted.
The three are:-
1)Linear regression
2)Neural network regressor
3)Random forest regressor
*Random Forest worked best on training data but was over fitted and performance degraded for testing data.

*Linear regressor peroformed best on test data and was most cnsisitent. It showed acceptable bias and comparitively low variance.

*Neural Network model showed low variance and acceptable bias, infact bias was lower than linear regression for training data. However it performed poorly for training data. 
# Why Linear regression?
THe reason behind better performance of Linear regression over other algorithms can be attributed to the fact that dataset is comparitively small and spread over too many categories. This results in algorithms like Neural Networks failing to converge,Algorithms like random forest overfit the data wheras simple algorithm like Linear regression performs best for dataset.

Linear regression did the best job and gave the best result

In [84]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.05, random_state = 22)
regressor1 = LinearRegression()
regressor1.fit(x_train,y_train)
y_pred = regressor1.predict(x_test)
r2_score(y_test,y_pred)

0.6549083802710577

In [None]:
regr = MLPRegressor(random_state = 2, max_iter=1000)

In [None]:
regr.fit(x_train,y_train)

  return f(*args, **kwargs)


In [None]:
y_pred = regr.predict(x_test)
test_new['engagement_score']= pd.DataFrame(y_pred)
submission = test_new[['row_id','engagement_score']]

r2_score(y_test,y_pred)

In [None]:
regrf = RandomForestRegressor(n_estimators = 100,random_state = 0)
regrf.fit(x,y)

In [None]:
y_pred = regrf.predict(x_test)
r2_score(y_test,y_pred)

In [None]:
x_test_leaderboard = test_new[Features].values

In [None]:
y_pred = regressor1.predict(x_test_leaderboard)
test_new['engagement_score']= pd.DataFrame(y_pred)
submission = test_new.loc[:,['row_id','engagement_score']]
submission.to_csv('submission_Linear_regression.csv')

In [None]:
y_pred = regr.predict(x_test_leaderboard)
test_new['engagement_score']= pd.DataFrame(y_pred)
submission = test_new.loc[:,['row_id','engagement_score']]
submission.to_csv('submission_neural network.csv')

In [None]:
#y_pred = regrf.predict(x_test_leaderboard)
test_new['engagement_score']= pd.DataFrame(y_pred)
submission = test_new.loc[:,['row_id','engagement_score']]
submission.to_csv('submission_tree.csv')

Linear regression gave the best result