In [511]:
# !wget https://datahack-prod.s3.amazonaws.com/train_file/train_0OECtn8.csv
# !wget https://datahack-prod.s3.amazonaws.com/test_file/test_1zqHu22.csv

In [512]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import seaborn as sns

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score,make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [513]:
raw_train = pd.read_csv("train_0OECtn8.csv")
raw_test = pd.read_csv("test_1zqHu22.csv")

In [514]:
print(f"Train shape {raw_train.shape}, Test shape {raw_test.shape}")

Train shape (89197, 10), Test shape (11121, 9)


In [515]:
raw_train.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35


In [516]:
raw_train.tail(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
89194,89195,13655,16,97,25,Male,Student,270,462,4.23
89195,89196,24840,9,18,35,Male,Working Professional,230,819,3.77
89196,89197,27183,25,150,13,Male,Student,240,317,4.31


In [517]:
raw_test.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
0,89198,7986,12,42,14,Male,Student,180,138
1,89199,11278,34,115,14,Male,Student,230,840
2,89200,17245,8,110,44,Female,Working Professional,280,628


In [518]:
raw_test.tail(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
11118,100316,2042,16,98,22,Male,Student,270,462
11119,100317,24626,8,16,33,Male,Other,280,628
11120,100318,967,8,74,33,Male,Working Professional,280,628


In [519]:
raw_train.isna().sum()

row_id              0
user_id             0
category_id         0
video_id            0
age                 0
gender              0
profession          0
followers           0
views               0
engagement_score    0
dtype: int64

In [520]:
raw_test.isna().sum()

row_id         0
user_id        0
category_id    0
video_id       0
age            0
gender         0
profession     0
followers      0
views          0
dtype: int64

In [521]:
raw_train.describe()

Unnamed: 0,row_id,user_id,category_id,video_id,age,followers,views,engagement_score
count,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0,89197.0
mean,44599.0,13881.909806,18.323733,77.715383,24.848616,252.460172,502.980268,3.487797
std,25749.100318,8005.582771,11.675154,48.469656,8.955535,46.094468,268.569482,0.863498
min,1.0,1.0,1.0,1.0,10.0,160.0,30.0,0.0
25%,22300.0,6945.0,8.0,34.0,18.0,230.0,229.0,2.9
50%,44599.0,13892.0,16.0,76.0,23.0,240.0,467.0,3.71
75%,66898.0,20819.0,26.0,120.0,32.0,280.0,714.0,4.15
max,89197.0,27734.0,47.0,175.0,68.0,360.0,1000.0,5.0


In [522]:
raw_train.describe(include=np.object)

Unnamed: 0,gender,profession
count,89197,89197
unique,2,3
top,Male,Student
freq,52397,44638


In [523]:
raw_train['user_id'].value_counts().min(),raw_train['user_id'].value_counts().max()

(2, 10)

In [524]:
raw_train['user_id'].value_counts().head(3)

5198     10
13218    10
1448     10
Name: user_id, dtype: int64

In [525]:
raw_train[raw_train['user_id']==5198]

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
22708,22709,5198,5,56,32,Male,Other,240,229,4.3
28089,28090,5198,5,79,32,Male,Other,240,229,4.6
32687,32688,5198,5,9,32,Male,Other,240,229,4.5
33424,33425,5198,5,169,32,Male,Other,240,229,4.28
35305,35306,5198,5,163,32,Male,Other,240,229,4.26
48919,48920,5198,5,161,32,Male,Other,240,229,4.47
62534,62535,5198,5,69,32,Male,Other,240,229,4.13
73528,73529,5198,5,90,32,Male,Other,240,229,4.44
79107,79108,5198,5,10,32,Male,Other,240,229,4.51
85219,85220,5198,5,155,32,Male,Other,240,229,4.44


In [526]:
common_user_id = set(raw_train['user_id'].unique()).intersection(set(raw_test['user_id'].unique()))
len(common_user_id)

10384

In [527]:
raw_train['user_id'].nunique(), raw_test['user_id'].nunique()

(27734, 10384)

In [528]:
raw_train['category_id'].value_counts(normalize=True).sort_index()

1     0.020292
2     0.001872
3     0.020685
4     0.041235
5     0.090855
6     0.015684
7     0.021133
8     0.070776
9     0.021144
10    0.013644
11    0.042221
12    0.043600
13    0.031133
14    0.006009
15    0.024261
16    0.036593
17    0.013543
18    0.025124
19    0.052457
20    0.007870
21    0.045327
22    0.004103
23    0.029945
24    0.005875
25    0.054710
26    0.012960
27    0.012366
28    0.014675
29    0.010269
30    0.006144
31    0.020797
32    0.030405
33    0.005796
34    0.036963
35    0.011054
36    0.011491
37    0.010785
38    0.005987
39    0.015012
40    0.005135
41    0.006110
42    0.024418
43    0.011424
44    0.002399
45    0.002545
46    0.002646
47    0.000527
Name: category_id, dtype: float64

In [529]:
raw_train['video_id'].value_counts(normalize=False).head(10)

112    1337
53     1334
1      1282
65     1103
42     1077
46      938
4       932
10      921
5       913
87      902
Name: video_id, dtype: int64

In [530]:
raw_train[raw_train['video_id']==10]

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
115,116,476,5,10,21,Female,Student,240,229,2.48
181,182,16877,5,10,29,Female,Other,240,229,3.22
214,215,13064,5,10,33,Female,Other,240,229,2.30
357,358,25281,5,10,16,Male,Student,240,229,3.80
399,400,3718,5,10,14,Female,Student,240,229,4.57
...,...,...,...,...,...,...,...,...,...,...
88844,88845,16113,5,10,15,Female,Student,240,229,2.80
88901,88902,21712,5,10,24,Male,Other,240,229,3.64
89066,89067,17572,5,10,13,Male,Student,240,229,3.62
89133,89134,26217,5,10,20,Female,Student,240,229,2.19


In [531]:
raw_test[raw_test['video_id']==10]

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
51,89249,4033,5,10,26,Female,Working Professional,240,229
92,89290,25610,5,10,40,Female,Working Professional,240,229
285,89483,19113,5,10,15,Female,Student,240,229
362,89560,15928,5,10,25,Female,Other,240,229
371,89569,8313,5,10,25,Male,Other,240,229
...,...,...,...,...,...,...,...,...,...
10901,100099,5556,5,10,22,Female,Working Professional,240,229
11013,100211,8865,5,10,32,Female,Working Professional,240,229
11039,100237,18756,5,10,35,Male,Other,240,229
11050,100248,4483,5,10,36,Female,Other,240,229


In [532]:
raw_test[raw_test['video_id']==1].sort_values(by='user_id')

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
56,89254,16,19,1,16,Female,Student,230,369
5032,94230,78,1,1,18,Male,Student,360,990
9168,98366,362,19,1,31,Male,Other,230,369
9474,98672,1041,19,1,16,Female,Student,230,369
5133,94331,1080,19,1,26,Male,Other,230,369
...,...,...,...,...,...,...,...,...,...
8440,97638,27347,1,1,32,Male,Other,360,990
7058,96256,27390,1,1,13,Female,Student,360,990
1418,90616,27479,19,1,36,Female,Working Professional,230,369
9950,99148,27501,19,1,14,Female,Student,230,369


In [533]:
raw_train[raw_train['video_id']==1].sort_values(by='user_id')

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
30684,30685,1,1,1,18,Female,Student,360,990,2.88
82236,82237,10,1,1,26,Female,Student,360,990,2.70
1105,1106,36,19,1,15,Male,Student,230,369,4.25
10818,10819,51,19,1,18,Female,Student,230,369,4.29
43691,43692,56,1,1,33,Female,Other,360,990,2.30
...,...,...,...,...,...,...,...,...,...,...
68490,68491,27663,1,1,37,Male,Working Professional,360,990,3.32
51117,51118,27669,19,1,18,Male,Student,230,369,4.21
34829,34830,27691,19,1,30,Male,Other,230,369,3.81
75844,75845,27696,1,1,22,Female,Other,360,990,4.25


In [534]:
raw_train.groupby(['video_id'])[['followers','views','category_id']].std()

Unnamed: 0_level_0,followers,views,category_id
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,64.827241,309.674744,8.97608
2,0.000000,0.000000,0.00000
3,0.000000,0.000000,0.00000
4,0.000000,0.000000,0.00000
5,0.000000,0.000000,0.00000
...,...,...,...
171,0.000000,0.000000,0.00000
172,0.000000,0.000000,0.00000
173,0.000000,0.000000,0.00000
174,0.000000,0.000000,0.00000


In [535]:
raw_train.groupby(['user_id'])[['followers','views','category_id']].std()

Unnamed: 0_level_0,followers,views,category_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,80.829038,554.256258,0.577350
2,21.908902,87.635609,0.547723
3,17.320508,76.787586,0.577350
4,115.470054,301.954191,3.464102
5,28.867513,110.273901,0.577350
...,...,...,...
27730,57.735027,72.746134,1.154701
27731,14.142136,144.249783,1.414214
27732,0.000000,0.000000,0.000000
27733,34.641016,0.000000,2.886751


In [536]:
raw_train['video_id'].nunique(),raw_test['video_id'].nunique()

(175, 128)

In [537]:
common_video_id = set(raw_train['video_id'].unique()).intersection(set(raw_test['video_id'].unique()))
len(common_video_id)

128

In [538]:
raw_train.groupby(['video_id'])['category_id'].nunique().describe()

count    175.000000
mean       1.005714
std        0.075593
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: category_id, dtype: float64

In [539]:
raw_train.groupby(['user_id'])['profession'].nunique().describe()

count    27734.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: profession, dtype: float64

In [540]:
df = pd.concat([raw_train,raw_test],axis=0,ignore_index=True,sort=False)
df.shape[0] == raw_train.shape[0]+raw_test.shape[0]

True

In [541]:
df.sample(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
46908,46909,26122,18,59,28,Male,Student,180,444,4.66
91840,91841,1656,32,145,35,Female,Other,330,714,
91268,91269,10273,31,152,16,Male,Student,230,156,


In [542]:
df['category_id'].nunique(),df['video_id'].nunique()

(47, 175)

In [543]:
def cv(x):
    return np.mean(x)/np.mean(0.001+np.std(x))

In [544]:
user_profile = df.groupby(['user_id']).agg({'category_id':pd.Series.nunique,
                                            'video_id':pd.Series.nunique,
                                            'views':['mean','sum','std','count'],
                                            'followers':['mean','sum','std'],
                                            # 'engagement_score': cv,
                                            })

user_profile.columns = ["_".join(x) for x in user_profile.columns]
user_profile =user_profile.add_suffix("_usr")
user_profile = user_profile.reset_index()
user_profile.describe()

Unnamed: 0,user_id,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr
count,27734.0,27734.0,27734.0,27734.0,27734.0,27734.0,27734.0,27734.0,27734.0,27734.0
mean,13867.5,1.798875,3.617149,509.63252,1800.098832,128.891739,3.617149,251.409215,912.075791,21.791368
std,8006.260519,0.589336,1.451445,222.416486,955.171705,133.947188,1.451445,39.179037,390.636733,24.240826
min,1.0,1.0,2.0,44.0,88.0,0.0,2.0,160.0,320.0,0.0
25%,6934.25,1.0,3.0,329.333333,1182.0,0.0,3.0,230.0,650.0,0.0
50%,13867.5,2.0,3.0,514.333333,1753.0,92.953393,3.0,243.333333,840.0,17.320508
75%,20800.75,2.0,4.0,692.333333,2287.0,209.000797,4.0,276.0,1127.5,34.641016
max,27734.0,3.0,12.0,1000.0,8190.0,678.82251,12.0,360.0,3220.0,141.421356


In [545]:
user_profile.sample(3)

Unnamed: 0,user_id,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr
4915,4916,3,5,499.8,2499,383.534483,5,266.0,1330,49.29503
973,974,1,2,819.0,1638,0.0,2,230.0,460,0.0
6871,6872,2,4,548.25,2193,159.5,4,290.0,1160,20.0


In [546]:
df.groupby(['user_id'])['age'].std().max()

0.0

In [547]:
df.groupby(['user_id'])['followers'].std().max()

141.4213562373095

In [548]:
cat_id = df.groupby(['category_id']).agg({'user_id':pd.Series.nunique,
                                          'video_id':pd.Series.nunique,
                                          'views':['mean','sum','count'],
                                          'followers':['mean','sum'],
                                        #   'engagement_score': cv,
                                          })

cat_id.columns = ["_".join(x) for x in cat_id.columns]
cat_id = cat_id.add_suffix("_cat")
cat_id = cat_id.reset_index()

cat_id.describe()


Unnamed: 0,category_id,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat
count,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0
mean,24.0,1061.489362,3.744681,516.425532,1062212.0,2134.425532,247.87234,538202.3
std,13.711309,730.989469,4.613157,299.915767,1114549.0,2078.421169,45.105488,534722.8
min,1.0,47.0,1.0,30.0,5010.0,47.0,160.0,12220.0
25%,12.5,529.0,2.0,238.5,236007.0,546.5,220.0,135195.0
50%,24.0,869.0,2.0,613.0,626184.0,1321.0,240.0,327040.0
75%,35.5,1588.5,4.5,793.5,1656089.0,2792.0,270.0,793860.0
max,47.0,3043.0,31.0,1000.0,4828064.0,9593.0,360.0,2302320.0


In [549]:
cat_id.sample(4)

Unnamed: 0,category_id,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat
7,8,3043,10,628.0,4828064,7688,280.0,2152640
46,47,47,1,248.0,11656,47,260.0,12220
30,31,739,7,156.0,369564,2369,230.0,544870
39,40,458,1,900.0,412200,458,250.0,114500


In [550]:
video_id = df.groupby(['video_id']).agg({'user_id':pd.Series.nunique,
                                         'views':['mean','sum','count'],
                                         'followers':['mean','sum'],
                                        #  'engagement_score': cv,
                                         })
video_id.columns = ["_".join(x) for x in video_id.columns]
video_id = video_id.add_suffix("_vid")
video_id = video_id.reset_index()


video_id.describe()

Unnamed: 0,video_id,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,88.0,573.245714,462.959389,285279.662857,573.245714,248.894282,144545.771429
std,50.662281,293.52797,282.121367,227969.966162,293.52797,42.481791,79718.247823
min,1.0,47.0,30.0,4928.0,47.0,160.0,12220.0
25%,44.5,313.0,229.0,51558.0,313.0,230.0,76015.0
50%,88.0,642.0,414.0,254848.0,642.0,240.0,144900.0
75%,131.5,787.0,689.0,476652.0,787.0,270.0,207500.0
max,175.0,1514.0,1000.0,967284.0,1514.0,360.0,433760.0


In [551]:
video_id.sample(3)

Unnamed: 0,video_id,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid
61,62,726,317.0,230142,726,240.0,174240
138,139,720,317.0,228240,720,240.0,172800
155,156,338,52.0,17576,338,260.0,87880


In [552]:
usr_cat = df.groupby(['user_id','category_id']).agg({'video_id':pd.Series.nunique,
                                                     'views':['mean','sum','count'],
                                                     'followers':['mean','sum',],
                                                    # 'engagement_score': cv,
                                                     })
usr_cat.columns = ["_".join(x) for x in usr_cat.columns]
usr_cat = usr_cat.add_suffix("_usr_cat")
usr_cat = usr_cat.reset_index()


usr_cat.describe()

Unnamed: 0,user_id,category_id,video_id_nunique_usr_cat,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat
count,49890.0,49890.0,49890.0,49890.0,49890.0,49890.0,49890.0,49890.0
mean,13860.321227,19.483664,2.010784,520.871177,1000.680317,2.010784,253.863299,507.025656
std,8003.431271,11.855247,1.0825,269.311005,666.77101,1.0825,47.628561,278.288983
min,1.0,1.0,1.0,30.0,30.0,1.0,160.0,160.0
25%,6931.25,9.0,1.0,309.0,462.0,1.0,230.0,280.0
50%,13844.0,18.0,2.0,613.0,884.0,2.0,240.0,460.0
75%,20792.75,28.0,2.0,714.0,1378.0,2.0,280.0,660.0
max,27734.0,47.0,11.0,1000.0,4200.0,11.0,360.0,2640.0


In [553]:
vid_cat = df.groupby(['video_id','category_id']).agg({'user_id':pd.Series.nunique,
                                                     'views':['mean','sum','count'],
                                                     'followers':['mean','sum',],
                                                    # 'engagement_score': cv,
                                                     })
vid_cat.columns = ["_".join(x) for x in vid_cat.columns]
vid_cat = vid_cat.add_suffix("_usr_cat")
vid_cat = vid_cat.reset_index()


vid_cat.describe()

Unnamed: 0,video_id,category_id,user_id_nunique_usr_cat,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat
count,176.0,176.0,176.0,176.0,176.0,176.0,176.0,176.0
mean,87.505682,18.676136,569.988636,464.420455,283658.755682,569.988636,249.204545,143724.488636
std,50.941198,12.955868,284.774156,283.894959,223123.732137,284.774156,43.113713,76840.120865
min,1.0,1.0,47.0,30.0,4928.0,47.0,160.0,12220.0
25%,43.75,6.75,315.0,229.0,51831.0,315.0,230.0,76417.5
50%,87.5,16.0,643.0,414.0,257171.0,643.0,240.0,145425.0
75%,131.25,29.25,786.5,689.0,470372.0,786.5,270.0,207250.0
max,175.0,47.0,1337.0,1000.0,819581.0,1337.0,360.0,352960.0


In [554]:
all_cat = df.groupby(['user_id','video_id','category_id']).agg({
                                                        'views':['mean','sum','count'],
                                                     'followers':['mean','sum',],
                                                     #'engagement_score': cv,
                                                     })
all_cat.columns = ["_".join(x) for x in all_cat.columns]
all_cat = all_cat.add_suffix("_usr_cat")
all_cat = all_cat.reset_index()


all_cat.describe()

Unnamed: 0,user_id,video_id,category_id,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat
count,100318.0,100318.0,100318.0,100318.0,100318.0,100318.0,100318.0,100318.0
mean,13875.67935,77.94011,18.029157,497.656861,497.656861,1.0,252.153253,252.153253
std,8005.079041,48.499456,11.562197,266.974474,266.974474,0.0,45.32458,45.32458
min,1.0,1.0,1.0,30.0,30.0,1.0,160.0,160.0
25%,6938.25,35.0,8.0,229.0,229.0,1.0,230.0,230.0
50%,13889.0,76.0,16.0,467.0,467.0,1.0,240.0,240.0
75%,20813.0,121.0,26.0,709.0,709.0,1.0,280.0,280.0
max,27734.0,175.0,47.0,1000.0,1000.0,1.0,360.0,360.0


In [555]:
raw_train.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35


In [556]:
featured_df = df.copy()
featured_df = featured_df.merge(user_profile,how='left',on=['user_id'])
featured_df = featured_df.merge(cat_id,how='left',on=['category_id'])
featured_df = featured_df.merge(video_id,how='left',on=['video_id'])
featured_df = featured_df.merge(usr_cat,how='left',on=['user_id','category_id'])
featured_df = featured_df.merge(vid_cat,how='left',on=['video_id','category_id'])
featured_df = featured_df.merge(all_cat,how='left',on=['user_id','category_id','video_id'])

featured_df.head(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat
0,1,19990,37,128,24,Male,Student,180,1000,4.33,2,3,744.666667,2234,221.125153,3,246.666667,740,57.735027,635,2,1000.0,962000,962,180.0,173160,503,1000.0,503000,503,180.0,90540,1,1000.0,1000,1,180.0,180,503,1000.0,503000,503,180.0,90540,1000.0,1000,1,180.0,180
1,2,5304,32,132,14,Female,Student,330,714,1.79,3,5,602.8,3014,130.273558,5,308.0,1540,34.928498,1507,4,714.0,2239818,3137,330.0,1035210,807,714.0,576198,807,330.0,266310,2,714.0,1428,2,330.0,660,807,714.0,576198,807,330.0,266310,714.0,714,1,330.0,330
2,3,1840,12,24,19,Male,Student,180,138,4.35,1,4,138.0,552,0.0,4,180.0,720,0.0,2020,6,138.0,608856,4412,180.0,794160,644,138.0,88872,644,180.0,115920,4,138.0,552,4,180.0,720,644,138.0,88872,644,180.0,115920,138.0,138,1,180.0,180


In [557]:
featured_df['profession'].unique()

array(['Student', 'Working Professional', 'Other'], dtype=object)

In [558]:
featured_df['male'] = (featured_df['gender'] == 'Male').astype(int)
featured_df['student'] = (featured_df['profession'] == 'Student').astype(int)
featured_df['working'] = (featured_df['profession'] == 'Working Professional').astype(int)
featured_df.sample(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat,male,student,working
26381,26382,26302,40,124,24,Male,Student,250,900,4.56,2,3,511.333333,1534,336.595207,3,243.333333,730,5.773503,458,1,900.0,412200,458,250.0,114500,458,900.0,412200,458,250.0,114500,1,900.0,900,1,250.0,250,458,900.0,412200,458,250.0,114500,900.0,900,1,250.0,250,1,1,0
43430,43431,15477,19,45,17,Male,Student,230,369,4.23,3,6,392.833333,2357,39.726146,6,210.0,1260,24.494897,2258,7,369.0,2130975,5775,230.0,1328250,908,369.0,335052,908,230.0,208840,3,369.0,1107,3,230.0,690,908,369.0,335052,908,230.0,208840,369.0,369,1,230.0,230,1,1,0
55638,55639,3932,22,52,32,Female,Working Professional,260,709,2.24,2,3,695.666667,2087,11.547005,3,280.0,840,17.320508,366,1,709.0,259494,366,260.0,95160,366,709.0,259494,366,260.0,95160,1,709.0,709,1,260.0,260,366,709.0,259494,366,260.0,95160,709.0,709,1,260.0,260,0,0,1


In [559]:
train = featured_df[featured_df['engagement_score'].notna()].copy()
test = featured_df[featured_df['engagement_score'].isna()].copy()
train.shape[0] == raw_train.shape[0], test.shape[0] == raw_test.shape[0]

(True, True)

In [560]:
train.sample(3)

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat,male,student,working
38081,38082,27071,5,89,25,Male,Working Professional,240,229,4.01,1,9,229.0,2061,0.0,9,240.0,2160,0.0,2638,31,229.0,2196797,9593,240.0,2302320,201,229.0,46029,201,240.0,48240,9,229.0,2061,9,240.0,2160,201,229.0,46029,201,240.0,48240,229.0,229,1,240.0,240,1,0,1
4594,4595,21516,13,27,22,Female,Working Professional,340,628,2.06,2,4,383.0,1532,282.901632,4,260.0,1040,92.376043,1674,5,628.0,1829364,2913,340.0,990420,616,628.0,386848,616,340.0,209440,2,628.0,1256,2,340.0,680,616,628.0,386848,616,340.0,209440,628.0,628,1,340.0,340,0,0,1
22147,22148,3596,16,137,15,Male,Student,270,462,4.25,3,7,384.0,2688,214.347848,7,280.0,1960,26.457513,1670,5,462.0,1833216,3968,270.0,1071360,779,462.0,359898,779,270.0,210330,4,462.0,1848,4,270.0,1080,779,462.0,359898,779,270.0,210330,462.0,462,1,270.0,270,1,1,0


In [565]:
drop_col = [
            'row_id',
            # 'user_id','category_id','video_id',
            'gender','profession',
            #'usr_rank',
            #'cat_rank','video_rank',
]

In [566]:
sub_train = train.drop(drop_col,axis=1)
sub_train.head(3)

Unnamed: 0,user_id,category_id,video_id,age,followers,views,engagement_score,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat,male,student,working
0,19990,37,128,24,180,1000,4.33,2,3,744.666667,2234,221.125153,3,246.666667,740,57.735027,635,2,1000.0,962000,962,180.0,173160,503,1000.0,503000,503,180.0,90540,1,1000.0,1000,1,180.0,180,503,1000.0,503000,503,180.0,90540,1000.0,1000,1,180.0,180,1,1,0
1,5304,32,132,14,330,714,1.79,3,5,602.8,3014,130.273558,5,308.0,1540,34.928498,1507,4,714.0,2239818,3137,330.0,1035210,807,714.0,576198,807,330.0,266310,2,714.0,1428,2,330.0,660,807,714.0,576198,807,330.0,266310,714.0,714,1,330.0,330,0,1,0
2,1840,12,24,19,180,138,4.35,1,4,138.0,552,0.0,4,180.0,720,0.0,2020,6,138.0,608856,4412,180.0,794160,644,138.0,88872,644,180.0,115920,4,138.0,552,4,180.0,720,644,138.0,88872,644,180.0,115920,138.0,138,1,180.0,180,1,1,0


In [563]:
# corr = sub_train.corr().round(2)
# plt.figure(figsize=(15,8))
# sns.heatmap(corr,annot=False)

In [567]:
x_train = sub_train.drop(['engagement_score'],axis=1)
y_train = sub_train['engagement_score']

scaler = StandardScaler()
train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(train_scaled,index=x_train.index,columns=x_train.columns)
x_train.head(3)

Unnamed: 0,user_id,category_id,video_id,age,followers,views,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat,male,student,working
0,0.762983,1.599668,1.037451,-0.094759,-1.572002,1.850629,0.127028,-0.662592,1.140598,0.242153,0.612823,-0.662592,-0.15567,-0.674876,1.44758,-1.371838,-0.647159,1.850629,-0.658821,-1.142798,-1.572002,-1.240651,-0.808649,1.870338,0.589774,-0.808649,-1.595095,-1.238268,-1.016455,1.850629,-0.199301,-1.016455,-1.572002,-1.22002,-0.822112,1.850629,0.651489,-0.822112,-1.572002,-1.309602,1.850629,1.850629,0.0,-1.572002,-1.572002,0.838051,0.999115,-0.49789
1,-1.071497,1.171406,1.119977,-1.211393,1.682203,0.785722,1.774618,0.543186,0.473025,1.010811,-0.079251,0.543186,1.544197,1.153583,0.481294,-0.223567,-0.395254,0.785722,0.322796,-0.315185,1.682203,0.053194,0.329429,0.794599,0.913102,0.329429,1.709164,1.150807,-0.324912,0.785722,0.404606,-0.324912,1.682203,0.117317,0.395625,0.785722,0.990522,0.395625,1.682203,1.307286,0.785722,0.785722,0.0,1.682203,1.682203,-1.193244,0.999115,-0.49789
2,-1.504197,-0.541643,-1.108233,-0.653076,-1.572002,-1.358987,-1.520563,-0.059703,-1.714156,-1.41539,-1.071628,-0.059703,-2.003352,-0.720588,-0.998587,0.451963,-0.143349,-1.358987,-0.930105,0.169968,-1.572002,-0.308596,-0.280791,-1.371923,-1.239499,-0.280791,-1.595095,-0.893302,1.058174,-1.358987,-0.831428,1.058174,-1.572002,0.284484,-0.257306,-1.358987,-1.266639,-0.257306,-1.572002,-0.931741,-1.358987,-1.358987,0.0,-1.572002,-1.572002,0.838051,0.999115,-0.49789


In [568]:
r2_scorer = make_scorer(r2_score)

In [569]:
def cross_validation(x_train,y_train,model,model_name,cv=5,scoring=r2_scorer):
    scores = cross_val_score(model, x_train, y_train, cv=5,scoring=scoring)
    print(f"{model_name}: mean r2_score {scores.mean():.2f} with a standard deviation of {scores.std():.2f}")
    return

In [570]:
dummy_model = DummyRegressor()
cross_validation(x_train,y_train,dummy_model,'Dummy Model')

Dummy Model: mean r2_score -0.00 with a standard deviation of 0.00


In [571]:
linear_model = LinearRegression()
cross_validation(x_train,y_train,linear_model,'Linear Model')

Linear Model: mean r2_score 0.29 with a standard deviation of 0.01


In [572]:
# svm_model = SVR(kernel='poly',degree=1)
# cross_validation(x_train,y_train,svm_model,'SVM Model')

In [573]:
knn_model = KNeighborsRegressor()
cross_validation(x_train,y_train,knn_model,'KNN Model')

KNN Model: mean r2_score 0.30 with a standard deviation of 0.00


In [574]:
rf_model =  RandomForestRegressor(max_depth=9, random_state=100)
cross_validation(x_train,y_train,rf_model,'Random Forest Model')

Random Forest Model: mean r2_score 0.37 with a standard deviation of 0.01


In [575]:
ada_boost_model =  AdaBoostRegressor(random_state=100)
cross_validation(x_train,y_train,ada_boost_model,'AdaBoot')

AdaBoot: mean r2_score 0.23 with a standard deviation of 0.02


In [576]:
gbr_model =  GradientBoostingRegressor(max_depth=5, random_state=100,subsample=0.9,learning_rate=0.05)
cross_validation(x_train,y_train,gbr_model,'Gradient boosting')

Gradient boosting: mean r2_score 0.36 with a standard deviation of 0.01


In [577]:
gbr_model =  HistGradientBoostingRegressor(max_depth=7, random_state=100,learning_rate=0.05)
cross_validation(x_train,y_train,gbr_model,'Gradient boosting')

Gradient boosting: mean r2_score 0.37 with a standard deviation of 0.01


In [578]:
xgb_model =  xgb.XGBRegressor(n_estimators=100,max_depth=11, random_state=100,learning_rate=0.05,
                              subsample=1,colsample_bytree =1,objective='reg:squarederror')
cross_validation(x_train,y_train,xgb_model,'XGB')

XGB: mean r2_score 0.41 with a standard deviation of 0.00


In [579]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.05,
    'feature_fraction': 0.95,
    'bagging_fraction': 0.95,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 15,
    "num_leaves": 128,  
    "max_bin": 512,
    "n_estimators": 200,
    "random_state":345
}
lgb_model = lgb.LGBMRegressor(**hyper_params)
cross_validation(x_train,y_train,lgb_model,'LGB')

LGB: mean r2_score 0.40 with a standard deviation of 0.00


In [580]:
model = xgb_model.fit(x_train,y_train)

In [581]:
x_test = test[x_train.columns].copy()
scaled_test = scaler.transform(x_test)
x_test = pd.DataFrame(scaled_test,index=x_test.index,columns=x_test.columns)
x_test.head(3)

Unnamed: 0,user_id,category_id,video_id,age,followers,views,category_id_nunique_usr,video_id_nunique_usr,views_mean_usr,views_sum_usr,views_std_usr,views_count_usr,followers_mean_usr,followers_sum_usr,followers_std_usr,user_id_nunique_cat,video_id_nunique_cat,views_mean_cat,views_sum_cat,views_count_cat,followers_mean_cat,followers_sum_cat,user_id_nunique_vid,views_mean_vid,views_sum_vid,views_count_vid,followers_mean_vid,followers_sum_vid,video_id_nunique_usr_cat,views_mean_usr_cat_x,views_sum_usr_cat_x,views_count_usr_cat_x,followers_mean_usr_cat_x,followers_sum_usr_cat_x,user_id_nunique_usr_cat,views_mean_usr_cat_y,views_sum_usr_cat_y,views_count_usr_cat_y,followers_mean_usr_cat_y,followers_sum_usr_cat_y,views_mean_usr_cat,views_sum_usr_cat,views_count_usr_cat,followers_mean_usr_cat,followers_sum_usr_cat,male,student,working
89197,-0.736479,-0.541643,-0.736865,-1.211393,-1.572002,-1.358987,-1.520563,-0.662592,-1.714156,-1.551384,-1.071628,-0.662592,-2.003352,-1.131991,-0.998587,0.451963,-0.143349,-1.358987,-0.930105,0.169968,-1.572002,-0.308596,1.916749,-1.371923,-0.881682,1.916749,-1.595095,0.542835,0.366631,-1.358987,-1.026146,0.366631,-1.572002,-0.217017,2.094049,-1.358987,-0.891441,2.094049,-1.572002,0.641339,-1.358987,-1.358987,0.0,-1.572002,-1.572002,0.838051,0.999115,-0.49789
89198,-0.325264,1.342711,0.769241,-1.211393,-0.487267,1.254877,1.774618,1.748965,1.145527,3.184735,-0.38534,1.748965,1.084916,2.29637,1.440581,0.002926,-0.269302,1.254877,1.169766,0.005206,-0.487267,-0.126973,0.258299,1.268526,1.29175,0.258299,-0.493675,-0.005469,0.366631,1.254877,1.945417,0.366631,-0.487267,0.200901,0.319517,1.254877,1.387562,0.319517,-0.487267,0.040752,1.254877,1.254877,0.0,-0.487267,-0.487267,0.838051,0.999115,-0.49789
89199,0.420095,-0.884253,0.666083,2.138509,0.597468,0.465505,0.127028,1.146076,0.591607,1.753848,-1.071628,1.146076,1.322476,1.747832,0.314164,1.799074,0.360461,0.465505,2.31108,1.416525,0.597468,1.730337,0.074859,0.471126,0.417911,0.074859,0.607744,0.343575,1.058174,0.465505,1.934129,1.058174,0.597468,1.398932,0.123237,0.465505,0.471277,0.123237,0.597468,0.423079,0.465505,0.465505,0.0,0.597468,0.597468,-1.193244,-1.000886,2.008476


In [582]:
x_test['engagement_score'] = model.predict(x_test)
x_test['engagement_score'].head(3)

89197    4.114050
89198    3.691869
89199    2.631723
Name: engagement_score, dtype: float32

In [584]:
test['engagement_score'] = x_test['engagement_score']
test[['row_id','engagement_score']].to_csv("submission_3_xgb.csv",index=False)