In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, scale
import matplotlib.pyplot as plt

In [2]:
# Load data
df = pd.read_csv('./USvideos_dateFormatted.csv')
print(df.shape)
df.head(3)

(40949, 22)


Unnamed: 0,video_id,trending_date,title,channel_title,publish_date,category_id,publish_time,tags,views,likes,...,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,hour,min,sec,views in hour
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,2017-11-13,22,17:13:01,SHANtell martin,748374,57527,...,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,People & Blogs,17,13,1,1603781.0
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,2017-11-13,24,07:30:00,"last week tonight trump presidency|""last week ...",2418783,97185,...,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Entertainment,7,30,0,4147581.0
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,2017-11-12,23,19:05:24,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,...,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,Comedy,19,5,24,1467202.0


In [3]:
df = df[['views','dislikes','comment_count','likes']]

In [4]:
features = df[['views','dislikes','comment_count']]
target = df.likes

In [5]:
# creats extra features
poly = PolynomialFeatures(2,interaction_only=True,include_bias = False)
features_e = poly.fit_transform(features)
cols = poly.get_feature_names(features.columns)
features_e = pd.DataFrame(features_e,columns=cols)
print('before:   features shape',features.shape)
print('after： features_e shape',features_e.shape)
features_e.head(3)

before:   features shape (40949, 3)
after： features_e shape (40949, 6)


Unnamed: 0,views,dislikes,comment_count,views dislikes,views comment_count,dislikes comment_count
0,748374.0,2966.0,15954.0,2219677000.0,11939560000.0,47319564.0
1,2418783.0,6146.0,12703.0,14865840000.0,30725800000.0,78072638.0
2,3191434.0,5339.0,8181.0,17039070000.0,26109120000.0,43678359.0


In [6]:
# standardize features
features_e = (features_e - features_e.mean())/features_e.std()
features_e.head(3)

Unnamed: 0,views,dislikes,comment_count,views dislikes,views comment_count,dislikes comment_count
0,-0.218067,-0.025677,0.200564,-0.040953,-0.056854,-0.027302
1,0.007844,0.083866,0.113709,-0.036153,-0.050885,-0.026175
2,0.112339,0.056067,-0.007101,-0.035328,-0.052352,-0.027436


In [7]:
df = features_e.copy()
df['likes'] = target
df.head(3)

Unnamed: 0,views,dislikes,comment_count,views dislikes,views comment_count,dislikes comment_count,likes
0,-0.218067,-0.025677,0.200564,-0.040953,-0.056854,-0.027302,57527
1,0.007844,0.083866,0.113709,-0.036153,-0.050885,-0.026175,97185
2,0.112339,0.056067,-0.007101,-0.035328,-0.052352,-0.027436,146033


In [8]:
(df_train,df_test)  = train_test_split(df,
                                 train_size=0.8,
                                 test_size=0.2,
                                 random_state=0)

In [9]:
features_train = df_train.iloc[:,0:-1]
features_test  = df_test.iloc[:,0:-1]
target_train  = df_train.iloc[:,-1]
target_test   = df_test.iloc[:,-1]

In [10]:
# standardize features
features_train = (features_train - features_train.mean())/features_train.std()
features_test = (features_test - features_test.mean())/features_test.std()
features_train.head(3)

Unnamed: 0,views,dislikes,comment_count,views dislikes,views comment_count,dislikes comment_count
29635,-0.232314,0.999858,0.061187,-0.03222,-0.058308,-0.01673
31589,-0.137138,-0.065352,-0.102288,-0.039268,-0.058579,-0.029779
18135,-0.221837,-0.105902,-0.139659,-0.039976,-0.059812,-0.030015


In [11]:
# create bias column of ones
features_train['bias'] =1
features_test['bias'] = 1
features_train.head(3)

Unnamed: 0,views,dislikes,comment_count,views dislikes,views comment_count,dislikes comment_count,bias
29635,-0.232314,0.999858,0.061187,-0.03222,-0.058308,-0.01673,1
31589,-0.137138,-0.065352,-0.102288,-0.039268,-0.058579,-0.029779,1
18135,-0.221837,-0.105902,-0.139659,-0.039976,-0.059812,-0.030015,1


# Linear Regression

In [12]:
lr = LinearRegression()

In [13]:
# coefficients
lr.fit(features_train,target_train)
predict = lr.predict(features_test)

In [16]:
compare = pd.DataFrame({'Actual': target_test, 'Predicted': predict})
compare

Unnamed: 0,Actual,Predicted
8319,40908,50270.059276
8679,690283,459378.476533
30645,60849,134373.113235
777,10,4710.427671
5386,14,4858.307026
...,...,...
10767,239,5086.346236
26439,11686,85297.606260
11551,219626,127429.866214
9048,860,5862.906656


In [17]:
compare['difference'] = (compare.Actual - compare.Predicted).abs()

In [18]:
compare.sort_values(by=['difference'],ascending=True)

Unnamed: 0,Actual,Predicted,difference
7771,21281,2.127416e+04,6.844153e+00
4510,7570,7.580445e+03,1.044510e+01
22519,223803,2.237885e+05,1.454704e+01
23133,22442,2.245784e+04,1.584414e+01
4326,19041,1.902236e+04,1.863914e+01
...,...,...,...
39814,2488565,1.588021e+06,9.005444e+05
39191,2178332,1.260209e+06,9.181235e+05
39398,2280497,1.361692e+06,9.188054e+05
10000,835378,1.804518e+06,9.691400e+05
