In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
import matplotlib.pyplot as plt

In [2]:
# Read the USvideos and CAvideos CSV files, report each frame's shape,
# and preview the first three US rows.
df_US = pd.read_csv('./USvideos.csv')
df_CA = pd.read_csv('./CAvideos.csv')
print(df_US.shape, df_CA.shape, sep='\n')
df_US.head(3)

(40949, 16)
(40881, 16)


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...


In [3]:
# Keep only the numeric engagement columns; the US frame is the training
# set and the CA frame is the test set. Selecting before copying avoids
# duplicating the columns that are discarded anyway.
df_train = df_US.loc[:, ['views', 'likes', 'dislikes', 'comment_count']].copy()
df_test = df_CA.loc[:, ['views', 'likes', 'dislikes', 'comment_count']].copy()

In [4]:
# 'likes' is the regression target; every remaining column is a predictor.
target_train = df_train['likes']
target_test = df_test['likes']
features_train = df_train.drop(columns='likes')
features_test = df_test.drop(columns='likes')

In [5]:
# creates extra features to increase the accuracy of prediction
# poly = PolynomialFeatures(2,interaction_only=True,include_bias = False)
# features_e_train = poly.fit_transform(features_train)
# features_e_test = poly.fit_transform(features_test)
# cols_train = poly.get_feature_names(features_train.columns)
# cols_test = poly.get_feature_names(features_test.columns)
# features_e_train = pd.DataFrame(features_e_train,columns=cols_train)
# features_e_test = pd.DataFrame(features_e_test,columns=cols_test)
# print('before:   features_train shape',features_train.shape)
# print('after： features_e_train shape',features_e_train.shape)
# print('before:   features_test shape',features_test.shape)
# print('after： features_e_test shape',features_e_test.shape)
# features_e_train.head(3)

In [6]:
# # standardize features
# features_e_train = (features_e_train - features_e_train.mean())/features_e_train.std()
# features_e_test = (features_e_test - features_e_test.mean())/features_e_test.std()
# features_e_train.head(3)
# features_train = (features_train - features_train.mean())/features_train.std()
# features_test = (features_test - features_test.mean())/features_test.std()
# features_train.head(3)
# standardize
# The scaling statistics are computed on the training set only and then
# applied to both sets. Scaling each set with its own mean/std would map the
# same raw value to different standardized values in train and test, which
# distorts the neighbour distances KNN relies on and leaks test statistics
# into preprocessing.
train_values = np.asarray(features_train, dtype=float)
test_values = np.asarray(features_test, dtype=float)
train_mean = train_values.mean(axis=0)
train_std = train_values.std(axis=0)   # population std (ddof=0), as sklearn's scale() uses
train_std[train_std == 0] = 1.0        # constant column: leave it centred instead of dividing by 0
features_train = (train_values - train_mean) / train_std
features_test = (test_values - train_mean) / train_std

In [7]:
# create bias column of ones
# features_e_train['bias'] =1
# features_e_test['bias'] = 1
# features_e_train.head(3)

# KNN Regression

In [None]:
# Sweep n_neighbors over 1..1000, fitting a fresh KNN regressor for each
# value and recording its R^2 on the training and the test features.
# NOTE: this refits and scores 1000 models, so the cell is slow to run.
num_neighbors, R2_train, R2_test = [], [], []
for K in np.arange(1, 1001):
    knn = KNeighborsRegressor(n_neighbors=K).fit(features_train, target_train)
    num_neighbors.append(K)
    R2_train.append(knn.score(features_train, target_train))
    R2_test.append(knn.score(features_test, target_test))

# One row per K, columns in the order they are plotted/reported later.
errors = pd.DataFrame({
    'num_neighbors': num_neighbors,
    'train R2': R2_train,
    'test R2': R2_test,
})
errors.head(10)

In [None]:
# Draw the train and test R^2 curves on shared axes against n_neighbors.
ax1 = errors.plot.line(x='num_neighbors', y=['train R2', 'test R2'])
ax1.set_ylabel('R2')

In [None]:
# Locate the row with the highest test R^2, then report that score and
# the n_neighbors value recorded for it.
ix = errors['test R2'].idxmax()
print('      Max test R-sqr = ', round(errors['test R2'].max(), 2))
print('optimal n_neighbors =', errors.loc[ix, 'num_neighbors'])

In [None]:
# Refit KNN with n_neighbors=7 on the standardized training features.
# The previous `features_e_train` name is never defined in this notebook
# (the polynomial-feature cell that would create it is commented out), so it
# raised NameError on a fresh kernel; `features_train` is the matrix the
# n_neighbors sweep was run on.
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(features_train, target_train)

In [None]:
# Predict likes for the test set from the standardized test features.
# `features_e_test` is never defined in this notebook (the cell that would
# build it is commented out); `features_test` is the matrix the test R^2
# scores were computed on.
predict = knn.predict(features_test)

In [None]:
# Store the predictions, rounded to whole likes, next to the original CA rows.
df_CA['predict_likes'] = np.round(predict, 0)

In [None]:
# Show the column labels of df_CA (the preceding cell adds 'predict_likes').
df_CA.columns

In [None]:
# Build a comparison table: identifying columns plus actual and predicted
# likes. Columns are selected first and only that subset is copied.
df_compare = df_CA.loc[:, [
    'video_id', 'title', 'channel_title', 'publish_time', 'thumbnail_link',
    'likes', 'predict_likes', 'dislikes', 'comment_count', 'category_id',
]].copy()
df_compare.head(3)

In [None]:
# Absolute prediction error per video: |likes - predict_likes|.
df_compare['difference'] = df_compare['likes'].sub(df_compare['predict_likes']).abs()

## Top 10 predictions with the lowest difference

In [None]:
# Ten rows with the smallest absolute error (ascending sort is the default).
df_compare.sort_values('difference').iloc[:10]

## Top 10 predictions with the highest difference

In [None]:
# Ten rows with the largest absolute error.
df_compare.sort_values('difference', ascending=False).iloc[:10]

### The reasons behind the inaccuracy might be that:
1. We have poor features: the features we used are not correlated strongly enough with the value we are trying to predict, which is the number of likes in our experiment.
2. We need more training data. Additional training data should help us predict the target more accurately.