# Part 2 - Implementation 

In [1]:
import pandas as  pd
import numpy as np

from sklearn.model_selection import train_test_split as TTS2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
mov_df = pd.read_csv('movie_genres.csv').drop("Unnamed: 0",axis=1)
mov_df['movie_id']= pd.factorize(mov_df['movie_title'])[0]

rev_df = pd.read_csv('user_reviews.csv')
rev_df['user_id']= pd.factorize(rev_df['User'])[0]

df_melt = rev_df.melt(id_vars=['user_id','User'], value_vars=rev_df.columns.difference(['User','user_id']))
df_melt.rename(columns = {'variable':'movie_title'}, inplace=True)

df_merge = pd.merge(df_melt, mov_df, on='movie_title')#.drop('movie_title',axis=1)
df_merge.insert(1, 'movie_id', df_merge.pop('movie_id'))

In [3]:
print(df_merge)

         user_id  movie_id     User  movie_title  value  genre_action  \
0              0       440  Vincent  10th & Wolf    0.0             0   
1              1       440    Edgar  10th & Wolf    0.0             0   
2              2       440  Addilyn  10th & Wolf    0.0             0   
3              3       440   Marlee  10th & Wolf    0.0             0   
4              4       440   Javier  10th & Wolf    0.0             0   
...          ...       ...      ...          ...    ...           ...   
1199995      595      1922  Mariana     Æon Flux    0.0             1   
1199996      596      1922      Ivy     Æon Flux    0.0             1   
1199997      597      1922    Kevin     Æon Flux    0.0             1   
1199998      598      1922     Nora     Æon Flux    0.0             1   
1199999      599      1922    Sarai     Æon Flux    0.0             1   

         genre_adventure  genre_animation  genre_biography  genre_comedy  ...  \
0                      0                0 

In [4]:
##Find unique movies
df_merge['movie_id'].nunique()

2000

In [5]:
##Number of reviews
##top movies 20, lowest 1.

df_merge.loc[df_merge['value']>0]['movie_id'].value_counts()

143     20
1894    20
1178    20
1944    19
1666    19
        ..
1304     1
958      1
774      1
604      1
388      1
Name: movie_id, Length: 2000, dtype: int64

In [6]:
#Dataframe with number of reviews
rev_count= pd.DataFrame(df_merge.loc[df_merge['value']>0]['movie_id'].value_counts())

#Find the movies with less than 5 reviews
rare_movies = rev_count[rev_count['movie_id'] < 2].index

#Get the movies with 10 or more reviews
common_movies = df_merge[~df_merge['movie_id'].isin(rare_movies)]

common_movies_rated = common_movies.loc[common_movies['value']>0]

common_movies.shape


(1195200, 30)

In [13]:
df_merge


Unnamed: 0,user_id,movie_id,User,movie_title,value,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,...,genre_mystery,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,0,440,Vincent,10th & Wolf,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,440,Edgar,10th & Wolf,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,440,Addilyn,10th & Wolf,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,440,Marlee,10th & Wolf,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,440,Javier,10th & Wolf,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,595,1922,Mariana,Æon Flux,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1199996,596,1922,Ivy,Æon Flux,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1199997,597,1922,Kevin,Æon Flux,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1199998,598,1922,Nora,Æon Flux,0.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Unique movies with more than 1000 reviews:
common_movies['movie_id'].nunique()

1992

In [8]:
#Create pivot tables with user scores
user_movie_df = common_movies.pivot_table(index=['user_id'], columns=['movie_id'], values='value')

user_movie_df.replace(0, np.nan, inplace=True)

user_movie_df.shape

(600, 1992)

In [9]:
def get_row(df, user_id, movie_id):
    for i in movie_id:
        print(df[(df['user_id'] == user_id) & (df['movie_id'] == i)])
    return 

In [10]:
for i in range(5):
    #Get user info
    random_user_df = user_movie_df[user_movie_df.index == i]
    
    #Get all movies that the user watched
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
    
    #Select movies in pivot
    movies_watched_df= user_movie_df[movies_watched]

    ## Get numberof movies
    user_movie_count = movies_watched_df.T.notnull().sum().reset_index()
    user_movie_count.columns = ['user_id', 'movie_count']

    #People who have watched more than 50% movies together with user i:
    users_same_movies = user_movie_count[user_movie_count['movie_count']>(0.05*len(movies_watched))]['user_id']
    
    #Combine the data of user i and all similar users
    final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],random_user_df[movies_watched]])

    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ['user_id_1', 'user_id_2']
    corr_df = corr_df.reset_index()

    top_users = corr_df[(corr_df["user_id_1"] == i) & (corr_df["corr"] >= 0)][
    ["user_id_2", "corr"]].reset_index(drop=True)
    
    print("similar users:"+str(len(top_users)))

    top_users = top_users.sort_values(by='corr', ascending=False)
    top_users.rename(columns={"user_id_2": "user_id"}, inplace=True)

    top_users_ratings = top_users.merge(common_movies_rated[["user_id", "movie_id", "value"]], how='inner')



    top_users_ratings = top_users_ratings[top_users_ratings["user_id"] != i]
    
    top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['value']
    top_users_ratings.groupby('movie_id').agg({"weighted_rating": "mean"})

    recommendation_df = top_users_ratings.groupby('movie_id').agg({"weighted_rating": "mean"})
    recommendation_df = recommendation_df.reset_index().sort_values("weighted_rating", ascending=False)

    get_row(df_merge.iloc[:, : 5],i,recommendation_df['movie_id'].head(7))
    
    print(recommendation_df.head(7))








similar users:3
        user_id  movie_id     User movie_title  value
585600        0      1850  Vincent     Minions    0.0
        user_id  movie_id     User               movie_title  value
612000        0      1644  Vincent  My Best Friend's Wedding    0.0
         user_id  movie_id     User movie_title  value
1174200        0      1759  Vincent   Wild Card    0.0
        user_id  movie_id     User movie_title  value
784200        0      1595  Vincent    Shopgirl    3.0
        user_id  movie_id     User   movie_title  value
162000        0       223  Vincent  Broken Arrow    3.0
        user_id  movie_id     User       movie_title  value
432600        0       968  Vincent  How to Be Single    0.0
        user_id  movie_id     User      movie_title  value
469200        0       562  Vincent  Isn't She Great    0.0
    movie_id  weighted_rating
74      1850              5.0
61      1644              5.0
67      1759              5.0
56      1595              5.0
11       223          

Test KNN on cleaned data

In [11]:
data_pred = common_movies.loc[common_movies['value']==0]
X_pred = data_pred.drop('value', axis=1)
Y_pred = data_pred['value']

data_rated = common_movies.loc[common_movies['value']>0]

X = data_rated.drop('value', axis=1)
Y = data_rated['value']

Xtrain, Xtest, Ytrain, Ytest = TTS2(X,Y, test_size= .2, random_state=42)

clf = KNeighborsClassifier(n_neighbors=500)
clf.fit(Xtrain,Ytrain)

pred = clf.predict(Xtest)
exact_acc = accuracy_score(Ytest,pred)

bin_rev_real = np.where(Ytest >= 3,1,0)
bin_rev_pred = np.where(pred >=3,1,0)

bin_acc =  sum(a == b for a, b in zip(bin_rev_real, bin_rev_pred)) / len(bin_rev_real)

print(exact_acc)
print(bin_acc)

ValueError: could not convert string to float: 'Lincoln'

In [25]:
for i in range(5):
    unseen_movies = X_pred.loc[X_pred['user_id']==i].copy().reset_index(drop=True)
    unseen_movies.insert(2,'score', clf.predict(unseen_movies))
    unseen_movies.sort_values(by = ['score'], ascending=False, inplace=True)
    print('Recommendations for user '+str(i))
    print(pd.DataFrame(unseen_movies[['user_id','movie_id','score']]).head())
    get_row(df_merge.iloc[:, : 5],i,unseen_movies['movie_id'].head())



Recommendations for user 0
      user_id  movie_id  score
913         0       220    5.0
1272        0      1469    5.0
1644        0       117    5.0
410         0        77    5.0
1279        0       150    5.0
        user_id  movie_id     User movie_title  value
556200        0       220  Vincent     Madison    0.0
        user_id  movie_id     User        movie_title  value
777600        0      1469  Vincent  Shaun of the Dead    0.0
         user_id  movie_id     User  movie_title  value
1009800        0       117  Vincent  The Misfits    0.0
        user_id  movie_id     User movie_title  value
251400        0        77  Vincent  Die Hard 2    0.0
        user_id  movie_id     User        movie_title  value
782400        0       150  Vincent  Shinjuku Incident    0.0
Recommendations for user 1
     user_id  movie_id  score
713        1       191    5.0
927        1      1453    5.0
345        1      1369    5.0
344        1        28    5.0
778        1      1291    5.0
        

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=430916cd-0b84-4120-a50c-61096e15b16a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>