# Importing Packages and Data

In [24]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from scipy.sparse import csr_matrix



In [25]:
# MovieLens "latest" data set, taken from
# https://grouplens.org/datasets/movielens/latest/
# Both CSV files are read from the current working directory.

Movie_df, Rating_df = (
    pd.read_csv(csv_name) for csv_name in ("movies.csv", "ratings.csv")
)

In [26]:
# Preview the first rows of the ratings table (one row per user/movie rating).
Rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [27]:
# Preview the first rows of the movie metadata table (movieId, title, genres).
Movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# Preprocessing

In [28]:
# Join every rating onto the metadata of the movie it refers to, using the
# shared movieId column as the key (default inner join).
df = Movie_df.merge(Rating_df, on="movieId")

In [29]:
# Preview the merged movie + rating table.
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [30]:
# Keep only the rows that actually carry a title (title is not NaN).
has_title = df["title"].notna()
df_2_ = df[has_title]

In [31]:
# Count how many ratings each title received -> columns: title, TotalRating.
#
# The counts are built directly from `df` so that this cell is idempotent.
# Deriving them from `df_2_` would fail on a re-run, because this cell
# overwrites `df_2_` with a frame that no longer has a 'rating' column.
df_2_ = (
    df.dropna(subset=["title"])
    .groupby("title")["rating"]
    .count()
    .reset_index()
    .rename(columns={"rating": "TotalRating"})
)
df_2_.head(10)

Unnamed: 0,title,TotalRating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
5,'Tis the Season for Love (2015),1
6,"'burbs, The (1989)",17
7,'night Mother (1986),1
8,(500) Days of Summer (2009),42
9,*batteries not included (1987),7


In [32]:
# Attach each title's rating count (TotalRating) to every rating row, then
# drop the genres and timestamp columns.
df_3_ = (
    df.merge(df_2_, how="left", on="title")
    .drop(columns=["genres", "timestamp"])
)
df_3_.tail()


Unnamed: 0,movieId,title,userId,rating,TotalRating
100831,193581,Black Butler: Book of the Atlantic (2017),184,4.0,1
100832,193583,No Game No Life: Zero (2017),184,3.5,1
100833,193585,Flint (2017),184,3.5,1
100834,193587,Bungo Stray Dogs: Dead Apple (2018),184,3.5,1
100835,193609,Andrew Dice Clay: Dice Rules (1991),331,4.0,1


In [33]:
def _three_decimals(x):
    """Render a float with exactly three digits after the decimal point."""
    return "%.3f" % x


# From here on, every float shown in a DataFrame uses three decimals.
pd.options.display.float_format = _three_decimals
df_3_.describe()

Unnamed: 0,movieId,userId,rating,TotalRating
count,100836.0,100836.0,100836.0,100836.0
mean,19435.296,326.128,3.502,58.759
std,35530.987,182.618,1.043,61.965
min,1.0,1.0,0.5,1.0
25%,1199.0,177.0,3.0,13.0
50%,2991.0,325.0,3.5,39.0
75%,8122.0,477.0,4.0,84.0
max,193609.0,610.0,5.0,329.0


In [34]:
# Minimum number of ratings a title must have to be kept in the data set.
threshold = 50

In [35]:
# Keep only the ratings of titles that were rated at least `threshold` times.
popular_enough = df_3_["TotalRating"] >= threshold
df_3_ = df_3_.loc[popular_enough]
df_3_.tail()

Unnamed: 0,movieId,title,userId,rating,TotalRating
98310,122904,Deadpool (2016),561,2.0,54
98311,122904,Deadpool (2016),586,4.0,54
98312,122904,Deadpool (2016),596,4.0,54
98313,122904,Deadpool (2016),599,3.5,54
98314,122904,Deadpool (2016),610,3.0,54


In [36]:
# Title x user rating matrix: one row per title, one column per userId,
# 0 where the user has not rated the title.
#
# Aggregate with mean, not sum: rows are keyed by title rather than movieId,
# so if two movieIds share a title a user can contribute more than one rating
# to the same (title, userId) cell. Summing those would push the value above
# the 5.0 maximum of the rating scale; averaging keeps it on the scale.
pt = df_3_.groupby(["title", "userId"])["rating"].mean().unstack().fillna(0)

# Sparse copy of the same values, used to fit the nearest-neighbour model.
FinalData = csr_matrix(pt.values)
pt.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


# Making the Model

In [37]:
# Exhaustive (brute-force) neighbour search that compares the rows of
# FinalData by cosine distance.
KNN = NearestNeighbors(algorithm="brute", metric="cosine")
KNN.fit(FinalData)

# Implementation

In [38]:
def get_Rec(movie_index, n_neighbors=5):
    """Return the row positions of the titles closest to one title.

    Parameters
    ----------
    movie_index : int
        Positional row index of the query title in ``pt``.
    n_neighbors : int, default 5
        Number of neighbours requested from the fitted ``KNN`` model. The
        query row is part of the data the model was fitted on, so it is
        normally returned as the first neighbour; ask for ``k + 1`` to get
        ``k`` other titles.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_neighbors)`` holding row positions in ``pt``,
        as returned by ``KNN.kneighbors``.
    """
    # kneighbors expects a 2-D array, hence the reshape to a single-row matrix.
    query_row = pt.iloc[movie_index, :].values.reshape(1, -1)
    _, indices = KNN.kneighbors(query_row, n_neighbors=n_neighbors)
    return indices


In [39]:
def _title_keys(title, keep_year):
    """Return the lookup keys under which a stored title can be found.

    The title is lower-cased and stripped of spaces. Everything from the first
    "(" onwards (alternative names, release year) is kept only when
    ``keep_year`` is true. Two spellings of the main title are produced:

    * the stored one, e.g. "avengers,the";
    * when the main title contains a comma, the one with the part after the
      last comma moved to the front, e.g. "theavengers".
    """
    compact = title.lower().replace(" ", "")
    paren = compact.find("(")
    if paren <= 0:
        # No parenthesised part, or the title itself starts with "(":
        # treat the whole string as the main title instead of cutting it.
        main, rest = compact, ""
    else:
        main, rest = compact[:paren], compact[paren:]
    suffix = rest if keep_year else ""

    keys = {main + suffix}
    # Only commas of the main title are considered, so a comma inside the
    # parenthesised alternative names cannot corrupt the key.
    head, comma, tail = main.rpartition(",")
    if comma:
        keys.add(tail + head + suffix)
    return keys


movie_name = input("Enter Name of Movie \n P.S. - It should be present in training data")
formatted_movie_name = movie_name.lower().replace(" ", "")
print(formatted_movie_name)
row_names = pt.index

# Compare the parenthesised part (year / alternative names) only when the
# user typed one.
query_has_year = "(" in formatted_movie_name

# Position of the first matching title, or None when nothing matches.
# Assigning unconditionally also clears any value left over from a previous run.
movie_index = next(
    (
        i
        for i, k in enumerate(row_names)
        if formatted_movie_name in _title_keys(k, query_has_year)
    ),
    None,
)

if movie_index is None:
    # Fail here with a clear message instead of a NameError (or a stale
    # index) in the cells that follow.
    raise ValueError(
        f"No movie matching {movie_name!r} was found in the rating matrix."
    )

print(movie_index)


deadpool
121


In [40]:
# Row positions (in `pt`) of the nearest neighbours of the selected movie.
rec = get_Rec(movie_index)

In [41]:
# Articles that stored titles carry after the main title ("Avengers, The").
# The list is not exhaustive; a title whose suffix is not listed is simply
# shown in its stored form.
_TRAILING_ARTICLES = {
    "the", "a", "an",
    "le", "la", "les", "un", "une",
    "el", "los", "las",
    "il", "lo",
    "das", "der", "die",
}


def _display_title(name):
    """Return ``name`` with a trailing article moved to the front.

    "Avengers, The (2012)" becomes "The Avengers (2012)". Only the main title
    (the text before the first "(") is inspected, and only a suffix that is a
    known article is moved, so titles such as "Monsters, Inc. (2001)" or a
    comma inside the parenthesised alternative names are left untouched.
    """
    paren = name.find("(")
    if paren == -1:
        main, rest = name, ""
    else:
        main, rest = name[:paren], name[paren:]

    head, comma, tail = main.rstrip().rpartition(",")
    article = tail.strip()
    if not comma or article.lower() not in _TRAILING_ARTICLES:
        return name
    return f"{article} {head} {rest}".rstrip()


for i, k in enumerate(rec[0]):
    name = _display_title(row_names[k])

    # The first neighbour is treated as the queried movie itself and is
    # printed as the heading; the remaining ones are the recommendations.
    if i == 0:
        print(f"Recommendations for {name}:\n")
    else:
        print(f"{i}. {name}")

Recommendations for Deadpool (2016):

1. Guardians of the Galaxy (2014)
2. Interstellar (2014)
3. The Avengers (2012)
4. The Wolf of Wall Street (2013)
