In [1]:
#to change and work with dataframes
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


<h2>Content based Filtering</h2>

reading and entering csv files with pandas as dataframe

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

using head function to analyze and see the dataframe. default shows 5 rows.

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Seprating years of movies from title and creating and moving them to year column

In [5]:
#creating year column in movie dataframe and using extract function with regex on title column to copy the years with parentheses to year column
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))',expand=False)
#same method and function but we choose year column this time and choose the years without their parentheses
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)',expand=False)
#using replace function in title to choose the years with parentheses and replace them with nothing, in order to delete them.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#using strip function to remove any extra space in beginning and end of movies titles
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


making genres an array

In [6]:
#in genres column, genres are seprated with pipe line so we use split function to seprate a string between | and make each an index of an array
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
#Copying movie dataframe to new dataframe to not change the first one
moviesWithGenres_df = movies_df.copy()

#for every index in every row (movie) iterrates 
for index, row in movies_df.iterrows():
    #for every index in genres array, make a column if not existed named that genre and write 1 for its value
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
#naturally there are rows that doesnt have value (Nan) cause they didnt have that genre so, we fill them instead with 0 
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


using drop function to delete timestamp column cause it doesnt have any use

In [9]:
ratings_df = ratings_df.drop('timestamp', axis=1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


User input rating matrix

In [10]:
#making a costume user input rating 
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
#making a dataframe from user input rating
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


finding the id of movies person saw in our movie matrix

In [11]:
#finding and choosing the movies that have same movie name as our input in movies dataframe and copying them to inputid variable
inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]
#merging inputid(movies that the person saw) with his ratings, making the user rating matrix
inputMovies = pd.merge(inputId, inputMovies)
#deleting columns we dont need from our new dataframe
inputMovies = inputMovies.drop('genres', axis=1).drop('year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0
5,164600,Akira,4.5


In [12]:
inputId.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
292,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
1241,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988
1879,1968,"Breakfast Club, The","[Comedy, Drama]",1985


keeping the movies the person saw in dataframe that we created containing all genres as 1 and 0. Making the final movies and genere matrix

In [13]:
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies['movieId'])]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1241,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1879,1968,"Breakfast Club, The","[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42774,164600,Akira,"[Action, Crime, Thriller]",2016,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


cleaning the dataframe

In [14]:
#cleaning and making new set of index to use it easier
userMovies = userMovies.reset_index(drop=True)
#deleting columns we dont need to save memory by drop function and columns
userGenreTable = userMovies.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
inputMovies['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
5    4.5
Name: rating, dtype: float64

transpose the movies matrix and multiply it with user rating (matrix multiply). Making user profile

In [16]:
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
userProfile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 9.0
Crime                  9.5
Thriller               9.5
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

taking all movies and their genre 

In [17]:
#creating genretable and choosing every movie in movies with genres dataframe but setting new index to them as their movieid
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
#deleting the extra columns to save memory using drop function
genreTable = genreTable.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
genreTable.shape

(62423, 20)

Multiply the genres with user profile to achieve weighted matrix and then take the weighted average to have a single score for every movie

In [19]:
#multiply each genre with its rating from user and summation them and divide them by the summation of userprofile
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.500000
2    0.247059
3    0.158824
4    0.276471
5    0.158824
dtype: float64

Sorting

In [20]:
#sort decending with sort value function based on score
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
recommendationTable_df.head()

movieId
5018      0.735294
144324    0.723529
122787    0.723529
81132     0.723529
64645     0.723529
dtype: float64

recommendation matrix

In [21]:
movies_df.loc[movies_df['movieId'].isin(recommendationTable_df.head(10).keys())]

Unnamed: 0,movieId,title,genres,year
4614,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
4912,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991
6779,6902,Interstate 60,"[Adventure, Comedy, Drama, Fantasy, Mystery, S...",2002
9177,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
12879,64645,The Wrecking Crew,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1968
15389,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
23306,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000
25008,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959
33940,144324,Once Upon a Time,"[Action, Adventure, Comedy, Crime, Drama, Roma...",2008
36115,149488,Christmas Town,"[Action, Children, Comedy, Drama, Fantasy, Thr...",2008


<h2>Collaborative Filtering</h2>

reading and entering csv files with pandas as dataframe

In [22]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

Seprating years of movies from title and creating and moving them to year column

In [23]:
#creating year column in movie dataframe and using extract function with regex in title to copy the years with parentheses to year column
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))',expand=False)
#same method and function but we choose year column this time and choose the years without their parentheses
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)',expand=False)
#using replace function in title to choose the years with parentheses and replace them with nothing, in order to delete them.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#using strip function to remove any extra space in beginning and end of movies titles
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [24]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [25]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [26]:
#using drop function to delete timestamp column from dataframe to save memory cause we dont need it
ratings_df = ratings_df.drop('timestamp', axis=1)
#Dropping the genres column
movies_df = movies_df.drop('genres', axis=1)

In [27]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


<h2>User Based recommendation system</h2> 

Making costume user input and making it a dataframe(matrix)

In [28]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


finding the movies the person saw based on name in movie dataframe and keeping them with their ratings

In [29]:
#finding and choosing the movies that have same movie name as our input in movies dataframe and copying them to inputid
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#merging inputid(movies that the person saw) with his ratings, making the user rating matrix
inputMovies = pd.merge(inputId, inputMovies)
#deleting columns we dont need from our new dataframe
inputMovies = inputMovies.drop('year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0
5,164600,Akira,4.5


making a dataframe of every ratings that the movies have gotten, rating matrix

In [30]:
#Filtering out users that have watched movies that the input has watched and storing it 
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head() 

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
70,2,1,3.5
141,2,1968,1.0
254,3,1,4.0
264,3,296,5.0


making every person a subset group containing his ratings of movies

In [31]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby('userId')
userSubsetGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B827C7BD70>

In [32]:
userSubsetGroup.get_group(1)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0


In [33]:
#sorting decending based on how many movies our raters have in common with person
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [34]:
userSubsetGroup[0]

(43,
       userId  movieId  rating
 5858      43        1     4.0
 5859      43        2     3.5
 5884      43      296     5.0
 5974      43     1274     4.5
 6018      43     1968     4.5)

In [35]:
print(type(userSubsetGroup))  # Should be a list
print(type(userSubsetGroup[0]))  # Should be a tuple
print(type(userSubsetGroup[0][1]))  # Should be a pandas DataFrame


<class 'list'>
<class 'tuple'>
<class 'pandas.core.frame.DataFrame'>


keeps only the top 100 most similar to person

In [36]:
userSubsetGroup = userSubsetGroup[0:100]

using pearson correlation to know how similar they are

In [37]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user in every group in our subset
for name, group in userSubsetGroup:
    #sort by movieid
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula, caculating how many common movies they have
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    # current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #pearson correlation formula
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #checking not to have 0 in base and devide to complete the formula
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [38]:
pearsonCorrelationDict

{43: 0.9460287597114506,
 171: 0.3288967572401365,
 440: 0.9615384615384616,
 597: 0.4688072309384957,
 695: 0.8770580193070306,
 757: 0.4385290096535153,
 772: 0.584705346204686,
 860: 0.8594395636904102,
 939: 0.716114874039432,
 1203: -0.41671450165524815,
 1242: 0.711233325153824,
 1401: 0.07520710469952328,
 1652: 0.537086155529574,
 1894: 0.6020183016345586,
 1977: -0.323875137815648,
 2345: 0.43852900965351493,
 2429: 0.4385290096535153,
 2469: 0.7161148740394331,
 2640: -0.5264497328966636,
 2766: -0.21926450482675575,
 2982: 0.7307692307692307,
 3150: -0.41602514716892186,
 3266: 0.657793514480272,
 3272: 0.0,
 3274: 0.179028718509858,
 3624: 0.6698641270570834,
 3660: 0.7757911135427185,
 3760: 0.5860090386731196,
 4006: 0.179028718509858,
 4019: 0.39666441401095853,
 4060: 0.32328787506992,
 4246: 0.89514359254929,
 4410: 0.5288858853479448,
 4420: 0.657793514480273,
 4459: 0.8127665166512523,
 4675: 0.537086155529574,
 4975: 0.9176462238110027,
 5114: -0.1860521018838127,
 

making a dataframe from pearson dict verticaly

In [39]:
#converting every key and value in pearson dict to a dataframe, every row for 1 key vertically
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
#renaming the dataframe column
pearsonDF.columns = ['similarityIndex']
#creating new column named userId and having the values of index
pearsonDF['userId'] = pearsonDF.index
#making the index new and easy to work
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.946029,43
1,0.328897,171
2,0.961538,440
3,0.468807,597
4,0.877058,695


sorting ascending by similarity

In [40]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
99,0.983092,11495
64,0.970725,7723
55,0.962435,6550
94,0.961678,10960
63,0.961678,7571


Combining top users and ratings based on same userId

In [41]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.983092,11495,1,3.0
1,0.983092,11495,2,1.0
2,0.983092,11495,6,4.5
3,0.983092,11495,10,4.5
4,0.983092,11495,16,4.5


Multiplying similarity matrix and Weighted Ratings matrix making Rating and Weighted Matrix

In [42]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.983092,11495,1,3.0,2.949275
1,0.983092,11495,2,1.0,0.983092
2,0.983092,11495,6,4.5,4.423913
3,0.983092,11495,10,4.5,4.423913
4,0.983092,11495,16,4.5,4.423913


Making a temporary dataframe grouped by every movie and multiplied by similarity and ratings making weighted sum

In [43]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,39.973168,152.887748
2,39.973168,98.535608
3,12.129644,31.955678
4,2.447945,5.662576
5,11.35565,23.548374


making the recommendation matrix by using the formula 

In [44]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.824759,1
2,2.465044,2
3,2.634511,3
4,2.313196,4
5,2.073714,5


soritng and Recommendation matrix based on Weight sum

In [45]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
70927,5.0,70927
111235,5.0,111235
2931,5.0,2931
6660,5.0,6660
1169,5.0,1169
299,5.0,299
96606,5.0,96606
6583,5.0,6583
320,5.0,320
84952,5.0,84952


Finding the names of recommended movies

In [46]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
295,299,Priest,1994
316,320,Suture,1993
1141,1169,American Dream,1990
2839,2931,Time of the Gypsies (Dom za vesanje),1989
6460,6583,"Blood of Heroes, The (Salute of the Jugger, The)",1989
6537,6660,"Red Shoes, The",1948
13693,70927,To Each His Own Cinema (Chacun son cinéma ou C...,2007
16097,84952,Confessions (Kokuhaku),2010
18452,96606,Samsara,2011
21575,111235,Jodorowsky's Dune,2013


In [47]:
movies_df = pd.read_csv('movies.csv')

movies_df['genres'] = movies_df['genres'].fillna('')

cv = CountVectorizer(
    tokenizer=lambda x: x.split('|'),
    token_pattern=None
)

genre_matrix = cv.fit_transform(movies_df['genres'])


In [48]:
def recommend(movie_name, top_n=10):

    movie_name = movie_name.lower()

    matching_movies = movies_df[movies_df['title'].str.lower().str.contains(movie_name)]

    if matching_movies.empty:
        print("Movie not found")
        return

    idx = matching_movies.index[0]

    # Compute similarity ONLY for one movie
    similarity_scores = cosine_similarity(
        genre_matrix[idx],
        genre_matrix
    ).flatten()

    # Get top recommendations
    similar_indices = similarity_scores.argsort()[::-1][1:top_n+1]

    print("\nRecommended movies:\n")

    for i in similar_indices:
        print(movies_df.iloc[i]['title'])


In [49]:
recommend("Toy Story")



Recommended movies:

Turbo (2013)
Here Comes the Grump (2018)
DuckTales: The Movie - Treasure of the Lost Lamp (1990)
UglyDolls (2019)
Tale of Despereaux, The (2008)
Wild, The (2006)
The Magic Crystal (2011)
Puss in Book: Trapped in an Epic Tale (2017)
Trolls Holiday (2017)
Scooby-Doo! Mask of the Blue Falcon (2012)


In [50]:
recommend("Batman")



Recommended movies:

Esa ja Vesa - auringonlaskun ratsastajat (1994)
Why Me? (1990)
Rumble in the Bronx (Hont faan kui) (1995)
Ladrón que roba a ladrón (2007)
99 and 44/100% Dead (1974)
K-9: P.I. (2002)
Kingsman: The Secret Service (2015)
Emil and the Detectives (1964)
I Spy (2002)
Batman Forever (1995)


In [51]:
recommend("Avengers")



Recommended movies:

Troy the Odyssey (2017)
Le Mans (1971)
Three Musketeers, The (2011)
Tarzan and the Valley of Gold (1966)
Big Game (2014)
Golden Swallow (1968)
Expendables 2, The (2012)
Delta Force 3: The Killing Game (1991)
Arena, The (a.k.a. Naked Warriors) (1974)
The Ark of the Sun God (1984)


In [None]:
movie = input("Enter movie name: ")
recommend(movie)
