This recommender system is based on: Grimaldi,E. (2018, Oct 2). How to build a content-based movie recommender system with Natural Language Processing. Retrieved from: https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243. 

Changes have been made so that the provided code works.

Count Vectorizer:
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [1]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


# Load the movie metadata (movieId, title, genres) and preview the first rows.
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# Load the individual ratings and preview the first rows.
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
# Neither the timestamp nor the userId is used for prediction, so discard both columns.
ratings_df = ratings_df.drop(columns=['timestamp', 'userId'])

In [4]:
# Confirm that only movieId and rating remain after the drop.
ratings_df.head()

Unnamed: 0,movieId,rating
0,296,5.0
1,306,3.5
2,307,5.0
3,665,5.0
4,899,3.5


In [5]:
# A movie appears on many rating rows, so reduce the ratings to one mean value per movieId.
ratings_mean = ratings_df.groupby('movieId')['rating'].agg('mean')
print(ratings_mean)

movieId
1         3.893708
2         3.251527
3         3.142028
4         2.853547
5         3.058434
            ...   
209157    1.500000
209159    3.000000
209163    4.500000
209169    3.000000
209171    3.000000
Name: rating, Length: 59047, dtype: float64


In [6]:
# Sanity check: the number of distinct movieIds should equal the length of ratings_mean printed above.
ratings_df['movieId'].nunique()

59047

In [7]:
# Free the raw ratings frame to save memory; later cells use ratings_mean instead.
del ratings_df

In [8]:
# Attach each movie's mean rating via movieId (pandas' default inner join), then
# expose it as 'average_rating'. Only that one column is renamed; assigning a full
# list to merge_df.columns would be the alternative when every name must change.
merge_df = (
    movies_df
    .merge(ratings_mean, on='movieId')
    .rename(columns={'rating': 'average_rating'})
)
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [9]:
# Check the column dtypes of the merged frame.
merge_df.dtypes

movieId             int64
title              object
genres             object
average_rating    float64
dtype: object

In [10]:
# Both inputs now live on inside merge_df, so free them to save memory.
del movies_df
del ratings_mean

In [11]:
# Split the pipe-delimited genres string into a list of genre names for the whole
# column at once. Iterating with iterrows() is computationally inefficient, and
# writing to the rows it yields is not guaranteed to change the frame.
merge_df = merge_df.assign(genres=merge_df['genres'].str.split('|'))

In [12]:
# Confirm that genres now holds a list of genre names per movie.
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",3.893708
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",3.251527
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",3.142028
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",2.853547
4,5,Father of the Bride Part II (1995),[Comedy],3.058434


In [13]:
# genres now holds Python lists, which pandas still reports as object dtype.
merge_df.dtypes

movieId             int64
title              object
genres             object
average_rating    float64
dtype: object

In [14]:
# Collapse each list of genres back into one space-separated string.
# Reference: https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column
merge_df['genres'] = merge_df['genres'].map(
    lambda genre_list: ' '.join(str(genre) for genre in genre_list)
)
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [15]:
###Pandas doc: You should never modify something you are iterating over. This is not guaranteed to work in all cases. Depending on the data types, the iterator returns a copy and not a view, and writing to it will have no effect.
#Example below does not work
# for index, row in merge_df.iterrows():
#     temp = row['genres']
#     temp1 = ' '.join(temp)
#     row['genres'] = temp1
#     #merge_df[index][row] = ' '.join(temp)


In [16]:
# The loop in the previous cell is commented out, so genres should still hold
# the space-joined strings produced earlier.
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [17]:
# Persist the cleaned frame so it can be deleted from memory while the tags
# dataset is processed.
# index=False keeps the positional row index out of the file. The previous
# index_label=False wrote the index values but no header field for them, so the
# header had one field fewer than every data row and reading the file back only
# worked through pandas' implicit "first column is the index" inference.
merge_df.to_csv("df3.csv", index=False)

In [18]:
# The frame was saved to df3.csv above; free it while the tags are processed.
del merge_df

In [19]:
# Load the tag data (one row per userId / movieId / tag, see the preview below).
tags_df = pd.read_csv('tags.csv')

In [20]:
# Inspect the layout of tags.csv.
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [21]:
# timestamp and userId play no part in the predictions, so discard both columns.
tags_df = tags_df.drop(columns=['timestamp', 'userId'])
tags_df.head()

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good


In [22]:
# 'tag' is reported as object dtype, so it is not guaranteed to contain only strings.
tags_df.dtypes

movieId     int64
tag        object
dtype: object

In [23]:
# Goal: one entry per movie holding all of its tags as a single space-separated
# string (a movieId can occur on many rows of tags_df).

# A plain groupby('movieId')['tag'].sum() or .apply(' '.join) fails here because
# at least one tag is a float rather than a string (presumably a missing value
# read as NaN - verify against tags.csv).
# Casting with astype(str) alone would turn every missing tag into the literal
# word "nan" and feed it to the vectorizer as if it were a real tag, so rows
# without a tag are dropped first; the cast then only guards against any other
# non-string values.
# Reference: https://stackoverflow.com/questions/22005911/convert-columns-to-string-in-pandas
tags_df = tags_df.dropna(subset=['tag'])
tags_df = tags_df.assign(tag=tags_df['tag'].astype(str))

# Concatenate the tags that share a movieId.
# Reference: https://stackoverflow.com/questions/38127209/how-to-use-groupby-to-concatenate-strings-in-python-pandas
combine_tags = tags_df.groupby('movieId')['tag'].apply(' '.join)



In [24]:
# Inspect the combined tag string of the first few movies.
combine_tags.head()

movieId
1    Owned imdb top 250 Pixar Pixar time travel chi...
2    Robin Williams time travel fantasy based on ch...
3    funny best friend duringcreditsstinger fishing...
4    based on novel or book chick flick divorce int...
5    aging baby confidence contraception daughter g...
Name: tag, dtype: object

In [25]:
# Spot check: list the individual tag rows of movieId 1 to compare them with
# its combined string shown above.
tags_df.loc[tags_df['movieId'] == 1]

Unnamed: 0,movieId,tag
2306,1,Owned
3749,1,imdb top 250
4226,1,Pixar
10003,1,Pixar
10004,1,time travel
...,...,...
1084685,1,American Animation
1084688,1,computer animation
1084689,1,pixar
1088052,1,Pixar


In [26]:
# The per-row tags are no longer needed once combine_tags exists; free the memory.
del tags_df

In [27]:
# Reload the frame saved to df3.csv (movieId, title, genres, average_rating),
# attach each movie's combined tag string via movieId, and expose that column
# as 'tags' instead of 'tag'.
merge_df = (
    pd.read_csv('df3.csv')
    .merge(combine_tags, on='movieId')
    .rename(columns={'tag': 'tags'})
)

merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,Owned imdb top 250 Pixar Pixar time travel chi...
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,Robin Williams time travel fantasy based on ch...
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,funny best friend duringcreditsstinger fishing...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,based on novel or book chick flick divorce int...
4,5,Father of the Bride Part II (1995),Comedy,3.058434,aging baby confidence contraception daughter g...


In [28]:
# Build the text that is fed to the count vectorizer: title, genres and tags
# joined by single spaces into one 'bag_of_words' column.
# average_rating is deliberately left out and stays a separate numeric column;
# turning it into text and mixing it into the bag would not be a good idea.
merge_df['bag_of_words'] = merge_df["title"].str.cat(
    merge_df[["genres", "tags"]], sep=" "
)
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating,tags,bag_of_words
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,Owned imdb top 250 Pixar Pixar time travel chi...,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,Robin Williams time travel fantasy based on ch...,Jumanji (1995) Adventure Children Fantasy Robi...
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,funny best friend duringcreditsstinger fishing...,Grumpier Old Men (1995) Comedy Romance funny b...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,based on novel or book chick flick divorce int...,Waiting to Exhale (1995) Comedy Drama Romance ...
4,5,Father of the Bride Part II (1995),Comedy,3.058434,aging baby confidence contraception daughter g...,Father of the Bride Part II (1995) Comedy agin...


In [29]:
# genres and tags are now contained in bag_of_words, so the separate columns can go.
merge_df = merge_df.drop(columns=['genres', 'tags'])
merge_df.head()

Unnamed: 0,movieId,title,average_rating,bag_of_words
0,1,Toy Story (1995),3.893708,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),3.251527,Jumanji (1995) Adventure Children Fantasy Robi...
2,3,Grumpier Old Men (1995),3.142028,Grumpier Old Men (1995) Comedy Romance funny b...
3,4,Waiting to Exhale (1995),2.853547,Waiting to Exhale (1995) Comedy Drama Romance ...
4,5,Father of the Bride Part II (1995),3.058434,Father of the Bride Part II (1995) Comedy agin...


In [30]:
# Turn every movie's bag of words into a row of token counts.
count = CountVectorizer()
count_matrix = count.fit_transform(merge_df['bag_of_words'])

# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself. NOTE(review): the result is a dense
# n_movies x n_movies array, so memory grows quadratically with the number of movies.
cosine_sim = cosine_similarity(count_matrix)

In [31]:
# Series of merge_df's index labels. It is kept because later cells may still
# refer to `indices`; the function below looks rows up directly and does not use it.
indices = pd.Series(merge_df.index)


def recommendations(title, cosine_sim = cosine_sim, top_n = 20):
    """Return the index labels of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``merge_df['title']``. When several rows
        carry the same title, the first one is used.
    cosine_sim : 2-D array
        Pairwise similarity matrix whose row order matches the row order of
        ``merge_df``. The default is bound when this cell is executed, so
        re-run the cell after recomputing the matrix.
    top_n : int, default 20
        Number of recommendations to return.

    Returns
    -------
    list
        Index labels of ``merge_df`` for the ``top_n`` most similar movies,
        most similar first. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``merge_df`` has the given title.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Row position (not label) of the first movie with a matching title.
    matches = np.flatnonzero((merge_df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {title!r} found in merge_df")
    idx = int(matches[0])

    # Remove the queried movie explicitly instead of assuming it sorts first:
    # another movie with an identical bag of words ties with it at the top, and
    # skipping "rank 0" could then drop the wrong row and recommend the movie
    # to itself. mergesort is stable, so ties keep their original row order.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending = False, kind = 'mergesort')
    )
    top_positions = score_series.index[:top_n].to_numpy()

    # Translate row positions into index labels in one vectorised step.
    return merge_df.index.take(top_positions).tolist()

In [40]:
# Try the recommender on an example movie.
recommends = recommendations('Lord of the Rings: The Fellowship of the Ring, The (2001)')
print(recommends)

# recommendations() returns index labels, so the rows are selected with .loc
# rather than .iloc. The table is built in a single selection instead of being
# grown row by row from an empty frame, which is slow and can leave every
# column with object dtype.
result = (
    merge_df
    .loc[recommends, ['movieId', 'title', 'average_rating']]
    .reset_index(drop=True)
)
    


[5553, 6702, 17729, 1942, 9995, 1898, 4546, 19128, 21529, 13059, 11081, 5426, 15801, 1972, 21704, 26749, 11644, 11824, 1875, 9956]


In [41]:
# Recommendations in order of decreasing cosine similarity.
result

Unnamed: 0,movieId,title,average_rating
0,5952,"Lord of the Rings: The Two Towers, The (2002)",4.068051
1,7153,"Lord of the Rings: The Return of the King, The...",4.09034
2,98809,"Hobbit: An Unexpected Journey, The (2012)",3.663229
3,2161,"NeverEnding Story, The (1984)",3.521222
4,41566,"Chronicles of Narnia: The Lion, the Witch and ...",3.393433
5,2116,"Lord of the Rings, The (1978)",3.207337
6,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.678158
7,106489,"Hobbit: The Desolation of Smaug, The (2013)",3.629776
8,118696,The Hobbit: The Battle of the Five Armies (2014),3.510419
9,70802,"Secret of Moonacre, The (2008)",3.405405


In [42]:
# Re-rank the same recommendations so that the best-rated movies come first.
sort_by_average_rating = result.sort_values(by='average_rating', ascending=False)
sort_by_average_rating

Unnamed: 0,movieId,title,average_rating
1,7153,"Lord of the Rings: The Return of the King, The...",4.09034
0,5952,"Lord of the Rings: The Two Towers, The (2002)",4.068051
12,88125,Harry Potter and the Deathly Hallows: Part 2 (...,3.906986
19,40815,Harry Potter and the Goblet of Fire (2005),3.76788
6,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.678158
2,98809,"Hobbit: An Unexpected Journey, The (2012)",3.663229
11,5816,Harry Potter and the Chamber of Secrets (2002),3.641784
7,106489,"Hobbit: The Desolation of Smaug, The (2013)",3.629776
3,2161,"NeverEnding Story, The (1984)",3.521222
8,118696,The Hobbit: The Battle of the Five Armies (2014),3.510419


In [35]:
###### Unused code below, kept for later reference

In [36]:
# df1 = pd.read_csv("df3.csv")
# df2 = pd.read_csv("tags.csv")
# #df2_key = df2.Colname2

# # creating a empty bucket to save result
# df_result = pd.DataFrame(columns=(df1.columns.append(df2.columns)).unique())
# df_result.to_csv("df3.csv",index_label=False)


# # deleting df2 to save memory
# del(df2)

# def preprocess(x):
#     df2=pd.merge(df1,x, on = 'movieId')
#     df2.to_csv("df3.csv",mode="a",header=False,index=False)

# reader = pd.read_csv("tags.csv", chunksize=1000) # chunksize depends with you colsize

# [preprocess(r) for r in reader]

In [37]:
### This was an attempt to merge the given dataframes into a single file. Because the files were too large, I tried to process
# them in batches, but later resorted to deleting unused dataframes instead because that was easier.

# def preprocess(x):
#     df2=pd.merge(df1,x, on = movieId)
#     df2.to_csv("df3.csv",mode="a",header=False,index=False)

# reader = pd.read_csv("tags.csv", chunksize=1000) # chunksize depends with you colsize

# [preprocess(r) for r in reader]

In [38]:
# Reference:https://stackoverflow.com/questions/31765123/pandas-dataframe-merge-memor
# cellsfilepath = 'C:\\Path\To\Cells\CSVFile.csv'
# tp = pd.io.parsers.read_csv(cellsfilepath, sep=',', iterator=True, chunksize=1000)
# cell_s = pd.concat(tp, ignore_index=True)

# tagsfilepath = 'C:/Users/Juay/Documents/ML Projects/ml-25m/ml-25m/tags.csv'
# tp = pd.io.parsers.read_csv(tagsfilepath, sep=',', iterator=True, chunksize=1000)
# merge_df = pd.concat(tp, ignore_index=True)

In [39]:
##### Resets the Python interpreter's namespace to free memory between trials

# Reset the Python interpreter
#%reset -f