## Reference: https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

In [94]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


#importing movies_df
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [95]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [96]:
ratings_df.drop(['timestamp','userId'],axis = 1, inplace = True)

In [97]:
ratings_df.head()

Unnamed: 0,movieId,rating
0,296,5.0
1,306,3.5
2,307,5.0
3,665,5.0
4,899,3.5


In [98]:
ratings_mean = ratings_df.groupby('movieId')['rating'].mean()
print(ratings_mean)

movieId
1         3.893708
2         3.251527
3         3.142028
4         2.853547
5         3.058434
            ...   
209157    1.500000
209159    3.000000
209163    4.500000
209169    3.000000
209171    3.000000
Name: rating, Length: 59047, dtype: float64


In [99]:
#Checking if previous number of movieId's are correct
ratings_df['movieId'].nunique()

59047

In [100]:
#Remove unused dataframes to save space
del ratings_df

In [101]:
#Merge on movieId
merge_df = pd.merge(movies_df, ratings_mean, on = 'movieId' )
#merge_df.columns = ['movieId','title','genres','average_rating']    #changing the name of the rating column to average_rating

#Or can rename only specific column
merge_df.rename(columns={'rating': 'average_rating'}, inplace=True)
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893708
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy|Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [102]:
merge_df.dtypes

movieId             int64
title              object
genres             object
average_rating    float64
dtype: object

In [103]:
#Remove unused dataframes to save space
del movies_df
del ratings_mean

In [104]:
#remove seperator from genres column
# for index, row in merge_df.iterrows():
#     genre = row['genres']
#     row['genres'] = genre.split('|')
#     row['genres'] = ' '.join(row['genres'])
    
merge_df['genres'] = merge_df['genres'].str.split('|')

In [105]:
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",3.893708
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",3.251527
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",3.142028
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",2.853547
4,5,Father of the Bride Part II (1995),[Comedy],3.058434


In [106]:
merge_df.dtypes

movieId             int64
title              object
genres             object
average_rating    float64
dtype: object

In [107]:
#merge_df['genres'] = merge_df['genres'].apply(lambda x: ''.join(map(str, x)))

#merge_df['genres'] = merge_df.genres.apply(lambda x: ' '.join([str(i) for i in x]))

#Join all the genres together in a string
#referece: https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column
merge_df['genres'] = [' '.join(map(str, l)) for l in merge_df['genres']]
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [108]:
# for index, row in merge_df.iterrows():
#     temp = row['genres']
#     temp1 = ' '.join(temp)
#     row['genres'] = temp1
#     #merge_df[index][row] = ' '.join(temp)
#Pandas doc: You should never modify something you are iterating over. This is not guaranteed to work in all cases. Depending on the data types, the iterator returns a copy and not a view, and writing to it will have no effect.

In [109]:
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547
4,5,Father of the Bride Part II (1995),Comedy,3.058434


In [110]:
merge_df.to_csv("df3.csv",index_label=False)


In [111]:
del merge_df

In [112]:
tags_df = pd.read_csv('tags.csv')

In [113]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598
3,4,1732,great dialogue,1573943604
4,4,7569,so bad it's good,1573943455


In [114]:
tags_df.drop(['timestamp','userId'],axis = 1, inplace = True)
tags_df.head()

Unnamed: 0,movieId,tag
0,260,classic
1,260,sci-fi
2,1732,dark comedy
3,1732,great dialogue
4,7569,so bad it's good


In [115]:
tags_df.dtypes

movieId     int64
tag        object
dtype: object

In [116]:
#Doesn't work because a tag row has float object
#a = tags_df.groupby('movieId')['tag'].sum()

#Doesn't work because a tag row has float object
#tags_df.groupby('movieId')['tag'].apply(' '.join)


#Make sure tag column has all string objects
#Reference: https://stackoverflow.com/questions/22005911/convert-columns-to-string-in-pandas
tags_df['tag'] = tags_df['tag'].astype(str)

#Group strings that have same movieId
#Reference: https://stackoverflow.com/questions/38127209/how-to-use-groupby-to-concatenate-strings-in-python-pandas
combine_tags = tags_df.groupby('movieId')['tag'].apply(' '.join)



In [117]:
combine_tags.head()

movieId
1    Owned imdb top 250 Pixar Pixar time travel chi...
2    Robin Williams time travel fantasy based on ch...
3    funny best friend duringcreditsstinger fishing...
4    based on novel or book chick flick divorce int...
5    aging baby confidence contraception daughter g...
Name: tag, dtype: object

In [118]:
#Checking if the grouped tags based on movieId make sense
tags_df.loc[tags_df['movieId'] == 1]

Unnamed: 0,movieId,tag
2306,1,Owned
3749,1,imdb top 250
4226,1,Pixar
10003,1,Pixar
10004,1,time travel
...,...,...
1084685,1,American Animation
1084688,1,computer animation
1084689,1,pixar
1088052,1,Pixar


In [119]:
#Remove unused dataframe
del tags_df

In [120]:
#Merge the groupby tags and the merge_df from df3.csv
merge_df = pd.read_csv('df3.csv')
merge_df = pd.merge(merge_df, combine_tags, on = 'movieId' )
merge_df.rename(columns={'tag': 'tags'}, inplace=True)
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,Owned imdb top 250 Pixar Pixar time travel chi...
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,Robin Williams time travel fantasy based on ch...
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,funny best friend duringcreditsstinger fishing...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,based on novel or book chick flick divorce int...
4,5,Father of the Bride Part II (1995),Comedy,3.058434,aging baby confidence contraception daughter g...


In [121]:
#Change average_rating to string to concatenate
#merge_df['average_rating'] = merge_df['average_rating'].astype(str)

merge_df['bag_of_words'] = merge_df["title"] + " " + merge_df["genres"] + " " + merge_df["tags"] 
merge_df.head()

Unnamed: 0,movieId,title,genres,average_rating,tags,bag_of_words
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.893708,Owned imdb top 250 Pixar Pixar time travel chi...,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),Adventure Children Fantasy,3.251527,Robin Williams time travel fantasy based on ch...,Jumanji (1995) Adventure Children Fantasy Robi...
2,3,Grumpier Old Men (1995),Comedy Romance,3.142028,funny best friend duringcreditsstinger fishing...,Grumpier Old Men (1995) Comedy Romance funny b...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.853547,based on novel or book chick flick divorce int...,Waiting to Exhale (1995) Comedy Drama Romance ...
4,5,Father of the Bride Part II (1995),Comedy,3.058434,aging baby confidence contraception daughter g...,Father of the Bride Part II (1995) Comedy agin...


In [122]:
merge_df.drop(['genres','tags'],axis = 1, inplace = True)
merge_df.head()

Unnamed: 0,movieId,title,average_rating,bag_of_words
0,1,Toy Story (1995),3.893708,Toy Story (1995) Adventure Animation Children ...
1,2,Jumanji (1995),3.251527,Jumanji (1995) Adventure Children Fantasy Robi...
2,3,Grumpier Old Men (1995),3.142028,Grumpier Old Men (1995) Comedy Romance funny b...
3,4,Waiting to Exhale (1995),2.853547,Waiting to Exhale (1995) Comedy Drama Romance ...
4,5,Father of the Bride Part II (1995),3.058434,Father of the Bride Part II (1995) Comedy agin...


In [123]:
count = CountVectorizer()
count_matrix = count.fit_transform(merge_df['bag_of_words'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [124]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use in the function to match the indexes
indices = pd.Series(merge_df.index)

#  defining the function that takes in movie title 
# as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim):
    
    # initializing the empty list of recommended movies
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[merge_df['title'] == title].index[0]


    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies   #update: 20 most similar movies
    top_10_indexes = list(score_series.iloc[1:21].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(merge_df.index)[i])
        
    return recommended_movies

In [128]:
recommends= recommendations('Lord of the Rings: The Fellowship of the Ring, The (2001)')
result = pd.DataFrame(columns = ['movieId','title','average_rating','bag_of_words'])
print(recommends)
for index in range(len(recommends)):
    result.loc[index] = merge_df.iloc[recommends[index]]
    
    #print(merge_df.iloc[elem])


[5553, 6702, 17729, 1942, 9995, 1898, 4546, 19128, 21529, 13059, 11081, 5426, 15801, 1972, 21704, 26749, 11644, 11824, 1875, 9956]


In [130]:
#Recommendations based on cosine similarity
result



Unnamed: 0,movieId,title,average_rating,bag_of_words
0,5952,"Lord of the Rings: The Two Towers, The (2002)",4.068051,"Lord of the Rings: The Two Towers, The (2002) ..."
1,7153,"Lord of the Rings: The Return of the King, The...",4.09034,"Lord of the Rings: The Return of the King, The..."
2,98809,"Hobbit: An Unexpected Journey, The (2012)",3.663229,"Hobbit: An Unexpected Journey, The (2012) Adve..."
3,2161,"NeverEnding Story, The (1984)",3.521222,"NeverEnding Story, The (1984) Adventure Childr..."
4,41566,"Chronicles of Narnia: The Lion, the Witch and ...",3.393433,"Chronicles of Narnia: The Lion, the Witch and ..."
5,2116,"Lord of the Rings, The (1978)",3.207337,"Lord of the Rings, The (1978) Adventure Animat..."
6,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.678158,Harry Potter and the Sorcerer's Stone (a.k.a. ...
7,106489,"Hobbit: The Desolation of Smaug, The (2013)",3.629776,"Hobbit: The Desolation of Smaug, The (2013) Ad..."
8,118696,The Hobbit: The Battle of the Five Armies (2014),3.510419,The Hobbit: The Battle of the Five Armies (201...
9,70802,"Secret of Moonacre, The (2008)",3.405405,"Secret of Moonacre, The (2008) Adventure Fanta..."


In [131]:
#Recommendation based on highest rated similar movies
sort_by_average_rating = result.sort_values('average_rating',ascending=False)
sort_by_average_rating

Unnamed: 0,movieId,title,average_rating,bag_of_words
1,7153,"Lord of the Rings: The Return of the King, The...",4.09034,"Lord of the Rings: The Return of the King, The..."
0,5952,"Lord of the Rings: The Two Towers, The (2002)",4.068051,"Lord of the Rings: The Two Towers, The (2002) ..."
12,88125,Harry Potter and the Deathly Hallows: Part 2 (...,3.906986,Harry Potter and the Deathly Hallows: Part 2 (...
19,40815,Harry Potter and the Goblet of Fire (2005),3.76788,Harry Potter and the Goblet of Fire (2005) Adv...
6,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.678158,Harry Potter and the Sorcerer's Stone (a.k.a. ...
2,98809,"Hobbit: An Unexpected Journey, The (2012)",3.663229,"Hobbit: An Unexpected Journey, The (2012) Adve..."
11,5816,Harry Potter and the Chamber of Secrets (2002),3.641784,Harry Potter and the Chamber of Secrets (2002)...
7,106489,"Hobbit: The Desolation of Smaug, The (2013)",3.629776,"Hobbit: The Desolation of Smaug, The (2013) Ad..."
3,2161,"NeverEnding Story, The (1984)",3.521222,"NeverEnding Story, The (1984) Adventure Childr..."
8,118696,The Hobbit: The Battle of the Five Armies (2014),3.510419,The Hobbit: The Battle of the Five Armies (201...


In [92]:
df1 = pd.read_csv("df3.csv")
df2 = pd.read_csv("tags.csv")
#df2_key = df2.Colname2

# creating a empty bucket to save result
df_result = pd.DataFrame(columns=(df1.columns.append(df2.columns)).unique())
df_result.to_csv("df3.csv",index_label=False)


# deleting df2 to save memory
del(df2)

def preprocess(x):
    df2=pd.merge(df1,x, on = 'movieId')
    df2.to_csv("df3.csv",mode="a",header=False,index=False)

reader = pd.read_csv("tags.csv", chunksize=1000) # chunksize depends with you colsize

[preprocess(r) for r in reader]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
# tags_df = pd.read_csv('tags.csv')
# tags_df.drop(['userId','timestamp'],axis = 1)

In [None]:
def preprocess(x):
    df2=pd.merge(df1,x, on = movieId)
    df2.to_csv("df3.csv",mode="a",header=False,index=False)

reader = pd.read_csv("tags.csv", chunksize=1000) # chunksize depends with you colsize

[preprocess(r) for r in reader]

In [None]:
merge_df.head()

In [None]:
# Reference:https://stackoverflow.com/questions/31765123/pandas-dataframe-merge-memor
# cellsfilepath = 'C:\\Path\To\Cells\CSVFile.csv'
# tp = pd.io.parsers.read_csv(cellsfilepath, sep=',', iterator=True, chunksize=1000)
# cell_s = pd.concat(tp, ignore_index=True)

# tagsfilepath = 'C:/Users/Juay/Documents/ML Projects/ml-25m/ml-25m/tags.csv'
# tp = pd.io.parsers.read_csv(tagsfilepath, sep=',', iterator=True, chunksize=1000)
# merge_df = pd.concat(tp, ignore_index=True)

In [None]:
merge_df = pd.merge(merge_df, tags_df, on ='movieId')
merge_df.head()

In [None]:
del tags_df

In [None]:
#Restart python interpretar
%reset -f