In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
path = "movies.csv"                                     #MOVIES.CSV
moviescsv=pd.read_csv(path, usecols=['movieId','title','genres'], dtype={'movieId':'int32','title':'str'})
moviescsv.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
moviescsv.drop_duplicates(subset="title", keep='last',inplace=True)  #The 'inplace=True' ensures that the original 'movies' DataFrame is modified.
moviescsv

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
import re #regularexpressions
import string
def clean_text_func(text):
    '''Make text lowercase, removing special characters'''
    text = text.lower()
    # remove multiple whitespaces
    text = re.sub('\s+',' ', text)
    # remove characters that not words
    text=re.sub('[^a-zA-Z0-9]',' ',text)
    return text

clean_text = lambda x: clean_text_func(x)    #This lambda function allows us to apply the cleaning function conveniently in a single line.

In [5]:
moviescsv.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
if 'all_data' not in moviescsv:                #this lines prevent from re-runing the data which causes multiple datasets.
    moviescsv['all_data'] = moviescsv[moviescsv.columns[3:]].apply(
        lambda x: ' '.join(x.astype(str)),
        axis=1
        )

In [7]:
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import re
nltk.download('wordnet')

def lemma_tokens(tokens, lemma):
    lemma_result = [lemma.lemmatize(item) for item in tokens]
    return(lemma_result)

def tokenize(text):
    lemma = nltk.WordNetLemmatizer()
    text = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk.word_tokenize(text)

    # English Stop words
    # Re-add the additional stop words since we are recreating the document-term matrix
    stop_words = stopwords.words('english')

    # Remove all stop words: no_stops
    no_stops = [t for t in tokens if t not in stop_words]

    lemma = lemma_tokens(no_stops, lemma)
    return(lemma)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gauur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
path2 = "ratings.csv"                                 #RATINGS.CSV
ratings = pd.read_csv(path2,
                      usecols=['userId', 'movieId', 'rating', 'timestamp'],
                      dtype={'userId': 'Int32', 'movieId': 'Int32', 'rating': 'float32'},
                      na_values={'userId': -1, 'movieId': -1, 'rating': 0.0})

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880000.0
1,1,306,3.5,1147869000.0
2,1,307,5.0,1147869000.0
3,1,665,5.0,1147879000.0
4,1,899,3.5,1147869000.0


In [9]:
ratings.dtypes

userId         Int32
movieId        Int32
rating       float32
timestamp    float64
dtype: object

In [10]:
tags = pd.read_csv("tags.csv")
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472000.0
1,3,260,sci-fi,1439472000.0


In [11]:
ratings_t = ratings.groupby('userId').filter(lambda x: len(x) > 80)
ratings_t #limit ratings to user ratings that have rated more than 80 movies

Unnamed: 0,userId,movieId,rating,timestamp
70,2,1,3.5,1.141416e+09
71,2,62,0.5,1.141417e+09
72,2,110,5.0,1.141417e+09
73,2,150,4.0,1.141416e+09
74,2,151,4.5,1.141416e+09
...,...,...,...,...
3077630,20261,79132,5.0,1.449025e+09
3077631,20261,80463,5.0,1.449026e+09
3077632,20261,80489,5.0,1.449027e+09
3077633,20261,81562,5.0,1.449026e+09


In [12]:
ratings_t.shape

(2615732, 4)

In [13]:
movie_list_rating = ratings_t.movieId.unique().tolist() #list the movie titles that survive the filtering
movie_list_rating

[1,
 62,
 110,
 150,
 151,
 236,
 260,
 261,
 266,
 318,
 333,
 349,
 356,
 364,
 380,
 457,
 480,
 497,
 524,
 527,
 534,
 553,
 588,
 589,
 653,
 733,
 858,
 914,
 953,
 1035,
 1080,
 1136,
 1196,
 1197,
 1198,
 1201,
 1210,
 1246,
 1257,
 1270,
 1271,
 1275,
 1283,
 1287,
 1291,
 1293,
 1299,
 1302,
 1356,
 1374,
 1376,
 1393,
 1431,
 1465,
 1485,
 1488,
 1527,
 1580,
 1584,
 1587,
 1610,
 1653,
 1672,
 1674,
 1682,
 1693,
 1722,
 1873,
 1907,
 1923,
 1957,
 1968,
 2028,
 2081,
 2083,
 2115,
 2138,
 2139,
 2150,
 2194,
 2268,
 2273,
 2294,
 2324,
 2355,
 2359,
 2406,
 2470,
 2496,
 2501,
 2571,
 2617,
 2640,
 2643,
 2694,
 2720,
 2745,
 2761,
 2762,
 2797,
 2918,
 2987,
 3098,
 3105,
 3107,
 3114,
 3148,
 3175,
 3360,
 3396,
 3479,
 3510,
 3578,
 3793,
 3889,
 3916,
 3948,
 3994,
 4016,
 4019,
 4022,
 4023,
 4103,
 4299,
 4306,
 4535,
 4571,
 4720,
 4857,
 4874,
 4886,
 4896,
 4963,
 4993,
 4995,
 5010,
 5103,
 5349,
 5418,
 5445,
 5574,
 5816,
 5952,
 5955,
 6156,
 6157,
 6213,
 62

In [14]:
len(ratings_t.movieId.unique())/len(moviescsv.movieId.unique()) * 100 #we still have 97% of the original movie titles in ratings dataframe

50.685920577617324

In [15]:
len(ratings_t.userId.unique())/len(ratings.userId.unique()) * 100 #we have only 44% of the users, So now lets filter the movies data frame

45.145846700557726

In [16]:
movies = moviescsv[moviescsv.movieId.isin(movie_list_rating)]
movies.shape

(31520, 4)

In [17]:
mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))
mapping_file

{'Toy Story (1995)': 1,
 'Jumanji (1995)': 2,
 'Grumpier Old Men (1995)': 3,
 'Waiting to Exhale (1995)': 4,
 'Father of the Bride Part II (1995)': 5,
 'Heat (1995)': 6,
 'Sabrina (1995)': 7,
 'Tom and Huck (1995)': 8,
 'Sudden Death (1995)': 9,
 'GoldenEye (1995)': 10,
 'American President, The (1995)': 11,
 'Dracula: Dead and Loving It (1995)': 12,
 'Balto (1995)': 13,
 'Nixon (1995)': 14,
 'Cutthroat Island (1995)': 15,
 'Casino (1995)': 16,
 'Sense and Sensibility (1995)': 17,
 'Four Rooms (1995)': 18,
 'Ace Ventura: When Nature Calls (1995)': 19,
 'Money Train (1995)': 20,
 'Get Shorty (1995)': 21,
 'Copycat (1995)': 22,
 'Assassins (1995)': 23,
 'Powder (1995)': 24,
 'Leaving Las Vegas (1995)': 25,
 'Othello (1995)': 26,
 'Now and Then (1995)': 27,
 'Persuasion (1995)': 28,
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)': 29,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 30,
 'Dangerous Minds (1995)': 31,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)':

In [18]:
tags.drop(['timestamp'], axis = 1, inplace = True)
ratings_t.drop(['timestamp'], axis = 1, inplace = True)

In [19]:
mixed = pd.merge(movies, tags, on='movieId', how = 'left')
mixed.head()

Unnamed: 0,movieId,title,genres,all_data,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,791.0,Owned
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,1048.0,imdb top 250
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,1361.0,Pixar
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,3164.0,Pixar
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,3164.0,time travel


In [20]:
mixed.fillna("", inplace = True)
mixed = pd.DataFrame(mixed.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))

  mixed.fillna("", inplace = True)


In [21]:
mixed

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,Owned imdb top 250 Pixar Pixar time travel chi...
2,Robin Williams time travel fantasy
3,funny
4,
5,
...,...
209049,
209053,
209055,
209103,


In [22]:
Final = pd.merge(movies, mixed, on='movieId', how = 'left')
Final

Unnamed: 0,movieId,title,genres,all_data,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,Owned imdb top 250 Pixar Pixar time travel chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,,Robin Williams time travel fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,,funny
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,
4,5,Father of the Bride Part II (1995),Comedy,,
...,...,...,...,...,...
31515,209049,No Safe Spaces (2019),Documentary,,
31516,209053,Bowling (2012),Comedy,,
31517,209055,"Very Well, Thank You (2007)",Comedy|Drama,,
31518,209103,Tsar Ivan the Terrible (1991),(no genres listed),,


In [23]:
Final.drop(['all_data'], axis = 1, inplace = True)
Final

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned imdb top 250 Pixar Pixar time travel chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams time travel fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,funny
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,
4,5,Father of the Bride Part II (1995),Comedy,
...,...,...,...,...
31515,209049,No Safe Spaces (2019),Documentary,
31516,209053,Bowling (2012),Comedy,
31517,209055,"Very Well, Thank You (2007)",Comedy|Drama,
31518,209103,Tsar Ivan the Terrible (1991),(no genres listed),


In [24]:
Final ['metadata'] = Final[['tag', 'genres']].apply(lambda x: ' '.join(x), axis = 1)
Final

Unnamed: 0,movieId,title,genres,tag,metadata
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned imdb top 250 Pixar Pixar time travel chi...,Owned imdb top 250 Pixar Pixar time travel chi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams time travel fantasy,Robin Williams time travel fantasy Adventure|C...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny,funny Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,,Comedy
...,...,...,...,...,...
31515,209049,No Safe Spaces (2019),Documentary,,Documentary
31516,209053,Bowling (2012),Comedy,,Comedy
31517,209055,"Very Well, Thank You (2007)",Comedy|Drama,,Comedy|Drama
31518,209103,Tsar Ivan the Terrible (1991),(no genres listed),,(no genres listed)


In [25]:
ratings_t

Unnamed: 0,userId,movieId,rating
70,2,1,3.5
71,2,62,0.5
72,2,110,5.0
73,2,150,4.0
74,2,151,4.5
...,...,...,...
3077630,20261,79132,5.0
3077631,20261,80463,5.0
3077632,20261,80489,5.0
3077633,20261,81562,5.0


In [26]:
Final.shape

(31520, 5)

In [27]:
#colllaborative for the user ratings:

In [28]:
ratings_t.head()

Unnamed: 0,userId,movieId,rating
70,2,1,3.5
71,2,62,0.5
72,2,110,5.0
73,2,150,4.0
74,2,151,4.5


In [29]:
ratings_t.shape

(2615732, 3)

In [30]:
ratings_t1 = pd.merge(movies[['movieId']], ratings_t, on='movieId', how = 'right')

In [31]:
ratings_t1.shape

(2615732, 3)

In [32]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [33]:
movies_users= ratings_t.pivot(index='movieId', columns='userId',values='rating').fillna(0)

In [34]:
movies_users.shape

(31590, 9147)

In [35]:
mat_movies_users=csr_matrix(movies_users.values)

In [36]:
model_knn= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)


In [37]:
model_knn.fit(mat_movies_users)


In [38]:
from fuzzywuzzy import process #this is to get index of the movie


In [39]:
def recommender(movie_name, data,model, n_recommendations ):
    model.fit(data)
    idx=process.extractOne(movie_name, movies['title'])[2]
    print('Movie Selected: ',movies['title'][idx], 'Index: ',idx)
    print('Searching for recommendations.....')
    distances, indices=model.kneighbors(data[idx], n_neighbors=n_recommendations)
    for i in indices:
        print(movies['title'][i])
    


In [40]:
recommender('Jumanji (1995)', mat_movies_users, model_knn,20)

Movie Selected:  Jumanji (1995) Index:  1
Searching for recommendations.....
1                                          Jumanji (1995)
352                    Four Weddings and a Funeral (1994)
476                                     Kalifornia (1993)
3000                          McCabe & Mrs. Miller (1971)
1237                            Back to the Future (1985)
258                                   Little Women (1994)
360                                  Little Buddha (1993)
1179    Star Wars: Episode VI - Return of the Jedi (1983)
1166    Star Wars: Episode V - The Empire Strikes Back...
581                     Terminator 2: Judgment Day (1991)
2465                                   Telling You (1998)
1522                               For Ever Mozart (1996)
1168    Raiders of the Lost Ark (Indiana Jones and the...
293                                        Panther (1995)
4158                                      Scarface (1983)
315                                  Shallow Grave (1

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error