In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from scipy.sparse import csr_matrix


import mongodb as md

import warnings; warnings.simplefilter('ignore')
movies = pd.read_csv("movies_metadata.csv")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies = movies.drop([19730, 29503, 35587])
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


small_links = pd.read_csv("small_links.csv")
ratings = pd.read_csv("ratings_small.csv")
movies = movies.rename(columns={'id':'movieId'})
movies["movieId"] = movies["movieId"].astype("int")

In [2]:
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,NaT
45462,False,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,2011
45463,False,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,1917


In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [4]:
knn_ratings = ratings[["userId","movieId","rating"]]
knn_movies = movies[["movieId","title"]]
knn_movie_ratings = pd.merge(knn_movies,knn_ratings,on="movieId")

movie_rating_count = (knn_movie_ratings.
                      groupby(by=["title"])["rating"].
                      count().reset_index().
                      rename(columns={'rating':'totalRating'})
                      [["title","totalRating"]]
                     )
rating_with_totalRatingCount=knn_movie_ratings.merge(movie_rating_count,left_on='title',right_on='title',how="inner")

popularity_threshold=50
rating_popular_movie=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRating']>popularity_threshold]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRating
98,5,Four Rooms,15,4.5,56
99,5,Four Rooms,18,3.0,56
100,5,Four Rooms,44,3.0,56
101,5,Four Rooms,55,3.0,56
102,5,Four Rooms,69,5.0,56


In [5]:
combined = rating_popular_movie.drop_duplicates(['userId','title'])
knn_users = combined.pivot(index="title",columns="userId",values="rating").fillna(0)
knn_rating_user_csr = csr_matrix(knn_users.values)

In [6]:
from sklearn.neighbors import NearestNeighbors
model_knn=NearestNeighbors(metric="cosine",algorithm="brute")
model_knn.fit(knn_rating_user_csr)

NearestNeighbors(algorithm='brute', metric='cosine')

In [28]:
query_index=0
print(knn_users.iloc[query_index,:])
 
distances,indices=model_knn.kneighbors(knn_users.iloc[query_index,:].values.reshape(1,-1),n_neighbors=10)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(knn_users.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,knn_users.index[indices.flatten()[i]],distances.flatten()[i]))

userId
1      0.0
2      0.0
3      0.0
4      3.0
5      0.0
      ... 
667    0.0
668    0.0
669    0.0
670    0.0
671    0.0
Name: 20,000 Leagues Under the Sea, Length: 665, dtype: float64
Distances --> [[6.66133815e-16 5.01715380e-01 5.48363523e-01 5.53317132e-01
  5.89048291e-01 5.89927966e-01 5.95636503e-01 5.97438486e-01
  5.97652091e-01 6.04295944e-01]]  Indices --> [[  0 131 223  20  42 134 179   6 155  96]]
[6.66133815e-16 5.01715380e-01 5.48363523e-01 5.53317132e-01
 5.89048291e-01 5.89927966e-01 5.95636503e-01 5.97438486e-01
 5.97652091e-01 6.04295944e-01]
10
Recommendation for 20,000 Leagues Under the Sea:

1: Popular Music, with distance of 0.5017153801702724:
2: Tough Enough, with distance of 0.5483635229149659:
3: Back to the Future Part II, with distance of 0.553317132295827:
4: Cockles and Muscles, with distance of 0.5890482912168521:
5: Rain Man, with distance of 0.589927965563734:
6: The Conversation, with distance of 0.5956365026359962:
7: A Nightmare on Elm Street

In [52]:
import seaborn as sb # visualizations
columns = ['distances','indices']
df = pd.DataFrame(columns=columns)
df['distances'] = pd.Series(distances[0])
df['indices'] = pd.Series(indices[0])

In [9]:
user_movie_rating = knn_movie_ratings.pivot_table(index='userId', columns='title', values='rating')
ratings_mean_count = pd.DataFrame(knn_movie_ratings.groupby('title')['rating'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(knn_movie_ratings.groupby('title')['rating'].count())

def getCorelation(movieName):
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation ['rating_counts']>50].sort_values('Correlation', ascending=False).head()
    df_movieCorrelation = df_movieCorrelation.reset_index()
    return df_movieCorrelation

In [12]:
getCorelation("20,000 Leagues Under the Sea")

Unnamed: 0,title,Correlation,rating_counts
0,"20,000 Leagues Under the Sea",1.0,89
1,Rocky V,0.632242,54
2,Yesterday,0.524681,54
3,Rambo III,0.52204,66
4,Spanglish,0.521201,51


In [13]:
movies['overview'] = movies['overview'].fillna("")
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]

In [21]:
get_recommendations("Spanglish")

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
10615,False,,0,"[Comedy, Drama, Fantasy, Romance]",,42234,tt0077452,pt,Dona Flor e Seus Dois Maridos,"In a small city of Brazil, Flor (a very good l...",...,0.0,110.0,"[{'iso_639_1': 'pt', 'name': 'Português'}]",Released,,Dona Flor and Her Two Husbands,False,6.2,13.0,1976
39097,False,"{'id': 313086, 'name': 'The Conjuring Collecti...",40000000,[Horror],http://www.warnerbros.com/conjuring-2,259693,tt3065204,en,The Conjuring 2,Lorraine and Ed Warren travel to north London ...,...,320170008.0,134.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The next true story from the case files of Ed ...,The Conjuring 2,False,7.0,2018.0,2016
23967,False,,0,[],,358199,tt0173629,en,All the Way Home,A wife and mother in 1915 Tennessee copes with...,...,0.0,120.0,[],Released,,All the Way Home,False,7.0,1.0,1981
26924,False,,0,"[Romance, Adventure, Music, Drama, Western]",,43880,tt0028207,en,Rose Marie,"Opera singer, Marie de Flor, seeks out fugitiv...",...,0.0,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Jeanette MacDonald and Nelson Eddy in the most...,Rose Marie,False,0.0,0.0,1936
16055,False,,0,"[Drama, Romance]",,182756,tt0037980,en,Perfect Strangers,Robert Donat and Deborah Kerr as a married cou...,...,0.0,102.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Mr. Chips Is Back In A New Thrilling Romance!,Perfect Strangers,False,7.0,1.0,1945
42205,False,,0,"[Comedy, Drama, Romance]",http://thebigsickmovie.com,416477,tt5462602,en,The Big Sick,Pakistan-born comedian Kumail Nanjiani and gra...,...,52620184.0,120.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An awkward true story.,The Big Sick,False,7.7,249.0,2017
20141,False,,0,"[Drama, TV Movie]",,237983,tt0280474,en,Just Ask My Children,Based on the true story of a modern-day witch ...,...,0.0,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A family torn apart by a system out of control,Just Ask My Children,False,7.2,5.0,2001
35625,False,,0,[Drama],http://www.bothemovie.com/,42748,tt1511329,nl,Bo,Fifteen year old Deborah meets Jennifer who st...,...,0.0,100.0,"[{'iso_639_1': 'nl', 'name': 'Nederlands'}, {'...",Released,,Bo,False,5.5,9.0,2010
241,False,,0,"[Drama, Action, Thriller, Crime]",,72031,tt0109906,en,The Glass Shield,J.J. is a rookie in the Sheriff's Department a...,...,0.0,109.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,In a world filled with violence... his only we...,The Glass Shield,False,5.4,9.0,1994
31740,False,,0,[Drama],,131116,tt0488027,pl,Plac Zbawiciela,"A story of a married couple with two children,...",...,0.0,105.0,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,,Saviour Square,False,7.4,4.0,2006
