In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from scipy.sparse import csr_matrix


import mongodb as md

import warnings; warnings.simplefilter('ignore')
movies = pd.read_csv("movies_metadata.csv")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies = movies.drop([19730, 29503, 35587])
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


small_links = pd.read_csv("small_links.csv")
ratings = pd.read_csv("ratings_small.csv")
movies = movies.rename(columns={'id':'movieId'})
movies["movieId"] = movies["movieId"].astype("int")

In [2]:
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,NaT
45462,False,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,2011
45463,False,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,1917


In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [32]:
knn_ratings = ratings[["userId","movieId","rating"]]
knn_movies = movies[["movieId","title"]]
knn_movie_ratings = pd.merge(knn_movies,knn_ratings,on="movieId")

movie_rating_count = (knn_movie_ratings.
                      groupby(by=["title"])["rating"].
                      count().reset_index().
                      rename(columns={'rating':'totalRating'})
                      [["title","totalRating"]]
                     )
rating_with_totalRatingCount=knn_movie_ratings.merge(movie_rating_count,left_on='title',right_on='title',how="inner")

popularity_threshold=70
rating_popular_movie=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRating']>popularity_threshold]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRating
219,2054,Mr. Holland's Opus,4,3.0,76
220,2054,Mr. Holland's Opus,15,2.0,76
221,2054,Mr. Holland's Opus,30,1.0,76
222,2054,Mr. Holland's Opus,43,2.0,76
223,2054,Mr. Holland's Opus,49,2.0,76


In [33]:
combined = rating_popular_movie.drop_duplicates(['userId','title'])
combined = combined.pivot(index="title",columns="userId",values="rating").fillna(0)
knn_rating_user_csr = csr_matrix(combined.values)

In [34]:
from sklearn.neighbors import NearestNeighbors
model_knn=NearestNeighbors(metric="cosine",algorithm="brute")
model_knn.fit(knn_rating_user_csr)

NearestNeighbors(algorithm='brute', metric='cosine')

In [35]:
query_index=92
print(combined.iloc[query_index,:])
 
distances,indices=model_knn.kneighbors(combined.iloc[query_index,:].values.reshape(1,-1),n_neighbors=5)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(combined.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,combined.index[indices.flatten()[i]],distances.flatten()[i]))

userId
2      3.0
3      3.0
4      5.0
5      3.5
6      0.0
      ... 
667    0.0
668    0.0
669    0.0
670    0.0
671    4.0
Name: Silent Hill, Length: 658, dtype: float64
Distances --> [[1.11022302e-15 2.68051545e-01 3.03361260e-01 3.85584215e-01
  3.98009527e-01]]  Indices --> [[ 92 132  18  79  63]]
[1.11022302e-15 2.68051545e-01 3.03361260e-01 3.85584215e-01
 3.98009527e-01]
5
Recommendation for Silent Hill:

1: To Kill a Mockingbird, with distance of 0.26805154514014184:
2: Batman Returns, with distance of 0.30336126037089606:
3: Rain Man, with distance of 0.38558421543210364:
4: Monsoon Wedding, with distance of 0.39800952668320855:


In [36]:
columns = ['distances','indices']
df = pd.DataFrame(columns=columns)
df['distances'] = pd.Series(distances[0])
df['indices'] = pd.Series(indices[0])
df["title"] = combined.index[df["indices"]]

In [37]:
import plotly.express as px
fig = px.scatter(df, x="distances", y="indices")
fig.show()

In [15]:
user_movie_rating = knn_movie_ratings.pivot_table(index='userId', columns='title', values='rating')
ratings_mean_count = pd.DataFrame(knn_movie_ratings.groupby('title')['rating'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(knn_movie_ratings.groupby('title')['rating'].count())

def getCorelation(movieName):
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation ['rating_counts']>5].sort_values('Correlation', ascending=False).head()
    df_movieCorrelation = df_movieCorrelation.reset_index()
    return df_movieCorrelation

In [11]:
getCorelation("20,000 Leagues Under the Sea")

Unnamed: 0,title,Correlation,rating_counts
0,"20,000 Leagues Under the Sea",1.0,89
1,No Reservations,0.980581,23
2,Krull,0.907035,28
3,We're No Angels,0.866025,26
4,Harry Potter and the Goblet of Fire,0.853381,21


In [12]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]

In [13]:
get_recommendations("Spanglish")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [16]:
columns = ['distances','indices']
df = pd.DataFrame(columns=columns)
df['distances'] = pd.Series(distances[0])
df['indices'] = pd.Series(indices[0])
df["title"] = combined.index[df["indices"]]
import plotly.express as px
fig = px.scatter(df, x="distances", y="indices")
fig.show()