In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')



In [2]:
movies = md.read_mongo("finalyearproject","movies",True)
ratings = md.read_mongo("finalyearproject","reviews",True)
users = md.read_mongo("finalyearproject","users",True)
movies = movies.rename(columns={"_id":"on"})
movie_ratings = pd.merge(movies,ratings,on="on")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})


In [3]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)


qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['tmdb','title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)

qualified = qualified.sort_values('wr', ascending=False)
qualified


Unnamed: 0,tmdb,title,year,vote_count,vote_average,popularity,genres,wr
497,677638,We Bare Bears: The Movie,2020,9,4,97.699,"[Comedy, Animation, Adventure, Family, TV Movi...",2.351557
944,719088,"Yes, No, or Maybe Half?",2020,19,3,188.528,"[Animation, Drama, Romance, Animation, Drama]",2.254804
1419,804435,Vanquish,2021,15,3,79.213,"[Crime, Action, Thriller, Crime]",2.119314
1414,530079,Ride Your Wave,2019,14,3,81.208,"[Comedy, Animation, Fantasy, Drama, Romance, A...",2.077377
949,634528,The Marksman,2021,14,3,185.991,"[Crime, Action, Thriller, Crime]",2.077377
...,...,...,...,...,...,...,...,...
4,533514,Violet Evergarden: The Movie,2020,9,1,42.120,"[Animation, Fantasy, Drama, Romance, Animation...",0.664057
235,550205,Wish Dragon,2021,8,1,151.581,"[Comedy, Animation, Fantasy, Family, Animation...",0.641661
781,508947,Turning Red,2022,7,1,5329.202,"[Comedy, Animation, Fantasy, Family, Animation...",0.616065
965,763148,Time Is Up,2021,7,1,173.473,"[Drama, Romance, Drama]",0.616065


In [4]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85,limit=5):
    df = genre_movies[genre_movies['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','tmdb', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False)
    qualified = qualified.drop_duplicates(['tmdb']).dropna().head(limit)
    
    return qualified
build_chart("Drama")


Unnamed: 0,title,tmdb,year,vote_count,vote_average,popularity,wr
421,Little Women,331482,2019,4,4,44.068,4.0
409,The Specials,579245,2019,2,4,19.751,4.0
1265,Fatherhood,607259,2021,1,4,105.293,4.0
1135,Tall Girl 2,772272,2022,7,3,117.267,3.0
874,The Last Duel,617653,2021,7,3,280.07,3.0


In [5]:
movies['overview'] = movies['overview'].fillna("")
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [6]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]


In [31]:
user_movie_rating = movies_ratings.pivot_table(index='userId', columns='tmdb', values='rate')
ratings_mean_count = pd.DataFrame(movies_ratings.groupby('tmdb')['rate'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(movies_ratings.groupby('tmdb')['rate'].count())

def getCorelation(tmdb):
    movieSelected = user_movie_rating[tmdb]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation ['rating_counts']>1].sort_values('Correlation', ascending=False)
    df_movieCorrelation = df_movieCorrelation.reset_index()
    return df_movieCorrelation["tmdb"]

In [32]:
getCorelation(719088)

0     640265
1     613504
2     899405
3     763148
4     744275
5     719088
6     710356
7     667520
8     652837
9     632322
10    630566
11    933357
12    447362
13    533514
14    531428
15    530079
16    632632
17    594634
18    772272
19    818647
20    785521
21    784500
22    779047
23    776503
24    523781
25    522924
26    568160
27    730154
28    527641
29    342470
30    537915
31    245842
Name: tmdb, dtype: int64

In [27]:
user_movie_rating

tmdb,38700,245842,301528,330457,331482,342470,359724,385128,385687,400160,...,899405,922885,926899,927855,928381,928769,930701,933357,937337,944664
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
625495df8ca5e225e5473995,,0.333333,1.5,,4.0,2.666667,,,,,...,3.0,,1.0,1.5,,4.0,0.5,3.5,0.666667,0.5
625499768ca5e225e5475a58,2.6,,,,,,,2.454545,,,...,,,,,3.142857,,,,,
6254b1e98ca5e225e548c632,,,,,,,,,,,...,,,,,,,,,,
6254b3e18ca5e225e548ce58,,,3.125,1.8,,,,,,2.833333,...,,,,,,,,,,
6254c08d8ca5e225e5494015,2.4,,,,,,,1.5,1.555556,,...,,,,,2.222222,,,,,
62556c3b8ca5e225e549587e,,3.5,,,4.0,2.833333,,,,,...,1.0,,,,,,,2.0,,
62582dc212014fc3392d66a1,,2.857143,,,,,3.0,,,,...,,2.583333,,,,,,,,


In [None]:
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords

In [None]:
plt.subplots(figsize=(12,12))
stop_words = set(stopwords.words('english'))
stop_words.update(',',';','!','?','.','(',')','$','#','+',':','...',' ','')

words=movies['overview'].dropna().apply(nltk.word_tokenize)
word=[]
for i in words:
    word.extend(i)
word=pd.Series(word)
word=([i for i in word.str.lower() if i not in stop_words])
wc = WordCloud(background_color="black", max_words=2000, stopwords=STOPWORDS, max_font_size= 60,width=1000,height=1000)
wc.generate(" ".join(word))
plt.imshow(wc)
plt.axis('off')
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [10]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import seaborn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [42]:
knn_ratings = ratings[["userId","on","rate"]]
knn_movies = movies[["on","tmdb"]]
knn_movie_ratings = pd.merge(knn_movies,knn_ratings,on="on")


In [43]:
movie_rating_count = (knn_movie_ratings.
                      groupby(by=["tmdb"])["rate"].
                      count().reset_index().
                      rename(columns={'rate':'totalRating'})
                      [["tmdb","totalRating"]]
                     )
movie_rating_count

Unnamed: 0,tmdb,totalRating
0,38700,10
1,245842,19
2,301528,10
3,330457,5
4,331482,4
...,...,...
161,928769,1
162,930701,2
163,933357,10
164,937337,3


In [44]:
rating_with_totalRatingCount=knn_movie_ratings.merge(movie_rating_count,left_on='tmdb',right_on='tmdb',how="inner")
rating_with_totalRatingCount.head()
rating_with_totalRatingCount.info()
rating_with_totalRatingCount.describe()
rating_with_totalRatingCount['totalRating'].count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1400 entries, 0 to 1399
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   on           1400 non-null   object
 1   tmdb         1400 non-null   int64 
 2   userId       1400 non-null   object
 3   rate         1400 non-null   int64 
 4   totalRating  1400 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 65.6+ KB


1400

In [45]:
popularity_threshold=3
#rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount>=@popularity_threshold')
rating_popular_movie=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRating']>popularity_threshold]
rating_popular_movie.head()

Unnamed: 0,on,tmdb,userId,rate,totalRating
0,624d882287b246f81c48c8b7,652837,625495df8ca5e225e5473995,3,6
1,624d882287b246f81c48c8b7,652837,625495df8ca5e225e5473995,2,6
2,624d882287b246f81c48c8b7,652837,625495df8ca5e225e5473995,5,6
3,624d882287b246f81c48c8b7,652837,62556c3b8ca5e225e549587e,3,6
4,624d882287b246f81c48c8b7,652837,62556c3b8ca5e225e549587e,0,6


In [46]:
combined = rating_popular_movie.merge(users,left_on='userId',right_on='_id',how="inner")
combined = combined.drop_duplicates(['userId','tmdb'])

In [47]:
knn_users = combined.pivot(index="tmdb",columns="_id",values="rate").fillna(0)
knn_rating_user_csr = csr_matrix(knn_users.values)

In [48]:
from sklearn.neighbors import NearestNeighbors
model_knn=NearestNeighbors(metric="cosine",algorithm="brute")
model_knn.fit(knn_rating_user_csr)

NearestNeighbors(algorithm='brute', metric='cosine')

In [77]:
query_index=4
distances,indices=model_knn.kneighbors(knn_users.iloc[query_index,:].values.reshape(1,-1),n_neighbors=5)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(knn_users.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,knn_users.index[indices.flatten()[i]],distances.flatten()[i]))

Distances --> [[0.         0.00048792 0.00375941 0.00611627 0.00611627]]  Indices --> [[  4  74  90  81 108]]
[0.         0.00048792 0.00375941 0.00611627 0.00611627]
5
Recommendation for 331482:

1: 632632, with distance of 0.00048792391292096227:
2: 719088, with distance of 0.0037594118043169367:
3: 652837, with distance of 0.006116265326380876:
4: 785521, with distance of 0.006116265326380876:


In [79]:
query_index=719088
new_shape = knn_users.reset_index()
values = new_shape[new_shape["tmdb"] == query_index].drop(columns=['tmdb']).values.reshape(1,-1)

distances,indices=model_knn.kneighbors(values,n_neighbors=5)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(query_index))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,knn_users.index[indices.flatten()[i]],distances.flatten()[i]))

Distances --> [[0.         0.00113186 0.00153965 0.00375941 0.00772212]]  Indices --> [[90 73 74  4 13]]
[0.         0.00113186 0.00153965 0.00375941 0.00772212]
5
Recommendation for 719088:

1: 632322, with distance of 0.0011318622755625007:
2: 632632, with distance of 0.0015396467945874015:
3: 331482, with distance of 0.0037594118043169367:
4: 447362, with distance of 0.007722123286332372:


In [73]:
indices

array([[  4,  74,  90,  81, 108]])

In [69]:
knn_users.iloc[query_index,:].values.reshape(1,-1)

array([[5., 0., 0., 0., 0., 4., 0.]])

In [67]:
knn_users.reset_index()[knn_users.reset_index()["tmdb"] == 719088]

_id,tmdb,625495df8ca5e225e5473995,625499768ca5e225e5475a58,6254b1e98ca5e225e548c632,6254b3e18ca5e225e548ce58,6254c08d8ca5e225e5494015,62556c3b8ca5e225e549587e,62582dc212014fc3392d66a1
90,719088,3.0,0.0,0.0,0.0,0.0,2.0,0.0
