In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')



In [2]:
movies = md.read_mongo("finalyearproject","series",True)
ratings = md.read_mongo("finalyearproject","reviews",True)
users = md.read_mongo("finalyearproject","users",True)
movies = movies.rename(columns={"_id":"on"})
movie_ratings = pd.merge(movies,ratings,on="on")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})
movies

Unnamed: 0,on,tmdb,title,overview,genres,genres_ids,vote_count,vote_average,popularity,release_date,year
0,625c92651b4a8150b1b27e74,76669,Elite,When three working class kids enroll in the mo...,"[Crime, Drama, Mystery]","[80, 18, 9648]",5,3.40,2644.519,2018-10-05,2018
1,625c92651b4a8150b1b27e70,92749,Moon Knight,"When Steven Grant, a mild-mannered gift-shop e...","[Action & Adventure, Drama, Sci-Fi & Fantasy]","[10759, 18, 10765]",2,3.50,11136.738,2022-03-30,2022
2,625c92651b4a8150b1b27e76,102045,The Choice,"The story of Ahmed Saber Al-Mansi, commander o...","[Action & Adventure, Drama, War & Politics]","[10759, 18, 10768]",4,2.75,1618.854,2020-04-24,2020
3,625c92651b4a8150b1b27e72,52814,Halo,Depicting an epic 26th-century conflict betwee...,"[Action & Adventure, Sci-Fi & Fantasy]","[10759, 10765]",5,4.40,9703.440,2022-03-24,2022
4,625c92651b4a8150b1b27e78,121745,Till Death,"Fed up with his wife’s distancing from him, Ha...",[Drama],[18],1,3.00,1507.532,2021-04-13,2021
...,...,...,...,...,...,...,...,...,...,...,...
1010,625c935c100ec9922fdfaebd,45016,The Bridge,When a body is found on the bridge between Den...,"[Crime, Mystery]","[80, 9648]",0,0.00,40.607,2011-09-21,2011
1011,625c935c100ec9922fdfaebf,42445,Borgen,40-year old political leader Birgitte Nyborg s...,[Drama],[18],0,0.00,34.841,2010-09-26,2010
1012,625c935c100ec9922fdfaec1,70540,Andi Mack,Andi is contemplative and artistic and shelter...,"[Drama, Comedy, Family]","[18, 35, 10751]",2,3.00,25.730,2017-03-07,2017
1013,625c935c100ec9922fdfaec3,1457,Pride and Prejudice,"Set in England in the early 19th century, Prid...",[Drama],[18],0,0.00,56.686,1995-09-24,1995


In [3]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)


qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['tmdb','title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False)
qualified


Unnamed: 0,tmdb,title,year,vote_count,vote_average,popularity,genres,wr
544,94810,Adventure Time: Distant Lands,2020,23,4,94.913,"[Animation, Comedy, Family, Sci-Fi & Fantasy]",3.396092
877,61602,Cumbia Ninja,2013,10,4,84.417,"[Crime, Drama, Comedy, Sci-Fi & Fantasy]",2.93428
118,154490,From the Second Side with Guy Zohar,2019,98,3,308.512,"[News, War & Politics]",2.894122
423,61175,Steven Universe,2013,32,3,114.513,"[Action & Adventure, Animation, Comedy, Family...",2.714943
558,38693,Ninjago: Masters of Spinjitzu,2012,25,3,92.998,"[Action & Adventure, Animation, Comedy, Kids, ...",2.652586
874,39518,My Babysitter's a Vampire,2011,23,3,26.212,"[Action & Adventure, Comedy, Kids, Family, Sci...",2.629425
868,33880,The Legend of Korra,2012,22,3,74.01,"[Action & Adventure, Animation, Drama, Family,...",2.616647
786,65931,Bungo Stray Dogs,2016,21,3,56.469,"[Action & Adventure, Animation, Crime, Drama, ...",2.602956
308,61852,Henry Danger,2014,20,3,151.604,"[Action & Adventure, Drama, Comedy, Family, Sc...",2.58825
781,74440,Harley Quinn,2019,19,3,53.952,"[Action & Adventure, Animation, Crime, Comedy,...",2.572414


In [114]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85,limit=5):
    df = genre_movies[genre_movies['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','tmdb', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False)
    qualified = qualified.drop_duplicates(['tmdb']).dropna().head(limit)
    
    return qualified
build_chart("Drama")


Unnamed: 0,title,tmdb,year,vote_count,vote_average,popularity,wr
445,Disney Intertwined,115304,2021,15,4,110.056,3.655523
381,Alice in Borderland,110316,2020,5,4,124.256,3.163412
399,Lupin,96677,2021,2,5,119.945,3.035971
814,WorldEnd: What are you doing at the end of the...,70639,2017,2,5,22.596,3.035971
644,"Love, Victor",97186,2020,2,5,64.664,3.035971


In [65]:
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).   join(s)
qualified = genre_movies.groupby(by="genre").sum("vote_average").sort_values(by=['vote_average'],ascending=False)
qualified

Unnamed: 0,title,tmdb,year,vote_count,vote_average,popularity,wr
838,Prodigal Son,91875,2019,4,3,34.189,3.0
50,Reacher,108978,2022,36,2,537.137,2.0
485,Malverde: The Patron Saint,135193,2021,26,2,103.894,2.0
893,Mare of Easttown,115004,2021,9,2,63.861,2.0
781,Harley Quinn,74440,2019,26,2,53.952,2.0


In [4]:
movies['overview'] = movies['overview'].fillna("")
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    try:
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:31]
        movie_indices = [i[0] for i in sim_scores]
        return movies.iloc[movie_indices]
    except Exception as e:
        print(e)
        return []
        

In [15]:
get_recommendations("Suits")

The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()


[]

In [118]:
getCorelation(65931)

Unnamed: 0,tmdb,Correlation,rating_counts
0,65931,1.0,17
1,68267,1.0,24
2,94810,1.0,16
3,96203,1.0,12
4,135193,0.992778,11
5,74440,0.963784,12


In [None]:
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.corpus import stopwords

In [None]:
plt.subplots(figsize=(12,12))
stop_words = set(stopwords.words('english'))
stop_words.update(',',';','!','?','.','(',')','$','#','+',':','...',' ','')

words=movies['overview'].dropna().apply(nltk.word_tokenize)
word=[]
for i in words:
    word.extend(i)
word=pd.Series(word)
word=([i for i in word.str.lower() if i not in stop_words])
wc = WordCloud(background_color="black", max_words=2000, stopwords=STOPWORDS, max_font_size= 60,width=1000,height=1000)
wc.generate(" ".join(word))
plt.imshow(wc)
plt.axis('off')
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [10]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import seaborn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [68]:
knn_ratings = ratings[["userId","on","rate"]]
knn_movies = movies[["on","tmdb"]]
knn_movie_ratings = pd.merge(knn_movies,knn_ratings,on="on")


In [69]:
movie_rating_count = (knn_movie_ratings.
                      groupby(by=["tmdb"])["rate"].
                      count().reset_index().
                      rename(columns={'rate':'totalRating'})
                      [["tmdb","totalRating"]]
                     )
movie_rating_count

Unnamed: 0,tmdb,totalRating
0,46639,1
1,60059,1
2,61889,6
3,61901,1
4,61923,4
...,...,...
270,158307,2
271,158415,2
272,194495,2
273,196810,2


In [70]:
rating_with_totalRatingCount=knn_movie_ratings.merge(movie_rating_count,left_on='tmdb',right_on='tmdb',how="inner")
rating_with_totalRatingCount.head()
rating_with_totalRatingCount.info()
rating_with_totalRatingCount.describe()
rating_with_totalRatingCount['totalRating'].count()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 820 entries, 0 to 819
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   on           820 non-null    object
 1   tmdb         820 non-null    int64 
 2   userId       820 non-null    object
 3   rate         820 non-null    int64 
 4   totalRating  820 non-null    int64 
dtypes: int64(3), object(2)
memory usage: 38.4+ KB


820

In [106]:
popularity_threshold=10
#rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount>=@popularity_threshold')
rating_popular_movie=rating_with_totalRatingCount[rating_with_totalRatingCount['totalRating']>popularity_threshold]
rating_popular_movie.head()

Unnamed: 0,on,tmdb,userId,rate,totalRating
75,625c926c1b4a8150b1b28101,154490,6263e9d2fd2ee90bb8a2d0b6,3,49
76,625c926c1b4a8150b1b28101,154490,6263e9d2fd2ee90bb8a2d0b6,4,49
77,625c926c1b4a8150b1b28101,154490,6263e9d2fd2ee90bb8a2d0b6,2,49
78,625c926c1b4a8150b1b28101,154490,6263e9d2fd2ee90bb8a2d0b6,2,49
79,625c926c1b4a8150b1b28101,154490,6263e9d2fd2ee90bb8a2d0b6,3,49


In [107]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

combined = rating_popular_movie.merge(users,left_on='userId',right_on='_id',how="inner")
combined = combined.drop_duplicates(['userId','tmdb'])

In [108]:
knn_users = combined.pivot(index="tmdb",columns="_id",values="rate").fillna(0)
knn_rating_user_csr = csr_matrix(knn_users.values)

In [109]:
from sklearn.neighbors import NearestNeighbors
model_knn=NearestNeighbors(metric="cosine",algorithm="brute")
model_knn.fit(knn_rating_user_csr)

NearestNeighbors(algorithm='brute', metric='cosine')

In [110]:
query_index=4
distances,indices=model_knn.kneighbors(knn_users.iloc[query_index,:].values.reshape(1,-1),n_neighbors=5)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(knn_users.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,knn_users.index[indices.flatten()[i]],distances.flatten()[i]))

Distances --> [[0.         0.01869324 0.25956776 0.3513507  0.38616558]]  Indices --> [[4 3 5 1 6]]
[0.         0.01869324 0.25956776 0.3513507  0.38616558]
5
Recommendation for 78173:

1: 77184, with distance of 0.018693237074683733:
2: 91425, with distance of 0.2595677642771602:
3: 68267, with distance of 0.351350695101027:
4: 94810, with distance of 0.38616557702699883:


In [104]:
query_index=68267
new_shape = knn_users.reset_index()
values = new_shape[new_shape["tmdb"] == query_index].drop(columns=['tmdb']).values.reshape(1,-1)

distances,indices=model_knn.kneighbors(values,n_neighbors=5)
print("Distances -->",distances," Indices -->",indices)
 
print(distances.flatten())
print(len(distances.flatten()))
 
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendation for {0}:\n".format(query_index))
    else:
        print("{0}: {1}, with distance of {2}:".format(i,knn_users.index[indices.flatten()[i]],distances.flatten()[i]))

Distances --> [[0.         0.21476779 0.32828339 0.34277357 0.3513507 ]]  Indices --> [[1 6 9 5 4]]
[0.         0.21476779 0.32828339 0.34277357 0.3513507 ]
5
Recommendation for 68267:

1: 94810, with distance of 0.21476778908134353:
2: 123566, with distance of 0.32828338672015145:
3: 91425, with distance of 0.34277356680596194:
4: 78173, with distance of 0.351350695101027:


In [105]:
knn_users

_id,625495df8ca5e225e5473995,625499768ca5e225e5475a58,6254b1e98ca5e225e548c632,6254b3e18ca5e225e548ce58,6254c08d8ca5e225e5494015,62556c3b8ca5e225e549587e,6258a7df6ae20c911ad38f9f,625f35c0ed892b917c5c8be3,6263e8f3fd2ee90bb8a2c7f7,6263e96efd2ee90bb8a2cfee,6263e99bfd2ee90bb8a2d0ac,6263e9b3fd2ee90bb8a2d0b2,6263e9d2fd2ee90bb8a2d0b6,6263e9f2fd2ee90bb8a2d0bb,6263ea19fd2ee90bb8a2d0bf,6263ea32fd2ee90bb8a2d0c3,6263ea3efd2ee90bb8a2d0c7
tmdb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
65931,0.0,4.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
68267,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,0.0,3.0
74440,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
77184,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,2.0
78173,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,3.0
91425,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,3.0
94810,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
96203,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,3.0,0.0
115304,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123566,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,3.0


In [69]:
knn_users.iloc[query_index,:].values.reshape(1,-1)

array([[5., 0., 0., 0., 0., 4., 0.]])

In [67]:
knn_users.reset_index()[knn_users.reset_index()["tmdb"] == 719088]

_id,tmdb,625495df8ca5e225e5473995,625499768ca5e225e5475a58,6254b1e98ca5e225e548c632,6254b3e18ca5e225e548ce58,6254c08d8ca5e225e5494015,62556c3b8ca5e225e549587e,62582dc212014fc3392d66a1
90,719088,3.0,0.0,0.0,0.0,0.0,2.0,0.0


In [19]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).join(s)

df = genre_movies
vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.85)

qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','tmdb', 'year', 'vote_count', 'genre' , 'vote_average', 'popularity']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
qualified = qualified.sort_values('wr', ascending=False)
qualified = qualified

In [24]:
qualified.groupby(by="genre").sum("vote_average").sort_values(by=['vote_average'],ascending=False)

Unnamed: 0_level_0,tmdb,vote_count,vote_average,popularity,wr
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Drama,118818178,1644,344,31205.53,302.750176
Comedy,113532636,780,306,51948.972,254.051715
Animation,78484872,812,194,40159.238,176.619513
Crime,63263552,1582,194,31937.386,178.304616
Family,62187876,584,166,36267.788,146.636988
Romance,39926331,383,120,7177.831,102.351083
Action,37679810,793,109,15763.5,99.16799
Thriller,34697353,738,90,14918.536,83.260458
Fantasy,26615362,315,75,17253.867,67.690851
Adventure,23272207,236,66,9567.067,59.28442


In [20]:
import mongodb as md


In [22]:
md.read_mongo("finalyearproject","movies","online")

Unnamed: 0,_id,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date
0,624d882287b246f81c48c8b1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[Crime, Drama, Crime, Drama]",0,0.000000,78.147,1994-09-23
1,624d882287b246f81c48c8b3,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","[Comedy, Drama, Romance, Drama, Comedy]",0,0.000000,29.896,1995-10-20
2,624d882287b246f81c48c8b5,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[Crime, Drama, Crime, Drama]",0,0.000000,91.215,1972-03-14
3,624d882287b246f81c48c8b7,652837,"Josee, the Tiger and the Fish","With dreams of diving abroad, Tsuneo gets a jo...","[Animation, Drama, Romance, Animation, Drama]",6,2.500000,30.509,2020-12-25
4,624d882287b246f81c48c8cf,533514,Violet Evergarden: The Movie,As the world moves on from the war and technol...,"[Animation, Fantasy, Drama, Romance, Animation...",9,1.555556,42.120,2020-09-18
...,...,...,...,...,...,...,...,...,...
1464,624d88d361dd768818c97646,16996,17 Again,"On the brink of a midlife crisis, 30-something...","[Comedy, Comedy]",0,0.000000,91.448,2009-03-11
1465,624d88d361dd768818c9763a,141052,Justice League,Fuelled by his restored faith in humanity and ...,"[Action, Adventure, Fantasy, Science Fiction]",0,0.000000,80.543,2017-11-15
1466,624d88d361dd768818c9763e,102899,Ant-Man,Armed with the astonishing ability to shrink i...,"[Action, Adventure, Science Fiction]",0,0.000000,86.827,2015-07-14
1467,624d88d361dd768818c97644,44912,Green Lantern,"For centuries, a small but powerful force of w...","[Action, Adventure, Thriller, Science Fiction]",0,0.000000,86.645,2011-06-16
