In [27]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')

In [28]:
movies = md.read_mongo("finalyearproject","movies",False)
ratings = md.read_mongo("finalyearproject","reviews")
movies = movies.rename(columns={"_id":"on"})
movie_ratings = pd.merge(movies,ratings,on="on")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})


In [29]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)


qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['tmdb','title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)

qualified = qualified.sort_values('wr', ascending=False)



In [30]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85):
    df = genre_movies[genre_movies['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False)
    
    return qualified



In [31]:
movies['overview'] = movies['overview'].fillna("")
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [32]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]


In [33]:
user_movie_rating = movies_ratings.pivot_table(index='userId', columns='movieTitle', values='rate')
ratings_mean_count = pd.DataFrame(movies_ratings.groupby('movieTitle')['rate'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(movies_ratings.groupby('movieTitle')['rate'].count())

def getCorelation(movieName):
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation = df_movieCorrelation[df_movieCorrelation ['rating_counts']>0].sort_values('Correlation', ascending=False).head()
    df_movieCorrelation = df_movieCorrelation.reset_index()
    return df_movieCorrelation

In [34]:
user_movie_rating

movieTitle,Avengers: Infinity War,Encanto,Fistful of Vengeance,Pirates of the Caribbean: On Stranger Tides,Scream,Spider-Man: No Way Home,The Batman,The Ice Age Adventures of Buck Wild
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
62533a094132df503392986b,5.0,2.0,,5.0,4.0,4.333333,5.0,
62535464fccc45e2de00689e,4.0,5.0,,,1.0,1.0,,4.0
6253626afccc45e2de037503,,4.0,5.0,,3.0,5.0,3.0,
62536950fccc45e2de127c54,,3.0,,,,,,5.0
62536a2afccc45e2de1599a0,,,,,,,,5.0


In [42]:
getCorelation("Scream")

Unnamed: 0,movieTitle,Correlation,rating_counts
0,Avengers: Infinity War,1.0,4
1,Scream,1.0,4
2,The Batman,1.0,2
3,Spider-Man: No Way Home,0.882498,5
4,Encanto,-0.928571,6


In [36]:
qualified.dropna()

Unnamed: 0,tmdb,title,year,vote_count,vote_average,popularity,genres,wr
7,890656,Fistful of Vengeance,2022,1,5,1594.013,"[Crime, Action, Crime, Fantasy]",5.0
205,1865,Pirates of the Caribbean: On Stranger Tides,2011,1,5,182.133,"[Action, Adventure, Fantasy]",5.0
1,414906,The Batman,2022,2,4,3827.658,"[Mystery, Crime, Crime, Mystery, Thriller]",4.0
14,774825,The Ice Age Adventures of Buck Wild,2022,3,4,1431.307,"[Animation, Comedy, Family, Adventure, Animati...",4.0
71,299536,Avengers: Infinity War,2018,4,4,338.402,"[Action, Adventure, Science Fiction]",4.0
0,568124,Encanto,2021,6,3,2402.201,"[Animation, Comedy, Family, Animation, Comedy,...",3.0
3,634649,Spider-Man: No Way Home,2021,5,3,5083.954,"[Action, Adventure, Science Fiction]",3.0
9,646385,Scream,2022,4,3,1675.161,"[Mystery, Horror, Mystery, Thriller]",3.0


In [37]:
df_genre = build_chart("Animation")

In [38]:
build_chart("Action")

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
7,Fistful of Vengeance,2022,1,5,1594.013,5.0
205,Pirates of the Caribbean: On Stranger Tides,2011,1,5,182.133,5.0
71,Avengers: Infinity War,2018,4,4,338.402,4.0
3,Spider-Man: No Way Home,2021,5,3,5083.954,3.0
4,The King's Man,2021,0,0,1895.511,
...,...,...,...,...,...,...
717,Kill Bill: Vol. 1,2003,0,0,40.954,
728,Akira,1988,0,0,50.144,
740,Justice League: The Flashpoint Paradox,2013,0,0,47.571,
748,Rurouni Kenshin: The Final,2021,0,0,116.225,
