In [17]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')

In [18]:
# Alternative source kept for reference: pull the catalogue through the
# project's Mongo helper instead of the CSV dump.
#movies = md.read_mongo("finalyearproject","movies")

movies = pd.read_csv("movies_metadata.csv")

# Release year as a string (e.g. '1995'); missing or unparseable dates stay NaN.
# The previous guard `x != np.nan` was always True (NaN never compares equal to
# anything, itself included), so missing dates ended up as the literal string 'NaT'.
release_dates = pd.to_datetime(movies['release_date'], errors='coerce')
movies['year'] = release_dates.apply(lambda d: str(d.year) if pd.notna(d) else np.nan)

# Rows removed by index label before `id` is cast to int further down.
# NOTE(review): presumably malformed records whose `id` is not numeric - confirm against the CSV.
movies = movies.drop([19730, 29503, 35587])

# `genres` is stored as the string form of a list of dicts; parse it and keep
# only each entry's 'name'. Anything that does not parse to a list becomes [].
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


small_links = pd.read_csv("small_links.csv")
ratings = pd.read_csv("ratings_small.csv")
movies = movies.rename(columns={'id':'tmdbId'})
movies["tmdbId"] = movies["tmdbId"].astype("int")
# Both merges use pandas' defaults: inner join on every column name the two frames share.
movies_small = pd.merge(movies,small_links)
movies_small_ratings = pd.merge(movies_small,ratings)




In [19]:
# Inputs for the weighted rating computed by `weighted_rating`:
#   C - mean vote average over every movie that has one
#   m - vote-count threshold a movie must reach to be listed (95th percentile)
vote_counts = movies['vote_count'].dropna().astype('int')
# Averages stay floating point: casting them to int truncated e.g. 7.9 to 7,
# which pulled C down and collapsed distinct ratings onto the same score.
vote_averages = movies['vote_average'].dropna().astype('float')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)

# `>= m` is already False for missing vote counts, so no separate null check is needed.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `movies`.
qualified = movies.loc[
    (movies['vote_count'] >= m) & movies['vote_average'].notnull(),
    ['tmdbId', 'title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('float')


def weighted_rating(x):
    """Blend one movie's vote average with the global mean.

    `x` is a row-like object indexed by 'vote_count' and 'vote_average'.
    The movie's own average is weighted by count / (count + m) and the
    module-level mean `C` by m / (m + count), where `m` is the module-level
    vote-count threshold. Returns the sum of the two parts.
    """
    count = x['vote_count']
    average = x['vote_average']
    own_part = count / (count + m) * average
    prior_part = m / (m + count) * C
    return own_part + prior_part

# Score every qualifying movie row by row, then order the chart best-first.
qualified = (
    qualified
    .assign(wr=qualified.apply(weighted_rating, axis=1))
    .sort_values(by='wr', ascending=False)
)



In [20]:
# One row per (movie, genre) pair, still indexed by the movie's row label.
# explode() unpacks each genre list directly, replacing the far slower pattern
# of building a pd.Series per row and stacking the result. dropna() discards
# the NaN placeholder explode() emits for an empty list, which matches what
# stack() dropped before.
s = movies['genres'].explode().dropna()
s.name = 'genre'
# Index join: a movie with several genres is repeated once per genre; a movie
# with none keeps a single row whose 'genre' is NaN.
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85):
    """Return up to 250 movies of one genre ranked by weighted rating.

    Parameters
    ----------
    genre : value compared against the 'genre' column of the module-level
        `genre_movies` frame.
    percentile : float, default 0.85
        Quantile of the genre's vote counts used as the qualification
        threshold `m`.

    Returns
    -------
    DataFrame with columns title, year, vote_count, vote_average, popularity
    and wr, sorted by wr descending and limited to 250 rows. The frame is
    empty when no row matches `genre`.
    """
    df = genre_movies[genre_movies['genre'] == genre]
    vote_counts = df['vote_count'].dropna().astype('int')
    # Averages stay floating point: an int cast truncated e.g. 7.9 to 7,
    # biasing both the genre mean C and every movie's score.
    vote_averages = df['vote_average'].dropna().astype('float')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # `>= m` is False for missing vote counts, so only vote_average needs a null check.
    keep = (df['vote_count'] >= m) & df['vote_average'].notnull()
    # .copy() so the assignments below never write into a slice of genre_movies.
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('float')

    # Same formula as before, evaluated column-wise instead of once per row.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = v / (v + m) * R + m / (m + v) * C

    return qualified.sort_values('wr', ascending=False).head(250)



In [21]:
# Content-based model: TF-IDF over the plot overviews (unigrams and bigrams,
# English stop words removed), compared pairwise.
movies['overview'] = movies['overview'].fillna("")
# min_df=1 keeps every term, which is what min_df=0 did on older scikit-learn;
# recent releases reject an integer 0 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# equals cosine similarity.
# NOTE(review): the result is a dense (n_movies x n_movies) array - watch memory on the full catalogue.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# reset_index() gives a 0..n-1 positional index that lines up with the rows of
# cosine_sim, even though `movies` itself keeps its original row labels.
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
# title -> row position in cosine_sim (titles are not guaranteed to be unique).
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [6]:
def get_recommendations(title, top_n=30):
    """Return the movies whose overviews are most similar to `title`.

    Parameters
    ----------
    title : label looked up in the module-level `indices` series
        (title -> row position in `cosine_sim`). When several movies share
        the title, the first one is used.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    Rows of the module-level `movies` frame, most similar first, never
    including the queried movie itself.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A duplicated title makes the lookup return a Series of positions; that
    # used to turn the similarity row into a 2-D block and crash the sort.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: ties keep ascending row position, the same
    # tie-break the previous Python-level sort produced.
    order = np.argsort(-scores, kind='stable')
    # Remove the query movie explicitly rather than assuming it sorts first;
    # that assumption fails for tied scores or an empty overview.
    order = order[order != idx][:top_n]
    return movies.iloc[order]


In [15]:
# Users x titles matrix of ratings; pivot_table averages the values when a
# user/title pair occurs more than once.
user_movie_rating = pd.pivot_table(
    movies_small_ratings,
    values='rating',
    index='userId',
    columns='title',
)
# Per-title mean rating ('rating') and number of ratings ('rating_counts').
ratings_mean_count = (
    movies_small_ratings
    .groupby('title')['rating']
    .agg(rating='mean', rating_counts='count')
)

def getCorelation(movieName, min_ratings=50, top_n=5):
    """Rank titles by how strongly their user ratings correlate with one movie.

    Parameters
    ----------
    movieName : column label of the module-level `user_movie_rating` matrix.
    min_ratings : int, default 50
        Only titles with strictly more ratings than this are kept, so that
        correlations resting on a handful of users are ignored.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    DataFrame with the title index turned into a column, followed by
    'Correlation' and 'rating_counts', sorted by correlation descending.
    The queried movie itself is not filtered out.

    Raises
    ------
    KeyError
        If `movieName` is not a column of `user_movie_rating`.
    """
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected, method="pearson")
    # Build the frame without in-place mutation; NaN correlations are dropped
    # before the rating counts are joined on the title index.
    df_movieCorrelation = (
        movieCorrelation.dropna()
        .to_frame('Correlation')
        .join(ratings_mean_count['rating_counts'])
    )
    popular = df_movieCorrelation[df_movieCorrelation['rating_counts'] > min_ratings]
    return popular.sort_values('Correlation', ascending=False).head(top_n).reset_index()

In [16]:
# Example: the titles whose user ratings correlate most strongly with "Toy Story"
# (the queried title itself appears in the result).
getCorelation("Toy Story")

Unnamed: 0,title,Correlation,rating_counts
0,Toy Story,1.0,247
1,Toy Story 2,0.743352,125
2,A Bug's Life,0.677299,105
3,The Birds,0.62028,51
4,Basic Instinct,0.613613,56


In [None]:
# Example: movies ranked closest to "Inception" by overview similarity.
get_recommendations("Inception")

In [None]:
# Inspect the non-null vote counts (cast to int) from which the threshold `m` is taken.
vote_counts

In [4]:
# Load movie data through the project's `md.read_mongo` helper.
# NOTE(review): the two arguments look like database and collection names - confirm in mongodb.py.
# NOTE(review): this rebinds `movies`, replacing whatever frame an earlier cell stored under that name.
movies = md.read_mongo("finalyearproject","movies")

In [8]:
# Reload movies and reviews through the project's Mongo helper.
# The former `movies = ""` / `ratings = ""` placeholders were dead stores that
# briefly bound DataFrame names to strings; the loads below overwrite them anyway.
# NOTE(review): these rebind `movies` and `ratings`; cells that expect the
# frames previously stored under those names must not be re-run after this one.
movies = md.read_mongo("finalyearproject","movies")
ratings = md.read_mongo("finalyearproject","reviews")
# Reviews carry an `on` column; expose the movie `_id` under the same name so
# the two frames can be merged on it.
movies = movies.rename(columns={"_id":"on"})
# Default inner merge: only movies that have at least one review remain.
movie_ratings = pd.merge(movies,ratings,on="on")

Unnamed: 0,on,tmdb,title_x,overview,genres,vote_count,vote_average,popularity,release_date,_id,userId,title_y,rate,body,onModel,createdAt,updatedAt,__v
0,61e55216d20382a072e3d5a8,438695,Sing 2,Buster and his new cast now have their sights ...,"[Comedy, Animation, Family, Music]",1,3,9205.508,2021-12-01,61f8466bb604dfe15823568c,61f79b58b7644a60feebb9f2,Photorgrapht,3,Test test test,Movies,2022-01-31 20:28:27.667,2022-01-31 20:28:27.667,0
1,61e55216d20382a072e3d5bc,787310,Survive the Game,When cop David is injured in a drug bust gone ...,"[Action, Crime, Thriller]",1,5,2005.829,2021-10-08,61f83b7416e6ab33c09d5eb0,61f79b58b7644a60feebb9f2,Photorgrapht,5,Test test test,Movies,2022-01-31 19:41:40.010,2022-01-31 19:41:40.010,0


In [10]:
# Display the current `ratings` frame (pandas truncates long frames when rendering).
ratings

Unnamed: 0,_id,userId,title,rate,body,on,onModel,createdAt,updatedAt,__v
0,61f83b7416e6ab33c09d5eb0,61f79b58b7644a60feebb9f2,Photorgrapht,5,Test test test,61e55216d20382a072e3d5bc,Movies,2022-01-31 19:41:40.010,2022-01-31 19:41:40.010,0
1,61f8466bb604dfe15823568c,61f79b58b7644a60feebb9f2,Photorgrapht,3,Test test test,61e55216d20382a072e3d5a8,Movies,2022-01-31 20:28:27.667,2022-01-31 20:28:27.667,0


In [11]:
# Display the current `movies` frame (pandas truncates long frames when rendering).
movies

Unnamed: 0,on,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date
0,61e55216d20382a072e3d5a8,438695,Sing 2,Buster and his new cast now have their sights ...,"[Comedy, Animation, Family, Music]",1,3,9205.508,2021-12-01
1,61e55216d20382a072e3d5aa,524434,Eternals,The Eternals are a team of ancient aliens who ...,"[Action, Adventure, Fantasy, Science Fiction]",0,0,11902.600,2021-11-03
2,61e55216d20382a072e3d5ae,425909,Ghostbusters: Afterlife,When a single mom and her two kids arrive in a...,"[Adventure, Comedy, Fantasy]",0,0,7497.179,2021-11-11
3,61e55216d20382a072e3d5b0,568124,Encanto,"The tale of an extraordinary family, the Madri...","[Comedy, Animation, Family, Fantasy]",0,0,6462.935,2021-11-24
4,61e55216d20382a072e3d5b2,585083,Hotel Transylvania: Transformania,"When Van Helsing's mysterious invention, the ""...","[Adventure, Comedy, Animation, Family, Fantasy]",0,0,3619.893,2022-01-13
...,...,...,...,...,...,...,...,...,...
10546,61e5bd1f7ca18299ae1bbf21,10397,Angela's Ashes,Based on the best selling autobiography by Iri...,[Drama],0,0,10.255,1999-12-25
10547,61e5bd1f7ca18299ae1bbf23,259954,5 to 7,A young writer begins an affair with an older ...,"[Comedy, Romance]",0,0,7.164,2014-04-19
10548,61e5bd1f7ca18299ae1bbf25,329819,Standing Tall,The film tells the story of Malony and his edu...,[Drama],0,0,6.540,2015-05-13
10549,61e5bd1f7ca18299ae1bbf27,10886,The Unknown Woman,"Irena, a Ukrainian woman coming to Italy looki...",[Thriller],0,0,8.734,2006-10-09
