In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')

In [11]:
# Load the movie catalogue and the user reviews from MongoDB into DataFrames.
# NOTE(review): if md.read_mongo is the read_mongo defined further down in this notebook,
# its third positional argument is `query` and is ignored for "movies" - `True` looks like a leftover.
movies = md.read_mongo("finalyearproject","movies",True)
ratings = md.read_mongo("finalyearproject","reviews")
movies = movies.rename(columns={"_id":"on"})
# NOTE(review): the two commented-out lines in this cell are the only place a movies/ratings
# merge is sketched; a later cell reads `movies_ratings`, which therefore does not exist.
# movie_ratings = pd.merge(movies,ratings,on="on")
# Release year as a string, NaN when the date is missing or unparseable.
# (`x != np.nan` is always True, so the old check turned bad dates into the string 'NaT'.)
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x.year) if pd.notna(x) else np.nan)
#movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})
movies


Unnamed: 0,on,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date,year
0,624d882287b246f81c48c8b1,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,[],21091,8.7,78.147,1994-09-23,1994
1,624d882287b246f81c48c8b3,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",[],3546,8.7,29.896,1995-10-20,1995
2,624d882287b246f81c48c8b5,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",[],15761,8.7,91.215,1972-03-14,1972
3,624d882287b246f81c48c8b7,652837,"Josee, the Tiger and the Fish","With dreams of diving abroad, Tsuneo gets a jo...",[],204,8.6,30.509,2020-12-25,2020
4,624d882287b246f81c48c8cf,533514,Violet Evergarden: The Movie,As the world moves on from the war and technol...,[],212,8.6,42.120,2020-09-18,2020
...,...,...,...,...,...,...,...,...,...,...
1464,624d88d361dd768818c9763a,141052,Justice League,Fuelled by his restored faith in humanity and ...,[],11249,6.1,80.543,2017-11-15,2017
1465,624d88d361dd768818c97642,373571,Godzilla: King of the Monsters,Follows the heroic efforts of the crypto-zoolo...,[],4740,6.7,88.155,2019-05-29,2019
1466,624d88d361dd768818c9763e,102899,Ant-Man,Armed with the astonishing ability to shrink i...,[],16897,7.1,86.827,2015-07-14,2015
1467,624d88d361dd768818c97644,44912,Green Lantern,"For centuries, a small but powerful force of w...",[],6130,5.2,86.645,2011-06-16,2011


In [12]:
# Inputs for the weighted rating computed by weighted_rating():
#   C - mean vote_average over every movie that has one
#   m - vote_count a movie must reach to be charted (95th percentile of all counts)
vote_counts = movies['vote_count'].dropna().astype('int')
# Averages stay floats: casting them to int truncated 8.7 to 8 and skewed both C and each score.
vote_averages = movies['vote_average'].dropna().astype('float')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)

# `vote_count >= m` is already False for missing counts, so only vote_average needs a null check.
# .copy() makes `qualified` an independent frame, so the assignments below do not write into a slice of `movies`.
qualified = movies.loc[
    (movies['vote_count'] >= m) & movies['vote_average'].notnull(),
    ['tmdb', 'title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'],
].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('float')

def weighted_rating(x, m=None, C=None):
    """Blend a movie's own average with the overall mean, weighted by its vote count.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is ``x['vote_count']`` and
    ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing ``'vote_count'`` and ``'vote_average'``.
    m : number, optional
        Vote-count weight given to the overall mean. Defaults to the notebook-level ``m``.
    C : number, optional
        Overall mean rating. Defaults to the notebook-level ``C``.

    Returns
    -------
    number
        The weighted rating for the row.
    """
    # Fall back to the notebook globals so `qualified.apply(weighted_rating, axis=1)` keeps working;
    # passing m/C explicitly removes the dependency on cell execution order.
    if m is None:
        m = globals()['m']
    if C is None:
        C = globals()['C']
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

# Score every qualifying movie with weighted_rating and order the chart best-first.
qualified = (
    qualified
    .assign(wr=qualified.apply(weighted_rating, axis=1))
    .sort_values(by='wr', ascending=False)
)



In [13]:
# One row per (movie, genre) pair: explode the per-movie genre lists, keeping the movie's index label.
# Vectorised replacement for building a pd.Series per row and stacking; dropna() discards the
# placeholder that explode() emits for movies with no genres, matching what stack() dropped before.
s = movies['genres'].explode().dropna()
s.name = 'genre'
# Left join on the index: a movie with several genres is repeated, one without keeps a NaN genre.
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85, source=None):
    """Rank the movies of one genre by weighted rating.

    Parameters
    ----------
    genre : str
        Value matched against the ``'genre'`` column.
    percentile : float, default 0.85
        Quantile of the genre's vote counts a movie must reach to be charted.
    source : pandas.DataFrame, optional
        Frame with one row per (movie, genre) pair and the columns ``title``, ``year``,
        ``vote_count``, ``vote_average``, ``popularity`` and ``genre``.
        Defaults to the notebook-level ``genre_movies``.

    Returns
    -------
    pandas.DataFrame
        At most 250 rows with columns ``title``, ``year``, ``vote_count``, ``vote_average``,
        ``popularity`` and ``wr``, sorted by ``wr`` descending.
    """
    if source is None:
        source = genre_movies
    df = source[source['genre'] == genre]

    vote_counts = df['vote_count'].dropna().astype('int')
    # Averages stay floats: casting to int truncated 8.7 to 8 and distorted both C and the scores.
    vote_averages = df['vote_average'].dropna().astype('float')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # `vote_count >= m` is False for missing counts, so only vote_average needs a null check.
    keep = (df['vote_count'] >= m) & df['vote_average'].notnull()
    # .copy() so the column assignments below do not write into a slice of `source`.
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('float')

    # Same formula as the row-wise version, evaluated on whole columns (also safe when no row qualifies).
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('wr', ascending=False).head(250)



In [14]:
# Content-based similarity: TF-IDF over the plot overviews (unigrams and bigrams, English stop words removed).
movies['overview'] = movies['overview'].fillna("")
# min_df=1 keeps every term, which is what the integer 0 meant on older scikit-learn;
# newer releases validate the parameter and reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
# TfidfVectorizer L2-normalises each row by default, so the plain dot product equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Positional lookup tables: row position in cosine_sim <-> movie title.
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [15]:
def get_recommendations(title, top_n=30):
    """Return the movies whose overview is most similar to that of ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the notebook-level ``indices`` Series.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, never including
        the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    match = indices[title]
    # Several movies can share a title, in which case the lookup yields a Series; use the first hit.
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort keeps the original row order among ties, like sorted(..., reverse=True) did.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie by position rather than assuming it sorts first: that assumption
    # fails when its overview is empty (all-zero similarity) or identical to another movie's.
    ranked = ranked[ranked != idx][:top_n]
    return movies.iloc[ranked]


In [7]:
# NOTE(review): `movies_ratings` is not assigned anywhere in the visible notebook (only a
# commented-out merge mentions it), which is why this cell raises NameError on a fresh kernel.
# User x movie matrix of ratings, used for item-item correlations.
user_movie_rating = pd.pivot_table(movies_ratings, index='userId', columns='movieTitle', values='rate')
# Per-title mean rating (column 'rate') and number of ratings (column 'rating_counts').
ratings_mean_count = movies_ratings.groupby('movieTitle')['rate'].agg(rate='mean', rating_counts='count')

def getCorelation(movieName, min_ratings=None, top_n=5):
    """Correlate one movie's user ratings with those of every other movie.

    Parameters
    ----------
    movieName : str
        Column of the notebook-level ``user_movie_rating`` matrix to compare against.
    min_ratings : number, optional
        When omitted, the raw correlation Series is returned (the historical behaviour).
        When given, only movies with more than this many ratings are kept and a ranked
        table is returned instead.
    top_n : int, default 5
        Number of rows in the ranked table; only used when ``min_ratings`` is given.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Pearson correlation per movie, or, with ``min_ratings``, a frame holding
        ``Correlation`` and ``rating_counts`` for the ``top_n`` best matches.

    Raises
    ------
    KeyError
        If ``movieName`` is not a column of ``user_movie_rating``.
    """
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected, method="pearson")
    if min_ratings is None:
        return movieCorrelation

    # The filtered and sorted table used to be computed and then thrown away; it is now returned on request.
    ranked = (
        movieCorrelation.to_frame('Correlation')
        .dropna()
        .join(ratings_mean_count['rating_counts'])
    )
    ranked = ranked[ranked['rating_counts'] > min_ratings]
    return ranked.sort_values('Correlation', ascending=False).head(top_n).reset_index()

NameError: name 'movies_ratings' is not defined

In [None]:
# Show the movies whose overviews are most similar to "Home Alone" (cell output is the result frame).
get_recommendations("Home Alone")

In [None]:
import pandas as pd
from pymongo import MongoClient

def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s?authSource=admin&retryWrites=true&ssl=true' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Read a MongoDB collection into a pandas DataFrame.

    Parameters
    ----------
    db : str
        Database name.
    collection : str
        Collection name. ``"movies"`` and ``"series"`` are read through an aggregation
        that joins the ``genres`` collection and projects a fixed set of fields;
        any other collection is read with ``find(query)``.
    query : dict, optional
        Filter for ``find``; ignored for ``"movies"`` / ``"series"``. Defaults to no filter.
    host, port, username, password
        Connection settings forwarded to ``_connect_mongo``.
    no_id : bool
        Accepted for compatibility but currently unused: the ``_id`` field is kept.

    Returns
    -------
    pandas.DataFrame
        One row per returned document.
    """
    # Connect to MongoDB (kept in its own name instead of rebinding the `db` argument).
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    if collection in ("movies", "series"):
        pipeline = [
            # Resolve genre_ids against genres.tmdb; the matches land in `genres`.
            {"$lookup":{"from": 'genres',"localField": 'genre_ids',"foreignField": 'tmdb',"as": 'genres'}},
            # Keep only the listed fields and flatten `genres` to the list of genre names.
            {"$project":{"_id":"$_id","tmdb":"$tmdb","title": "$title","overview": "$overview","genres":"$genres.name","vote_count":"$vote_count","vote_average":"$vote_average","popularity": "$popularity","release_date": "$release_date",}}
        ]
        cursor = database[collection].aggregate(pipeline)
    else:
        # `None` replaces the former mutable `{}` default argument.
        cursor = database[collection].find({} if query is None else query)

    # Expand the cursor and construct the DataFrame
    return pd.DataFrame(list(cursor))




In [None]:
def get_database(connection_string=None):
    """Connect to MongoDB and return the ``finalyearproject`` database.

    Parameters
    ----------
    connection_string : str, optional
        MongoDB URI. When omitted, the ``MONGODB_URI`` environment variable is used.

    Returns
    -------
    The ``client['finalyearproject']`` database handle.

    Raises
    ------
    RuntimeError
        If no connection string is supplied and ``MONGODB_URI`` is not set.
    """
    import os
    from pymongo import MongoClient

    # The URI (user name and password included) used to be hard-coded here. Secrets do not belong
    # in a notebook; the previously embedded credentials should be considered leaked and rotated.
    if connection_string is None:
        connection_string = os.environ.get("MONGODB_URI")
    if not connection_string:
        raise RuntimeError(
            "No MongoDB connection string: pass connection_string or set the MONGODB_URI environment variable"
        )

    client = MongoClient(connection_string)
    return client['finalyearproject']

In [None]:
# Smoke-test the connection helper; the returned database handle is shown as the cell output.
get_database()

In [None]:
# Keep a handle to the database for the ad-hoc collection queries in later cells.
db = get_database()

In [None]:
# Materialise every document of the "genres" collection (an unfiltered find) for inspection.
list(db["genres"].find())

In [16]:
# Show the movies whose overviews are most similar to "The Shawshank Redemption".
get_recommendations("The Shawshank Redemption")

Unnamed: 0,on,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date,year
157,624d883587b246f81c48cb31,992,Sherlock Jr.,"A film projectionist longs to be a detective, ...",[],655,8.2,16.268,1924-04-17,1924
686,624d886b87b246f81c48d0b7,525,The Blues Brothers,"Jake Blues, just released from prison, puts hi...",[],3331,7.7,26.253,1980-06-16,1980
721,624d886c87b246f81c48d123,993,Sleuth,A mystery novelist devises an insurance scam w...,[],440,7.7,10.78,1972-12-10,1972
679,624d886a87b246f81c48d091,903,Cool Hand Luke,When petty criminal Luke Jackson is sentenced ...,[],1049,7.8,17.653,1967-06-22,1967
838,624d889561dd768818c96f3a,811592,One Shot,"An elite squad of Navy SEALs, on a covert miss...",[],491,6.8,432.493,2021-11-05,2021
1240,624d88ca61dd768818c9747d,10193,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",[],12355,7.8,103.588,2010-06-16,2010
759,624d886e87b246f81c48d17f,5528,The Chorus,"Set in 1940s France, a new teacher at a school...",[],1954,7.7,16.875,2004-03-17,2004
119,624d882787b246f81c48ca9d,29259,Le Trou,Four prison inmates have been hatching a plan ...,[],308,8.2,14.584,1960-03-18,1960
251,624d883e87b246f81c48cc73,458220,Palmer,"After 12 years in prison, former high school f...",[],760,8.1,38.597,2021-01-29,2021
1403,624d88d161dd768818c975c4,136797,Need for Speed,The film revolves around a local street-racer ...,[],3555,6.3,84.161,2014-03-12,2014


In [None]:
# Inline, step-by-step version of the load -> TF-IDF -> similarity lookup done in earlier cells.
# NOTE(review): this duplicates earlier cells; prefer calling get_recommendations() instead.
movies = md.read_mongo("finalyearproject","movies",True)
# Release year as a string, NaN for missing/unparseable dates (`x != np.nan` was always True).
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x.year) if pd.notna(x) else np.nan)

movies['overview'] = movies['overview'].fillna("")
# min_df=1 keeps every term; newer scikit-learn releases reject an integer min_df of 0.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])
# NOTE(review): `title` is not assigned anywhere in the visible notebook - on a fresh kernel
# this line raises NameError; set it to the movie title to look up before running the cell.
idx = indices[title]
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Skip the first entry (assumed to be the movie itself) and keep the next four.
sim_scores = sim_scores[1:5]
movie_indices = [i[0] for i in sim_scores]