In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
import mongodb as md

import warnings; warnings.simplefilter('ignore')

In [8]:
movies = md.read_mongo("finalyearproject","series",False)
ratings = md.read_mongo("finalyearproject","reviews")
movies = movies.rename(columns={"_id":"on"})
# movie_ratings = pd.merge(movies,ratings,on="on")
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
#movies_ratings = movie_ratings.rename(columns={"title_x":"movieTitle","title_y":"rateTitle"})
movies


KeyError: 'release_date'

In [16]:
vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()

m = vote_counts.quantile(0.95)


qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())][['tmdb','title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

qualified['wr'] = qualified.apply(weighted_rating, axis=1)

qualified = qualified.sort_values('wr', ascending=False)



In [4]:
s = movies.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
genre_movies = movies.drop('genres', axis=1).join(s)

def build_chart(genre, percentile=0.85):
    df = genre_movies[genre_movies['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified



In [4]:
movies['overview'] = movies['overview'].fillna("")
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
moviesSIM = movies.reset_index()
titles = moviesSIM['title']
indices = pd.Series(moviesSIM.index, index=moviesSIM['title'])

In [11]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]


In [14]:
user_movie_rating = movies_ratings.pivot_table(index='userId', columns='movieTitle', values='rate')
ratings_mean_count = pd.DataFrame(movies_ratings.groupby('movieTitle')['rate'].mean())
ratings_mean_count['rating_counts'] = pd.DataFrame(movies_ratings.groupby('movieTitle')['rate'].count())

def getCorelation(movieName):
    movieSelected = user_movie_rating[movieName]
    movieCorrelation = user_movie_rating.corrwith(movieSelected,method="pearson")
    df_movieCorrelation = pd.DataFrame(movieCorrelation, columns=['Correlation'])
    df_movieCorrelation.dropna(inplace=True)
    df_movieCorrelation = df_movieCorrelation.join(ratings_mean_count['rating_counts'])
    df_movieCorrelation[df_movieCorrelation ['rating_counts']>50].sort_values('Correlation', ascending=False).head()
    df_movieCorrelation = df_movieCorrelation.reset_index()
    return movieCorrelation

In [12]:
get_recommendations("Home Alone")

Unnamed: 0,on,tmdb,title,overview,genres,vote_count,vote_average,popularity,release_date,year
1848,61e5528f63580ca2ccf39a63,12536,Home Alone 4,Kevin McCallister's parents have split up. Now...,"[Comedy, Family]",0,0,55.278,2002-11-03,2002
2295,61e5529c63580ca2ccf39fa1,1813,The Devil's Advocate,Aspiring Florida defense lawyer Kevin Lomax ac...,"[Drama, Mystery, Horror, Thriller]",0,0,40.963,1997-10-17,1997
853,61e5527763580ca2ccf38ebd,772,Home Alone 2: Lost in New York,"Instead of flying to Florida with his folks, K...","[Adventure, Comedy, Family]",0,0,86.752,1992-11-19,1992
8632,61e5534663580ca2ccf3e9e7,492141,Snowed Inn Christmas,Jenna Hudson and Kevin Jenner are polar opposi...,"[Comedy, Romance, TV Movie]",0,0,10.486,2017-12-12,2017
780,61e5527663580ca2ccf38def,381288,Split,Though Kevin has evidenced 23 personalities to...,"[Horror, Thriller]",0,0,100.812,2017-01-19,2017
814,61e5527663580ca2ccf38e47,12153,White Chicks,"Two FBI agent brothers, Marcus and Kevin Copel...","[Crime, Comedy]",0,0,99.899,2004-06-23,2004
3752,61e552c563580ca2ccf3b0b7,4379,Monster-in-Law,Office temp Charlotte Cantilini thinks she's f...,"[Comedy, Romance]",0,0,19.91,2005-05-13,2005
5683,61e552f763580ca2ccf3c767,9656,Black Christmas,An escaped maniac returns to his childhood hom...,"[Mystery, Horror, Thriller]",0,0,20.054,2006-12-15,2006
10918,62040bd82cbdc478c6c7a3cb,317091,November Criminals,When Addison investigates the murder of his fr...,"[Crime, Drama, Mystery, Thriller]",0,0,15.791,2017-11-07,2017
9127,61e5535863580ca2ccf3efb9,36819,Time Bandits,Young history buff Kevin can scarcely believe ...,"[Adventure, Comedy, Family, Fantasy, Science F...",0,0,17.412,1981-07-13,1981


In [13]:
import pandas as pd
from pymongo import MongoClient

def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s?authSource=admin&retryWrites=true&ssl=true' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """


    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    if (collection == "movies" or collection == "series"):
        pipeline = [
            {"$lookup":{"from": 'genres',"localField": 'genre_ids',"foreignField": 'tmdb',"as": 'genres'}},
            {"$project":{"_id":"$_id","tmdb":"$tmdb","title": "$title","overview": "$overview","genres":"$genres.name","vote_count":"$vote_count","vote_average":"$vote_average","popularity": "$popularity","release_date": "$release_date",}}
        ]
        cursor = db[collection].aggregate(pipeline)
    else:
        cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    return df




In [28]:
def get_database():
    from pymongo import MongoClient
    import pymongo

    # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = "mongodb://fyp:fyp_admin@cluster0-shard-00-02.p0vx1.mongodb.net:27017/?ssl=true&replicaSet=atlas-iixcb4-shard-0&authSource=admin&retryWrites=true&w=majority"

    # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
    from pymongo import MongoClient
    client = MongoClient(CONNECTION_STRING)

    # Create the database for our example (we will use the same database throughout the tutorial
    return client['finalyearproject']

In [29]:
get_database()

Database(MongoClient(host=['cluster0-shard-00-02.p0vx1.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, replicaset='atlas-iixcb4-shard-0', authsource='admin', retrywrites=True, w='majority', tls=True), 'finalyearproject')

In [30]:
db = get_database()

In [31]:
list(db["genres"].find())

ServerSelectionTimeoutError: cluster0-shard-00-02.p0vx1.mongodb.net:27017: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129), Timeout: 30s, Topology Description: <TopologyDescription id: 621146b67f053579de33c662, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-02.p0vx1.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('cluster0-shard-00-02.p0vx1.mongodb.net:27017: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')>]>