In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import sys

In [4]:
print(f'[{datetime.now().time()}] Loading data...')
md = pd.read_csv('./Data/movies_metadata.csv', low_memory=False)
cred = pd.read_csv('./Data/credits.csv', low_memory=False)
key = pd.read_csv('./Data/keywords.csv', low_memory=False)

[13:02:59.752742] Loading data...


In [5]:
print(f'[{datetime.now().time()}] Qualifying data...')
md = md[(md['original_language'] == 'en')]
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype(int)
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype(int)
C = vote_averages.mean()
m = vote_counts.quantile(0.75)

[13:03:07.667015] Qualifying data...


In [6]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [7]:
qual = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['id', 'title', 'year', 'release_date', 'vote_count', 'vote_average', 'popularity', 'genres', 'overview', 'budget', 'revenue']]
qual['vote_count'] = qual['vote_count'].astype(int)
qual['vote_average'] = qual['vote_average'].astype(int)
qual.shape
print('-'*45)
print(f'Movies are filtered to English language only, and then those with fewer than {m} total votes are excluded, resulting in {len(qual.index)} movies.')
print('-'*45)

---------------------------------------------
Movies are filtered to English language only, and then those with fewer than 43.0 total votes are excluded, resulting in 8083 movies.
---------------------------------------------


In [8]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
qual['wgt_rating'] = qual.apply(weighted_rating, axis=1)
qual = qual.sort_values('wgt_rating', ascending=False)

In [10]:
qual['id'] = qual['id'].astype(int)
cred['id'] = cred['id'].astype(int)
key['id'] = key['id'].astype(int)

df = qual.merge(cred,on='id')
df = df.merge(key,on='id')
df = df[~df['id'].duplicated(keep='first')]
df = df[(df['cast'].notnull()) & (df['crew'].notnull()) & (df['keywords'].notnull()) & (df['genres'].notnull())]

In [11]:
# Parse the stringified features into their corresponding python objects
print(f'[{datetime.now().time()}] Parsing features...')
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

[13:03:25.729121] Parsing features...


In [12]:
# Parse the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return ''

In [13]:
# Return the list top 3 elements or entire list; whichever is greater.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    # Return empty list in case of missing/malformed data
    return []

In [14]:
# Define new director, cast, genre and keyword features that are in a suitable form.
df['director'] = df['crew'].apply(get_director)
df['director'] = df['director'].apply(lambda x: [x,x,x])

features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [16]:
s = df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

In [17]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [18]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [19]:
df['keywords'] = df['keywords'].apply(filter_keywords)
df['keywords'] = df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])

In [20]:
# Convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

print(f'[{datetime.now().time()}] Cleaning data...')
# Apply clean_data function to features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

[13:05:14.911787] Cleaning data...


In [21]:
print(f'[{datetime.now().time()}] Creating metadata soup...')
df['soup'] = df['keywords'] + df['cast'] + df['director'] + df['genres']
df['soup'] = df['soup'].apply(lambda x: ' '.join(x))

[13:05:32.295577] Creating metadata soup...


In [22]:
# Use CountVectorizer() instead of TF-IDF because we do not want to down-weight the presence of an actor/director if he or she has acted or directed in relatively more movies.
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

print(f'[{datetime.now().time()}] Creating count matrix...')
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

[13:05:42.822550] Creating count matrix...


In [23]:
# Compute the cosine similarity matrix based on count_matrix
print(f'[{datetime.now().time()}] Computing cosine similarity matrix...')
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

[13:05:54.890143] Computing cosine similarity matrix...


In [25]:
from fuzzywuzzy import fuzz

# Checks if movie exists and returns movie id
def search_movies(title):
    query_idx = None
    ratio_tuples = []
    
    for idx, val in enumerate(df.title):
        ratio = fuzz.ratio(val.lower(), title.lower())
        if ratio >= 75:
            date = df.iloc[idx]['release_date']
            ratio_tuples.append((val, ratio, idx, date))
    
    ratio_tuples = sorted(ratio_tuples, key=lambda x: x[1], reverse=True)
    
    if len(ratio_tuples) > 0:
        print(f'Possible matches for "{title}": ')
        print()
        print('IDX || TITLE || RELEASE DATE || MATCH SCORE')
        print('-'*45)
        for x in ratio_tuples:
            print(f'{x[2]} || {x[0]} || {x[3]} ||  {x[1]}%')
        print('-'*45)
        return 'n'
    else:
        print(f'The movie {title} does not exist in the dataset, please try a different search.')
        print('-'*45)

In [26]:
# Takes in movie id as input and outputs most similar movies
def get_recommendations(idx, cosine_sim=cosine_sim):

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    #return df['title'].iloc[movie_indices]
    print(f'Top 10 recommended movies based on key features from {df.iloc[idx]["title"]}: ')
    print()
    print('TITLE || RELEASE DATE || SIMILARITY SCORE')
    print('-'*45)
    for i in sim_scores:
        val = int(i[1]*10)
        score = int(i[1]*100)
        date = df.iloc[idx]['release_date']
        print(f'{"*"*val} {df["title"].iloc[i[0]]} || {date} || {score}%')
    print('-'*45)

In [27]:
again = 'y'
while again == 'y':
    search = 'y'
    while search == 'y':
        print('-'*45)
        movie = input('Enter a movie name or search term to return a list of closest matches from the dataset: ') 
        print('-'*45)
        if search_movies(movie) == 'n':
            search = 'n'    
    cont = 'y'
    while cont == 'y':
        movie_idx = input('Please enter the idx value of one of the matching movies above to receive recommendations: ')
        print('-'*45)
        try:
            get_recommendations(int(movie_idx))
            cont = 'n'
        except ValueError:
            cont = input('The idx value entered is not valid. Try again? [y] or [n] ')
    again = input('Get a different movie recommendation? [y] or [n] ')

---------------------------------------------
Enter a movie name or search term to return a list of closest matches from the dataset: dumb and dumber
---------------------------------------------
Possible matches for "dumb and dumber": 

IDX || TITLE || RELEASE DATE || MATCH SCORE
---------------------------------------------
1990 || Dumb and Dumber || 1994-12-16 ||  100%
7360 || Dumb and Dumber To || 2014-11-12 ||  91%
---------------------------------------------
Please enter the idx value of one of the matching movies above to receive recommendations: 1990
---------------------------------------------
Top 10 recommended movies based on key features from Dumb and Dumber: 

TITLE || RELEASE DATE || SIMILARITY SCORE
---------------------------------------------
* Dumb and Dumber To || 1994-12-16 || 14%
 Yes Man || 1994-12-16 || 7%
 Because of Winn-Dixie || 1994-12-16 || 7%
 Once Bitten || 1994-12-16 || 7%
 101 Dalmatians || 1994-12-16 || 7%
 The Incredible Burt Wonderstone || 1994-12-1

In [29]:
id_map = pd.read_csv('./Data/links.csv')[['movieId', 'tmdbId']]
id_map.columns = ['movieId', 'id']
id_map.head()

Unnamed: 0,movieId,id
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [30]:
import pygsheets

sht_id = '18TobRWW8KKtEbiM_8JCPSdp28uLOmOTKLeXoL4w9nMI'
sht_name = 'Ratings 100k'

gc = pygsheets.authorize(service_file=r'C:\Users\WOODS4\Documents\DSBC\Project_3\MovieLens\drive_creds.json')
sht = gc.open_by_key(sht_id)
wks = sht.worksheet_by_title(sht_name)

ratings = wks.get_as_df()
ratings = ratings.iloc[:, :3]
ratings = ratings.merge(id_map, on='movieId')
ratings = ratings.dropna()
ratings['id'] = ratings['id'].astype(int)
print(ratings.shape)
ratings.head()

(99903, 4)


Unnamed: 0,userId,movieId,rating,id
0,672,109487,4.0,157336
1,15,109487,3.5,157336
2,40,109487,4.5,157336
3,42,109487,4.0,157336
4,48,109487,3.5,157336


In [32]:
rating = input('Would you like to enter a movie rating? [y] or [n] ')
if rating == 'y':
    user_id = input(f'Please enter your user id: ')
while rating == 'y':
    search = 'y'
    while search == 'y':
        print('-'*45)
        movie = input('Enter a movie name or search term to return a list of closest matches from the dataset: ') 
        print('-'*45)
        if search_movies(movie) == 'n':
            search = 'n'    
    cont = 'y'
    while cont == 'y':
        df_idx = input('Please enter the idx value of one of the matching movies above: ')
        movie_id = id_map.loc[id_map['id'] == df.iloc[int(df_idx)]['id']]['movieId'].values[0].item()
        movie_title = df.iloc[int(df_idx)]["title"]
        print('-'*45)
        movie_rating = input(f'Please enter a rating between 1-5 for {movie_title}: ')
        print('-'*45)
        try:
            wks.insert_rows(row=1, number=1)
            wks.update_value('A2',user_id)
            wks.update_value('B2',movie_id)
            wks.update_value('C2',float(movie_rating))
            ratings = wks.get_as_df()
            ratings = ratings.iloc[:, :3]
            cont = 'n'
        except ValueError:
            cont = input('The idx value entered is not valid. Try again? [y] or [n] ')
    rating = input('Enter another movie rating? [y] or [n] ')

Would you like to enter a movie rating? [y] or [n] n


In [33]:
print(ratings.shape)
ratings.head(10)

(100010, 3)


Unnamed: 0,userId,movieId,rating
0,672,231,5.0
1,672,109487,4.0
2,672,2959,5.0
3,672,7458,4.5
4,672,3578,5.0
5,672,79132,5.0
6,1,31,2.5
7,1,1029,3.0
8,1,1061,3.0
9,1,1129,2.0


In [34]:
from surprise import Reader, Dataset, SVD, evaluate

reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5), skip_lines=1)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [35]:
# Evaluate SVD algorithm based on RMSE
algo = SVD()
evaluate(algo, data, measures=['RMSE'])



Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.9006
------------
Fold 2
RMSE: 0.8916
------------
Fold 3
RMSE: 0.9071
------------
Fold 4
RMSE: 0.9016
------------
Fold 5
RMSE: 0.8903
------------
------------
Mean RMSE: 0.8982
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9006078711309423,
                             0.8915870465774142,
                             0.9070603664468666,
                             0.9015843147224897,
                             0.8903238606077665]})

In [36]:
# Train SVD algorithm on movie ratings data
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x29488a28b38>

In [37]:
# Predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [38]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [39]:
top_n = get_top_n(predictions, n=5)

In [40]:
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

299 [1193, 1217, 904, 899, 969]
73 [1247, 908, 912, 1250, 1219]
90 [969, 1276, 858, 4993, 750]
187 [898, 6016, 1217, 1283, 1394]
519 [994, 745, 2692, 1217, 69844]
463 [926, 1252, 68157, 1299, 1136]
553 [1217, 527, 912, 44665, 1233]
101 [905, 1221, 858, 2542, 48780]
15 [115122, 1960, 94466, 954, 82459]
563 [50, 858, 1228, 905, 318]
166 [318, 2571, 4226, 1252, 1060]
324 [3462, 1217, 969, 858, 898]
442 [1276, 969, 1228, 908, 968]
593 [2571, 318, 527, 3462, 1259]
328 [1252, 6016, 1299, 926, 7502]
380 [905, 923, 1213, 106782, 1203]
118 [912, 7153, 926, 318, 5952]
558 [1217, 318, 1252, 3462, 905]
433 [912, 1945, 1204, 1252, 953]
103 [296, 1945, 858, 608, 318]
525 [5952, 7153, 745, 527, 7502]
127 [899, 1276, 969, 3462, 1219]
34 [318, 3022, 3462, 1199, 6787]
341 [1203, 50, 858, 3462, 593]
652 [50, 858, 951, 1217, 318]
600 [608, 527, 318, 1228, 926]
199 [3435, 899, 1221, 994, 923]
220 [1276, 922, 2028, 1207, 7153]
245 [318, 58559, 88125, 7502, 475]
17 [741, 1219, 55820, 1060, 3167]
26 [969, 899

In [170]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [171]:
id_map = pd.read_csv('./Data/links.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(df[['title', 'id']], on='id').set_index('title')

In [172]:
indices_map = id_map.set_index('id')

In [173]:
def hybrid(userId, title):
    indices = pd.Series(df.index, index=df['title'])
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)