<a href="https://colab.research.google.com/github/MSSfusiqi/Movie-Recommender-system/blob/master/movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preloading

## Link Google Drive

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install Packages

In [0]:
! pip uninstall -y tensorflow
! pip install -q -U tensorflow-gpu==1.15.0

Uninstalling tensorflow-1.15.0:
  Successfully uninstalled tensorflow-1.15.0
[K     |████████████████████████████████| 377.0MB 46kB/s 
[K     |████████████████████████████████| 3.2MB 24.9MB/s 
[K     |████████████████████████████████| 491kB 50.7MB/s 
[?25h

## Import Packages

In [0]:
import numpy as np
import pandas as pd
from ast import literal_eval
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split


# Import Keras libraries
import tensorflow as tf
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import Embedding, Reshape, dot, Input
import keras

Using TensorFlow backend.


## Assign GPU 

In [0]:
tf.config.experimental.list_physical_devices('GPU') 

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [0]:
import os
from keras import backend as K

tf.config.experimental.list_physical_devices('GPU') 
# Assign GPU
os.environ['CUDA_VISIBLE_DEVICES'] ='0'
#CONFIG MEMORY USAGE
config = tf.ConfigProto() #device_count = {'GPU':1} 
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
K.set_session(session)

# Exploration on small dataset

In [0]:
tmdb_5000_movies = pd.read_csv('/content/drive/My Drive/tmdb_5000_movies.csv.zip')
tmdb_5000_credits = pd.read_csv('/content/drive/My Drive/tmdb_5000_credits.csv.zip')

# Parse the stringified features into their corresponding python objects
f_clean_movies = ['genres','production_companies','production_countries','spoken_languages','keywords']
f_clean_credits = ['cast','crew']
for f in f_clean_movies:
    tmdb_5000_movies[f] = tmdb_5000_movies[f].apply(literal_eval)
for f in f_clean_credits:
    tmdb_5000_credits[f] = tmdb_5000_credits[f].apply(literal_eval)

In [0]:
genome_tags = pd.read_csv('/content/drive/My Drive/genome-tags.csv')
genome_scores = pd.read_csv('/content/drive/My Drive/genome-scores.csv')
links = pd.read_csv('/content/drive/My Drive/links.csv')
movies = pd.read_csv('/content/drive/My Drive/movies.csv')
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')
tags = pd.read_csv('/content/drive/My Drive/tags.csv')

In [0]:
movies_metadata.head()

In [0]:
movies_metadata = pd.read_csv('/content/drive/My Drive/movies_metadata.csv.zip')
keywords = pd.read_csv('/content/drive/My Drive/keywords.csv.zip')
credits = pd.read_csv('/content/drive/My Drive/credits.csv.zip')

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [0]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [0]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


## Dataframe Preparation

In [0]:
#merge two df
tmdb_5000 = pd.merge(tmdb_5000_movies,tmdb_5000_credits,left_on='id',right_on='movie_id',suffixes=('', '_y'))
#drop features we don't need 
to_drop = ['budget','homepage','original_language','original_title','status','title_y','id']
tmdb_5000.drop(to_drop,axis=1, inplace=True)

## Feature Engineering

### Functions

In [0]:
def get_top_n(x,n):
    """Returns the list top n elements or entire list; whichever is more.
    x: dataframe series
    n: top_n
    return: updated dataframe series
    """
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than n elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > n:
            names = names[:n]
        return names

    #Return empty list in case of missing/malformed data
    return []

# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [0]:
# top three for certain features
features_3 = ['cast', 'genres','production_companies','production_countries','spoken_languages']

for feature in features_3:
    tmdb_5000[feature] = tmdb_5000[feature].apply(get_top_n,args = (3,))

# top five for certain features
features_5 = ['keywords']
for feature in features_5:
    tmdb_5000[feature] = tmdb_5000[feature].apply(get_top_n,args = (5,))

In [0]:
tmdb_5000['director'] = tmdb_5000['crew'].apply(get_director)
tmdb_5000.drop('crew',axis = 1,inplace = True)

In [0]:
# Apply clean_data function to features.
features = ['cast', 'genres','production_companies','production_countries','spoken_languages','keywords','director']

for feature in features:
    tmdb_5000[feature] =tmdb_5000[feature].apply(clean_data)

In [0]:
# keep only release year for each movie
tmdb_5000['release_date'] = pd.to_datetime(tmdb_5000['release_date'])
tmdb_5000['release_year'] = tmdb_5000['release_date'].dt.year
tmdb_5000['release_year'] = tmdb_5000['release_year'].astype('Int64')
tmdb_5000.drop('release_date',axis = 1, inplace = True)

In [0]:
tmdb_5000.head()

Unnamed: 0,genres,keywords,overview,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,movie_id,cast,director,release_year
0,"[action, adventure, fantasy]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...",150.437577,"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[unitedstatesofamerica, unitedkingdom]",2787965087,162.0,"[english, español]",Enter the World of Pandora.,Avatar,7.2,11800,19995,"[samworthington, zoesaldana, sigourneyweaver]",jamescameron,2009
1,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","Captain Barbossa, long believed to be dead, ha...",139.082615,"[waltdisneypictures, jerrybruckheimerfilms, se...",[unitedstatesofamerica],961000000,169.0,[english],"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[johnnydepp, orlandobloom, keiraknightley]",goreverbinski,2007
2,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6]",A cryptic message from Bond’s past sends him o...,107.376788,"[columbiapictures, danjaq, b24]","[unitedkingdom, unitedstatesofamerica]",880674609,148.0,"[français, english, español]",A Plan No One Escapes,Spectre,6.3,4466,206647,"[danielcraig, christophwaltz, léaseydoux]",sammendes,2015
3,"[action, crime, drama]","[dccomics, crimefighter, terrorist, secretiden...",Following the death of District Attorney Harve...,112.31295,"[legendarypictures, warnerbros., dcentertainment]",[unitedstatesofamerica],1084939099,165.0,[english],The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[christianbale, michaelcaine, garyoldman]",christophernolan,2012
4,"[action, adventure, sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...","John Carter is a war-weary, former military ca...",43.926995,[waltdisneypictures],[unitedstatesofamerica],284139100,132.0,[english],"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[taylorkitsch, lynncollins, samanthamorton]",andrewstanton,2012


## Content-based Recommendation

In [0]:
def metadata(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) +' '+ ' '.join('production_companies') +' '+ ' '.join('production_countries')+' '+' '.join('spoken_languages') 

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim,df):
    # Get the index of the movie that matches the title
    #Construct a reverse map of indices and movie titles
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    idx = indices[title]

    idx_mod = np.mod(idx,10000)
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx_mod]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [0]:
tmdb_5000['metadata'] = tmdb_5000.apply(metadata, axis=1)

In [0]:
tmdb_5000['metadata']

0       cultureclash future spacewar spacecolony socie...
1       ocean drugabuse exoticisland eastindiatradingc...
2       spy basedonnovel secretagent sequel mi6 daniel...
3       dccomics crimefighter terrorist secretidentity...
4       basedonnovel mars medallion spacetravel prince...
                              ...                        
4798    unitedstates–mexicobarrier legs arms paperknif...
4799     edwardburns kerrybishé marshadietlein edwardb...
4800    date loveatfirstsight narration investigation ...
4801     danielhenney elizacoupe billpaxton danielhsia...
4802    obsession camcorder crush dreamgirl drewbarrym...
Name: metadata, Length: 4803, dtype: object

In [0]:
#create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(tmdb_5000['metadata'])

In [0]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [0]:
tmdb_5000[tmdb_5000['title'] =='Sherlock Holmes']

Unnamed: 0,genres,keywords,overview,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,movie_id,cast,director,release_year,metadata
356,"[action, adventure, crime]","[detective, scotlandyard, coffin, blackmagic, ...","Eccentric consulting detective, Sherlock Holme...",57.834787,"[villageroadshowpictures, silverpictures, warn...","[germany, unitedkingdom, unitedstatesofamerica]",524028679,128.0,"[english, français]",Nothing escapes him.,Sherlock Holmes,7.0,5766,10528,"[robertdowneyjr., judelaw, rachelmcadams]",guyritchie,2009,detective scotlandyard coffin blackmagic arres...


In [0]:
get_recommendations('Sherlock Holmes',cosine_sim2,tmdb_5000)

205     Sherlock Holmes: A Game of Shadows
4638              Amidst the Devil's Wings
2156                            Nancy Drew
2625                   Kiss Kiss Bang Bang
2360                               Getaway
4068                             Sharkskin
4118                Hum To Mohabbat Karega
4314                             Crowsnest
4458                   Harrison Montgomery
4504               Light from the Darkroom
Name: title, dtype: object

# Large movie dataset

## Dataset Preparation

In [0]:
#customize literal_eval function
def literal_return(val):
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError) as e:
        return val

In [0]:
movies_metadata = pd.read_csv('/content/drive/My Drive/movies_metadata.csv.zip')
keywords = pd.read_csv('/content/drive/My Drive/keywords.csv.zip')
credits = pd.read_csv('/content/drive/My Drive/credits.csv.zip')

f_clean_movies_meta = ['genres','production_companies','production_countries','spoken_languages']
f_clean_credits = ['cast','crew']

for f in f_clean_movies_meta:
    movies_metadata[f] = movies_metadata[f].apply(literal_return)

for f in f_clean_credits:
    credits[f] = credits[f].apply(literal_return)

keywords['keywords'] = keywords['keywords'].apply(literal_return)

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# top three for certain features
features_3 = ['genres','production_companies','production_countries','spoken_languages']
for feature in features_3:
    movies_metadata[feature] = movies_metadata[feature].apply(get_top_n,args = (3,))
movies_metadata['id'] = movies_metadata['id'].apply(literal_return)

credits['cast'] = credits['cast'].apply(get_top_n,args = (3,))
credits['director'] = credits['crew'].apply(get_director)
credits.drop('crew',axis = 1,inplace = True)

# top five for certain features
keywords['keywords'] = keywords['keywords'].apply(get_top_n,args = (5,))

In [0]:
#merge dataframes
movie = pd.merge(movies_metadata,credits, on='id',suffixes= ('','_y'))
movie = pd.merge(movie, keywords, on ='id',suffixes= ('','_y'))

#drop features we don't need 
to_drop = ['belongs_to_collection','budget','homepage','original_language','original_title','poster_path','status']
movie.drop(to_drop,axis=1, inplace=True)

In [0]:
# Apply clean_data function to features.
features = ['cast', 'genres','production_companies','production_countries','spoken_languages','director']

for feature in features:
    movie[feature] =movie[feature].apply(clean_data)

In [0]:
# keep only release year for each movie
movie['release_date'] = pd.to_datetime(movie['release_date'])
movie['release_year'] = movie['release_date'].dt.year
movie['release_year'] = movie['release_year'].astype('Int64')
movie.drop('release_date',axis = 1, inplace = True)

In [0]:
movie.head()

Unnamed: 0,adult,genres,id,imdb_id,overview,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,tagline,title,video,vote_average,vote_count,cast,director,keywords,release_year
0,False,"[animation, comedy, family]",862,tt0114709,"Led by Woody, Andy's toys live happily in his ...",21.9469,[pixaranimationstudios],[unitedstatesofamerica],373554033.0,81.0,[english],,Toy Story,False,7.7,5415.0,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy, friendship, friends]",1995
1,False,"[adventure, fantasy, family]",8844,tt0113497,When siblings Judy and Peter discover an encha...,17.0155,"[tristarpictures, teitlerfilm, interscopecommu...",[unitedstatesofamerica],262797249.0,104.0,"[english, français]",Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[board game, disappearance, based on children'...",1995
2,False,"[romance, comedy]",15602,tt0113228,A family wedding reignites the ancient feud be...,11.7129,"[warnerbros., lancastergate]",[unitedstatesofamerica],0.0,101.0,[english],Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, best friend, duringcreditsstinger, o...",1995
3,False,"[comedy, drama, romance]",31357,tt0114885,"Cheated on, mistreated and stepped on, the wom...",3.85949,[twentiethcenturyfoxfilmcorporation],[unitedstatesofamerica],81452156.0,127.0,[english],Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,"[based on novel, interracial relationship, sin...",1995
4,False,[comedy],11862,tt0113041,Just when George Banks has recovered from his ...,8.38752,"[sandollarproductions, touchstonepictures]",[unitedstatesofamerica],76578911.0,106.0,[english],Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[stevemartin, dianekeaton, martinshort]",charlesshyer,"[baby, midlife crisis, confidence, aging, daug...",1995


In [0]:
movie['metadata'] = movie.apply(metadata, axis=1)

In [0]:
movie.to_csv('/content/drive/My Drive/movie_dataset.csv.gz',index=False,compression='gzip')

NameError: ignored

## Content-based Recommendation

### Cosine similarity

In [0]:
def metadata(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) +' '+ ' '.join('production_companies') +' '+ ' '.join('production_countries')+' '+' '.join('spoken_languages') 

In [0]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title,df):
    # Get the index of the movie that matches the title
    #Construct a reverse map of indices and movie titles
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    idx = indices[title]
    
    idx_mod = idx % 10000
    idx_div = idx // 10000
    
    #create the count matrix
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(df['metadata'])
    cosine_sim = cosine_similarity(count_matrix[idx_div*10000:idx_div*10000+10000],count_matrix)
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx_mod]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [0]:
movie = pd.read_csv('/content/drive/My Drive/movie_dataset.csv.gz')

In [0]:
movie.shape

(46628, 21)

In [0]:
movie[movie['title'] == 'Sense and Sensibility']

Unnamed: 0,adult,genres,id,imdb_id,overview,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,tagline,title,video,vote_average,vote_count,cast,director,keywords,release_year,metadata
16,False,"['drama', 'romance']",4584,tt0114388,"Rich Mr. Dashwood dies, leaving his second wif...",10.673167,"['columbiapicturescorporation', 'mirageenterpr...","['unitedkingdom', 'unitedstatesofamerica']",135000000.0,136.0,['english'],Lose your heart and come to your senses.,Sense and Sensibility,False,7.2,364.0,"['katewinslet', 'emmathompson', 'hughgrant']",anglee,"['bowling', 'based on novel', 'servant', 'coun...",1995.0,bowling based on novel servant country life ja...
28580,False,['drama'],315010,tt0847150,This is the acclaimed 2008 BBC adaptation of t...,1.724436,"['bbcworldwide', 'wgbhboston']",['unitedkingdom'],0.0,174.0,['english'],,Sense and Sensibility,False,7.9,19.0,"['hattiemorahan', 'charitywakefield', 'davidmo...",johnalexander,[],2008.0,hattiemorahan charitywakefield davidmorrissey...
41989,False,"['romance', 'drama']",391934,tt0089991,Two sisters of opposing temperaments find love...,0.416171,['britishbroadcastingcorporation(bbc)'],['unitedkingdom'],0.0,174.0,['english'],,Sense and Sensibility,False,0.0,0.0,"['irenerichards', 'traceychilds', 'annieleon']",rodneybennett,"['england', 'jane austen', 'love', 'heartbreak']",1981.0,england jane austen love heartbreak irenericha...
42208,False,[],243987,tt0254768,Adaptation of the Jane Austen novel.,0.010759,[],[],0.0,178.0,[],,Sense and Sensibility,False,0.0,0.0,"['joannadavid', 'ciaranmadden']",davidgiles,[],1971.0,joannadavid ciaranmadden davidgiles p r o d ...


In [0]:
get_recommendations('Avatar',movie)

26774                                             Avatar 2
4723                                             SpaceCamp
23948                              Guardians of the Galaxy
42655                         Rogue One: A Star Wars Story
7520                             Babylon 5: A Call to Arms
45385                                       Crash of Moons
8725                    Journey to the Far Side of the Sun
23939    Gunbuster vs Diebuster Aim for the Top! The GA...
26694                                             Stranded
13348                   War of the Worlds 2: The Next Wave
Name: title, dtype: object

## User-Item Collabarative Filtering

In [0]:
#genome_tags = pd.read_csv('/content/drive/My Drive/genome-tags.csv')
#genome_scores = pd.read_csv('/content/drive/My Drive/genome-scores.csv')
#links = pd.read_csv('/content/drive/My Drive/links.csv')
#movies = pd.read_csv('/content/drive/My Drive/movies.csv')
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')
#tags = pd.read_csv('/content/drive/My Drive/tags.csv')

In [0]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [0]:
user_index = pd.unique(ratings['userId'])
user_index = pd.DataFrame(user_index, columns=['userId'])
user_index['user_index'] = user_index.index
user_index.head()

Unnamed: 0,userId,user_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [0]:
movie_index = pd.unique(ratings['movieId'])
movie_index = pd.DataFrame(movie_index,columns = ['movieId'])
movie_index['movie_index'] = movie_index.index
movie_index.head()

Unnamed: 0,movieId,movie_index
0,307,0
1,481,1
2,1091,2
3,1257,3
4,1449,4


In [0]:
data = ratings.merge(user_index, on='userId')
data = data.merge(movie_index, on='movieId')

Mean = ratings.groupby(by="userId",as_index=False)['rating'].mean()
data = pd.merge(data,Mean,on='userId')
data['adg_rating']=data['rating_x']-data['rating_y']
data.head()

Unnamed: 0,userId,movieId,rating_x,timestamp,user_index,movie_index,rating_y,adg_rating
0,1,307,3.5,1256677221,0,0,3.3125,0.1875
1,1,481,3.5,1256677456,0,1,3.3125,0.1875
2,1,1091,1.5,1256677471,0,2,3.3125,-1.8125
3,1,1257,4.5,1256677460,0,3,3.3125,1.1875
4,1,1449,4.5,1256677264,0,4,3.3125,1.1875


In [0]:
utility_csr = csr_matrix((data.adg_rating, (data.user_index.values, data.movie_index.values)))

In [0]:
def get_jaccard(A,B):
  if A.shape[1] != B.shape[1]:
    return print("error, check dimension")
  else:
    upper = sum(np.minimum(A, B))
    lower = sum(np.maximum(A, B))
    return upper/lower

In [0]:
def get_top_J_similarity(user_idx, n, m_csr):
  
  length = m_csr.shape[0]
  output = np.zeros(length)
  user_np = m_csr[user_idx,:]

  for i in range(length):
    output[i] = jaccard_score(user_np, m_csr[i,:])
  
  #return index of user with top jaccard simialrity
  return output.argsort()[-n:][::-1]
    

  
  




In [0]:
get_top_J_similarity(1, 10, utility_csr)

TypeError: ignored

In [0]:
#try this function, works in my notebook
def get_top_J_similarity(user_idx, n, m_csr):
    length = m_csr.shape[0]
    output = np.zeros(length)
    
    
   
    user_np = utility_csr.getrow(user_idx).toarray().sum(axis=0)
  

    for i in range(length):
        output[i] = get_jaccard(user_np, utility_csr.getrow(i).toarray().sum(axis=0))
  
  #return index of user with top jaccard simialrity
    return output.argsort()[-n:][::-1]
    

### Cosine similarity

In [0]:
utility_csr.shape[0]

NameError: ignored

In [0]:
# Function that takes in movie title as input and outputs most similar movies
def get_similar_users(user_id,df):
    #Construct the csr matrix 
    utility_csr = csr_matrix((df.adg_rating, (df.user_index.values, df.movie_index.values)))
    print(utility_csr.shape[0])
    j_score = []
    #for i in range(283228):
    score = jaccard_score(utility_csr[user_id,:],utility_csr[2,:])
    print(score)
    # Get the pairwsie similarity scores of all users with that user
    sim_scores = list(enumerate(jaccard_score))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the similar user indices
    user_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar user_index
    return user_indices

In [0]:
def get_user_similar_movies( user1, user2,df ):
    movies = pd.read_csv('/content/drive/My Drive/movies.csv')
    common_movies = df[df.user_index == user1].merge(
    df[df.user_index == user2],on = "movieId" )
    common_movies = common_movies.merge(movies,on = 'movieId')

    return common_movies.loc[ : , ['rating_x_x','rating_x_y','title']]


In [0]:
get_similar_users(370,data)

283228


TypeError: ignored

In [0]:
get_user_similar_movies(370,33035,data)

Unnamed: 0,rating_x_x,rating_x_y,title
0,1.0,4.0,"Matrix, The (1999)"
1,5.0,5.0,Coco (2017)


## Deep Learning Model

In [0]:
def CFModel(input_p,input_q,n_users, m_items, k_factors):
    # P is the embedding layer that creates an User by latent factors matrix.
    # If the intput is a user_id, P returns the latent factor vector for that user.
    P = Embedding(n_users, k_factors, input_length=1)(input_p)
    P = Reshape((k_factors,))(P)

    # Q is the embedding layer that creates a Movie by latent factors matrix.
    # If the input is a movie_id, Q returns the latent factor vector for that movie.
    Q = Embedding(m_items, k_factors, input_length=1)(input_q)
    Q = Reshape((k_factors,))(Q)
    
    # The Merge layer takes the dot product of user and movie latent factor vectors to return the corresponding rating.   
    output  = dot([P, Q], axes =1)
    return output

In [0]:
# Reading ratings file
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')
max_userid = ratings['userId'].drop_duplicates().max()
max_movieid = ratings['movieId'].drop_duplicates().max()

In [0]:
ratings_downsample = ratings[ratings['userId'] % 80 ==0]

In [0]:
# Create training set
shuffled_ratings = ratings_downsample.sample(frac=1., random_state=1002)

# Shuffling users
Users = shuffled_ratings['userId'].values
print ('Users:', Users, ', shape =', Users.shape)

# Shuffling movies
Movies = shuffled_ratings['movieId'].values
print ('Movies:', Movies, ', shape =', Movies.shape)

# Shuffling ratings
Ratings = shuffled_ratings['rating'].values
print ('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [117680  22560 228320 ...  52480 265920 274800] , shape = (364602,)
Movies: [114662   3543   4718 ...  96079  45503     16] , shape = (364602,)
Ratings: [3.5 3.  3.  ... 3.  3.5 4. ] , shape = (364602,)


In [0]:
# Define constants
K_FACTORS = 100 # The number of dimensional embeddings for movies and users
TEST_USER = 6400 # A random test user (user_id = 1500)

In [0]:
# Define model
input_p = Input(shape = (1,)) 
input_q = Input(shape = (1,)) 
output = CFModel(input_p,input_q,max_userid, max_movieid, K_FACTORS)
model = keras.Model([input_p,input_q],output)
# Compile the model using MSE as the loss function and the AdaMax learning algorithm
model.compile(loss='mse', optimizer='adamax')







In [0]:
# Callbacks monitor the validation loss
# Save the model weights each time the validation loss has improved
callbacks = [EarlyStopping('val_loss', patience=5), 
             ModelCheckpoint('/content/drive/My Drive/weights.h5', save_best_only=True)]

# Use 30 epochs, 90% training data, 10% validation data 
history = model.fit([Users, Movies], Ratings, epochs=30, validation_split=.2, verbose=2, callbacks=callbacks)



Train on 291681 samples, validate on 72921 samples
Epoch 1/30




 - 397s - loss: 12.4773 - val_loss: 9.1565
Epoch 2/30
 - 396s - loss: 5.9907 - val_loss: 4.1702
Epoch 3/30
 - 396s - loss: 3.2491 - val_loss: 2.7741
Epoch 4/30
 - 397s - loss: 2.3030 - val_loss: 2.1740
Epoch 5/30
 - 396s - loss: 1.8415 - val_loss: 1.8475
Epoch 6/30
 - 396s - loss: 1.5699 - val_loss: 1.6420
Epoch 7/30
 - 396s - loss: 1.3877 - val_loss: 1.5018
Epoch 8/30
 - 396s - loss: 1.2558 - val_loss: 1.4008
Epoch 9/30
 - 396s - loss: 1.1547 - val_loss: 1.3254
Epoch 10/30
 - 396s - loss: 1.0744 - val_loss: 1.2672
Epoch 11/30
 - 395s - loss: 1.0079 - val_loss: 1.2218
Epoch 12/30
 - 396s - loss: 0.9514 - val_loss: 1.1866
Epoch 13/30
 - 395s - loss: 0.9020 - val_loss: 1.1580
Epoch 14/30
 - 396s - loss: 0.8579 - val_loss: 1.1339
Epoch 15/30
 - 396s - loss: 0.8179 - val_loss: 1.1130
Epoch 16/30
 - 396s - loss: 0.7804 - val_loss: 1.0966
Epoch 17/30
 - 396s - loss: 0.7462 - val_loss: 1.0822
Epoch 18/30
 - 396s - loss: 0.713

In [0]:
# The rate function to predict user's rating of unrated items
def rate(user_id, item_id):
    return .predict([np.array([user_id]), np.array([item_id])])[0][0]


In [0]:
model.predict([[1500],[362]])

array([[2.5149899]], dtype=float32)

In [0]:
user_ratings = ratings[ratings['userId'] == TEST_USER][['userId', 'movieId', 'rating']]
user_ratings['predicted_rating'] = user_ratings.apply(lambda x: model.predict([[TEST_USER], [x['movieId']]]).flatten().flatten(), axis=1)

In [0]:
user_ratings

Unnamed: 0,userId,movieId,rating,predicted_rating
151415,1500,60,0.5,[2.157028]
151416,1500,107,3.0,[2.4973726]
151417,1500,303,3.5,[2.3923533]
151418,1500,362,3.0,[2.5149899]
151419,1500,637,3.5,[2.09068]
151420,1500,1037,0.5,[2.0113392]
151421,1500,2826,4.0,[2.4777238]
151422,1500,2872,0.5,[2.354858]
151423,1500,3107,2.0,[2.3406978]
151424,1500,3174,5.0,[2.5547493]


## SVD





In [0]:
import pandas as pd
import numpy as np
# Reading ratings file
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating', 'timestamp'])

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movie_dataset.csv')
movies = movies.rename(columns={"id": "movie_id", "genres": "genres","title":"title"})
movies = movies[['movie_id','genres','title']]

In [0]:
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]

In [0]:
Ratings = ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)

In [0]:
R = Ratings.as_matrix()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [0]:
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)

In [0]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 50)

In [0]:
sigma = np.diag(sigma)

In [0]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [0]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movie_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,0.512867,-0.089172,0.310181,-0.002005,-0.052401,-0.189827,0.23836,0.006466,-0.099315,-0.069682,-0.321492,0.111577,0.034795,0.320576,-0.118217,-0.012647,0.065573,-0.098318,0.064081,-0.005914,0.091936,0.180563,-0.009566,2.641693,-0.012495,0.765179,0.019784,0.002917,0.053079,0.014856,...,0.01881,-0.018782,0.022249,0.227852,-0.067653,-0.046039,-0.023574,-0.019405,-0.005116,-0.032921,-0.008259,-0.019157,0.007527,-0.008687,-0.02563,-0.013563,0.01524,-0.044665,-0.009568,-0.043549,-0.003131,-0.008221,-0.005948,0.031885,-0.003424,-0.001159,-0.002124,-0.002827,0.010393,-0.001068,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,0.772656,0.046179,-0.054562,0.042344,0.04839,0.347313,1.074905,-0.099782,0.008163,0.250869,2.186638,0.018789,-0.002199,0.218934,0.824475,0.139274,-0.007135,0.053071,-0.156952,0.044739,-0.00296,0.453298,-0.007484,0.920325,0.016566,1.335129,-0.015066,-0.045602,0.034649,0.12201,...,-0.042363,-0.137822,-0.112071,0.380783,-0.036273,-0.016174,0.00292,-0.148021,-0.017614,-0.033474,0.086133,0.008153,-0.126819,0.109208,0.001798,0.151866,0.014118,0.032897,0.005764,0.042259,0.022404,0.00326,0.010556,0.137181,-0.042184,0.006759,-0.005789,0.00034,0.002024,0.016013,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,-0.023476,0.034796,0.065942,0.008661,0.110348,-0.002952,-0.122061,0.063974,0.061033,0.081799,0.329471,0.149579,0.095352,-0.161493,0.022545,-0.009284,-0.002677,-0.14271,0.012345,-0.085331,0.076139,-0.355795,-0.008579,1.046871,-0.088946,0.383583,-0.018144,-0.038618,0.113984,0.006942,...,0.007233,-0.047221,0.066474,-0.179455,0.097428,0.034113,0.008098,-0.024784,-0.012749,-0.007394,-0.01722,0.004719,0.113348,-0.074943,-0.145795,0.128619,0.112567,0.0455,-0.018027,-0.058946,-0.00277,-0.035276,-0.008085,0.132182,-0.017005,0.014383,0.006598,-0.006217,-0.000342,0.000518,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,-0.375831,0.068658,0.011199,0.069699,-0.037529,-0.238788,0.060607,-0.043418,0.053152,0.078237,0.357185,-0.096005,-0.028243,-0.067169,0.246164,-0.020379,0.034461,-0.022225,-0.012327,0.009182,0.01473,0.215893,-0.019687,-0.293933,-0.011511,0.145326,-0.029213,0.030029,-0.045409,-0.030684,...,-0.015077,-0.030208,0.028357,-0.072643,-0.135727,-0.053318,-0.012962,-0.054465,0.00587,-0.018048,-0.006836,-0.008222,-0.027214,-0.071677,-0.094072,-0.010745,-0.103191,-0.031297,-0.02392,-0.015053,-0.017914,-0.029561,-0.024299,-0.057678,-0.11145,-0.015473,-0.007123,-0.007416,-0.011508,-0.010038,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,-0.251178,0.012337,-0.084051,0.258937,0.01657,0.980536,1.267869,0.275619,-0.008139,-0.038832,1.849627,0.107649,-0.168424,0.386541,1.790343,0.192379,-0.054356,0.267566,1.027817,0.374665,-0.010445,1.94798,0.017468,2.784035,0.274397,1.422393,0.040553,0.022926,1.3458,0.104507,...,0.075475,0.330767,0.15047,-0.261636,0.085163,-0.014229,-0.029247,0.124172,0.092875,0.061895,0.034757,0.054386,0.047055,0.048403,0.082926,0.129035,-0.174646,0.102727,0.024732,0.04728,0.017818,0.041451,0.041595,-0.007138,-0.080448,0.018639,0.034068,0.026941,0.035905,0.024459,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [0]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.user_id == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movie_id', right_on = 'movie_id').
                     sort_values(['rating'], ascending=False)
                 )

    #print 'User {0} has already rated {1} movies.'.format(userID, user_full.shape[0])
    #print 'Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations)
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movie_id'].isin(user_full['movie_id'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movie_id',
               right_on = 'movie_id').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [0]:
already_rated, predictions = recommend_movies(preds, 1310, movies, ratings, 20)

In [0]:
# Top 20 movies that User 1310 has rated 
already_rated.head(50).dropna()

Unnamed: 0,user_id,movie_id,rating,timestamp,genres,title
6,1310,2620,5,974781573,"['action', 'comedy', 'crime']",Armed and Dangerous
7,1310,3683,5,974781935,"['war', 'drama', 'history']",Flags of Our Fathers
12,1310,3101,4,974781573,"['action', 'comedy', 'crime']",I Love You to Death
20,1310,2000,4,974781892,"['adventure', 'drama', 'history']",Aguirre: The Wrath of God
18,1310,3526,4,974781892,['drama'],Frances
13,1310,3111,4,974782001,"['drama', 'music', 'romance']",A Star Is Born
16,1310,144,3,974781573,"['drama', 'fantasy', 'romance']",Wings of Desire
0,1310,2988,3,974781935,"['drama', 'romance']",The Harrad Experiment
14,1310,2313,2,974781839,"['comedy', 'drama', 'romance']",Prime
22,1310,1090,2,974781839,"['thriller', 'sciencefiction', 'mystery']",The Thirteenth Floor


In [0]:
ratings[ratings['user_id'] == 1310]

Unnamed: 0,user_id,movie_id,rating,timestamp
215928,1310,2988,3,974781935
215929,1310,1293,5,974781839
215930,1310,1295,2,974782001
215931,1310,1299,4,974781701
215932,1310,2243,4,974782001
215933,1310,2248,5,974781573
215934,1310,2620,5,974781573
215935,1310,3683,5,974781935
215936,1310,3685,4,974781935
215937,1310,1185,4,974781839


In [0]:
predictions

Unnamed: 0,movie_id,genres,title
13548,1961,"['comedy', 'horror']",My Name Is Bruce
11546,1246,['drama'],Rocky Balboa
5297,1957,"['drama', 'thriller']",Enough
2911,2020,['comedy'],The Bachelor
11581,1259,"['drama', 'romance']",Notes on a Scandal
6089,1956,"['mystery', 'drama', 'adventure']",Gerry
3579,541,"['crime', 'drama', 'romance']",The Man with the Golden Arm
12423,2312,"['adventure', 'fantasy', 'action']",In the Name of the King: A Dungeon Siege Tale
11455,1124,"['drama', 'mystery', 'thriller']",The Prestige
952,260,"['action', 'thriller', 'mystery']",The 39 Steps
