In [314]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data formats
"ratings.dat" file format - UserID::MovieID::Rating::Timestamp
- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings


"movies.dat" format - MovieID::Title::Genres
- Genres are pipe-separated and are selected from the following genres: [Action, Adventure, Animation, Children's, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western]
- Titles are identical to titles provided by the IMDB (including
year of release)
- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist


"users.dat" file format - UserID::Gender::Age::Occupation::Zip-code
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges: {1:  "Under 18", 18:  "18-24", 25:  "25-34", 35:  "35-44", 45:  "45-49", 50:  "50-55", 56:  "56+"}

- Occupation is chosen from the following choices: {0:  "other" or "not specified", 1:  "academic/educator", 2:  "artist", 3:  "clerical/admin", 4:  "college/grad student", 5:  "customer service", 6:  "doctor/health care", 7:  "executive/managerial", 8:  "farmer", 9:  "homemaker", 10:  "K-12 student", 11:  "lawyer", 12:  "programmer", 13:  "retired", 14:  "sales/marketing", 15:  "scientist", 16:  "self-employed", 17:  "technician/engineer", 18:  "tradesman/craftsman", 19:  "unemployed", 20:  "writer"}

In [20]:
ratings = pd.read_csv('./datasets/movielens-1m/ratings.dat', 
                 delimiter='::', 
                 encoding='latin1', 
                 engine = 'python', 
                 header = None,
                 names = ['user_id', 'movie_id', 'rating', 'timestamp']
)
movies = pd.read_csv('./datasets/movielens-1m/movies.dat', 
                 delimiter='::', 
                 encoding = 'latin1',
                 header = None,
                 engine = 'python',
                 names = ['movie_id', 'title', 'genres']
)
users = pd.read_csv('./datasets/movielens-1m/users.dat', 
                 delimiter='::', 
                 encoding='latin1', 
                 engine = 'python', 
                 header = None,
                 names = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
)

In [49]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [50]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [51]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


# Model Building (content-based recommendation)

In [200]:
content_rec = movies.copy()

# clean genres column
content_rec['genres'] = content_rec['genres'].apply(
    lambda genres: [genre.lower().replace("'", '').replace('-', '_') 
                    for genre in genres.split('|')]
)

# clean title column (separate title and year)
content_rec[['title', 'year']] = content_rec['title'].str.extract('(.+) \((\d{4})\)', expand = True)

# check if cleaned properly
content_rec[content_rec['title'].isnull() | content_rec['year'].isnull() | content_rec['genres'].isnull()]

Unnamed: 0,movie_id,title,genres,year
988,1001,,[comedy],


In [201]:
# fix title of movie_id == 10001
pattern = r'^(.*?) \(([^)]+)\)\((\d{4})\)$'
temp_title_and_year = movies[movies['movie_id'] == 1001]['title'].str.extract(pattern)

content_rec.at[988, 'title'] = temp_title_and_year[0].values[0]
content_rec.at[988, 'year'] = temp_title_and_year[2].values[0]

content_rec[content_rec['movie_id'] == 1001]

Unnamed: 0,movie_id,title,genres,year
988,1001,"Associate, The",[comedy],1982


In [202]:
# fix some title strings
content_rec[content_rec.title.str.contains(',')]

Unnamed: 0,movie_id,title,genres,year
10,11,"American President, The","[comedy, drama, romance]",1995
28,29,"City of Lost Children, The","[adventure, sci_fi]",1995
39,40,"Cry, the Beloved Country",[drama],1995
49,50,"Usual Suspects, The","[crime, thriller]",1995
53,54,"Big Green, The","[childrens, comedy]",1995
...,...,...,...,...
3866,3936,"Phantom of the Opera, The","[drama, thriller]",1943
3868,3938,"Slumber Party Massacre, The",[horror],1982
3869,3939,"Slumber Party Massacre II, The",[horror],1987
3870,3940,"Slumber Party Massacre III, The",[horror],1990


In [203]:
def fix_comma(movie_title):
    title_split = movie_title.split(', ')
    return title_split[-1] + ' ' + ' '.join(title_split[:-1])
 
condition = content_rec.title.str.contains(',')
content_rec.loc[condition, 'title'] = content_rec.loc[condition, 'title'].apply(fix_comma)
content_rec[content_rec.title.str.contains(',')]

Unnamed: 0,movie_id,title,genres,year
1006,1019,"20,000 Leagues Under the Sea","[adventure, childrens, fantasy, sci_fi]",1954
1962,2031,"$1,000,000 Duck","[childrens, comedy]",1971


In [95]:
import requests

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIxNWRkMTQ2ZWFlYTA2NmJiMWFhZjJhOWYxZjQ2Y2I4MiIsInN1YiI6IjY1YmYwNmYyNDM5OTliMDE4NGM3MDNhZiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.FgcvH65nHD5eOftKjOZJUI8PhkoCDtnvsf35KxN2RPQ"
}

In [86]:
# !!!!!!!!!!!!!!!! DON'T RUN THIS IN COLAB !!!!!!!!!!!!!!!!!
def fetch_plot(row):
    movie_title = row['title']
    movie_year = row['year']
    
    search_url = "https://api.themoviedb.org/3/search/movie"
    params = {
        'query': movie_title,
        'year': movie_year,
    }
    
    res = requests.get(search_url, headers=headers, params=params)
    
    if res.status_code == 200:
        search_results = res.json()
        if 0 < search_results['total_results']:
            return search_results['results'][0]['overview']
        else:
            return "ERROR: movie not found"
    else:
        return "ERROR: cannot call TMDB API"
            
plots = content_rec.apply(fetch_plot, axis = 1)
plots.to_pickle('./datasets/movie_plot_inorder.pkl')

In [204]:
# fuck above code takes waaaaay too long to run, so save it into a file
# don't run that shit

plots = pd.read_pickle('./datasets/movie_plot_inorder.pkl')

content_rec['plot'] = plots
content_rec

Unnamed: 0,movie_id,title,genres,year,plot
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...
...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ..."
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...


In [205]:
# check if there are any rows with missing plots
condition = (content_rec['plot'] == '') | (content_rec['plot'].str.contains('ERROR'))
content_rec[condition]

Unnamed: 0,movie_id,title,genres,year,plot
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao),[drama],1995,ERROR: movie not found
82,83,Once Upon a Time... When We Were Colored,[drama],1995,ERROR: movie not found
119,121,The Boys of St. Vincent,[drama],1993,ERROR: movie not found
125,127,The (Saimt el Qusur) Silence of the Palace,[drama],1994,ERROR: movie not found
126,128,Jupiter's Wife,[documentary],1994,ERROR: movie not found
...,...,...,...,...,...
3820,3890,Back Stage,[documentary],2000,ERROR: movie not found
3832,3902,Goya in Bordeaux (Goya en Bodeos),[drama],1999,ERROR: movie not found
3834,3904,An Uninvited Guest,[drama],2000,ERROR: movie not found
3837,3907,The Prince of Central Park,[drama],1999,ERROR: movie not found


In [206]:
# if we don't have plot, replace with empty string
content_rec.loc[condition, 'plot'] = content_rec.loc[condition, 'plot'].apply(lambda plot_str: '')

content_rec[content_rec['plot'] == '']

Unnamed: 0,movie_id,title,genres,year,plot
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao),[drama],1995,
82,83,Once Upon a Time... When We Were Colored,[drama],1995,
119,121,The Boys of St. Vincent,[drama],1993,
125,127,The (Saimt el Qusur) Silence of the Palace,[drama],1994,
126,128,Jupiter's Wife,[documentary],1994,
...,...,...,...,...,...
3820,3890,Back Stage,[documentary],2000,
3832,3902,Goya in Bordeaux (Goya en Bodeos),[drama],1999,
3834,3904,An Uninvited Guest,[drama],2000,
3837,3907,The Prince of Central Park,[drama],1999,


In [217]:
# NLP text processing; create bag of words to feed into tf-idf vectorization
# join [title, genres, year, plot] into one string, multiplied by their respective weight
content_rec['bag_of_words'] = (content_rec['title'] + ' ' 
    + content_rec['genres'].apply(lambda genres: ' '.join(genres)) + ' ' 
    + content_rec['year'] + ' ' 
    + content_rec['plot'] + ' '
)

content_rec

Unnamed: 0,movie_id,title,genres,year,plot,bag_of_words
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ...",Toy Story animation childrens comedy 1995 Led ...
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...,Jumanji adventure childrens fantasy 1995 When ...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...,Grumpier Old Men comedy romance 1995 A family ...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale comedy drama 1995 Cheated on...
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...,Father of the Bride Part II comedy 1995 Just w...
...,...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ...",Meet the Parents comedy 2000 Greg Focker is re...
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...,Requiem for a Dream drama 2000 The drug-induce...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...,Tigerland drama 2000 A group of recruits go th...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...,Two Family House drama 2000 Buddy Visalo (Mich...


In [230]:
# Apply word transformers (ex: Word2Vec) or sentence transformers (ex: BERT) to vectorize as NLP
    # in this case we should use word transformers b/c
    # there is no 'text context' (context independent) in bag of words
# Try getting cosine similarity or use tf-idf

# Define the weights for each column
    # weights must be in whole numbers since TF-IDF count word frequencies,
    # for each column it will be multiplied (repeated) by 'weights'
column_weights = {
    'title': 2,
    'genres': 4,
    'year': 1,
    'plot': 8,
}

content_rec['bag_of_words'] = (
    content_rec['title'].apply(lambda title: (title + ' ') * column_weights['title']) +
    content_rec['genres'].apply(lambda genres: (' '.join(genres) + ' ') * column_weights['genres']) +
    content_rec['year'].apply(lambda title: (title + ' ') * column_weights['year']) +
    content_rec['year'].apply(lambda title: (title + ' ') * column_weights['year'])
)
content_rec

Unnamed: 0,movie_id,title,genres,year,plot,bag_of_words
0,1,Toy Story,"[animation, childrens, comedy]",1995,"Led by Woody, Andy's toys live happily in his ...",Toy Story Toy Story animation childrens comedy...
1,2,Jumanji,"[adventure, childrens, fantasy]",1995,When siblings Judy and Peter discover an encha...,Jumanji Jumanji adventure childrens fantasy ad...
2,3,Grumpier Old Men,"[comedy, romance]",1995,A family wedding reignites the ancient feud be...,Grumpier Old Men Grumpier Old Men comedy roman...
3,4,Waiting to Exhale,"[comedy, drama]",1995,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale Waiting to Exhale comedy dra...
4,5,Father of the Bride Part II,[comedy],1995,Just when George Banks has recovered from his ...,Father of the Bride Part II Father of the Brid...
...,...,...,...,...,...,...
3878,3948,Meet the Parents,[comedy],2000,"Greg Focker is ready to marry his girlfriend, ...",Meet the Parents Meet the Parents comedy comed...
3879,3949,Requiem for a Dream,[drama],2000,The drug-induced utopias of four Coney Island ...,Requiem for a Dream Requiem for a Dream drama ...
3880,3950,Tigerland,[drama],2000,A group of recruits go through Advanced Infant...,Tigerland Tigerland drama drama drama drama 20...
3881,3951,Two Family House,[drama],2000,Buddy Visalo (Michael Rispoli) is a factory wo...,Two Family House Two Family House drama drama ...


In [251]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Fit a TF-IDF vectorizer on bag of words
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(content_rec['bag_of_words'])

# Train the KNN model for content-based filtering:
# Compute similarity matrix from vectorized [title, year, genres, plot]: 
    # there are several methods of calculating distance (similarity)
    # 1. Euclidean distance
    # 2. Manhattan distance
    # 3. Jaccard distance
    # 4. Cosine distance (using this one)
content_model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
content_model_knn.fit(tfidf_matrix)

In [282]:
content_rec.index.values == idx

array([   0,    1,    2, ..., 3880, 3881, 3882])

In [288]:
# Fuzzy string matching to fix wrong movie input by user
from pprint import pprint
from fuzzywuzzy import fuzz

def get_movie_from_idx(idx):
    # get movie details from idx
    title = content_rec.loc[idx, 'title']
    year = content_rec.loc[idx, 'year']

    return title, year
    
def get_idx_from_title(title):
    # convert title to idx
    return content_rec[content_rec['title'] == title].index.values[0]

# Fix wrong input by user
def matching_score(a, b):
    # calculate the Levenshtein distance to find closest title
        # if exactly same score == 100
    return fuzz.ratio(a, b) 
    
def find_closest_title(title):
    leven_scores = list(enumerate(content_rec['title'].apply(matching_score, b = title)))
    sorted_lev_scores = sorted(leven_scores, key = lambda x: x[1], reverse = True)

    closest_idx = sorted_lev_scores[0][0]
    closest_movie = content_rec.loc[closest_idx]
    distance_score = sorted_lev_scores[0][1]

    closest_movie_id = closest_movie['movie_id']
    closest_movie_title = closest_movie['title']
        
    return (closest_movie_id, closest_movie_title, distance_score)

In [306]:
def get_movie_from_idx(idx):
    # get movie details from idx
    title = content_rec.loc[idx, 'title']
    year = content_rec.loc[idx, 'year']
    genres = content_rec.loc[idx, 'genres']
    plot = content_rec.loc[idx, 'plot']

    return title, year, genres, plot

# get user id input
    # get most recent top n rated movies from user id
    # run original rec algorithm for each movie
def get_content_based_recommendation(movie_name, num_recommendations = 5):
    # fix movie name if user inputs wrong name
    closest_id, closest_name, closest_distance = find_closest_title(movie_name)
    if closest_distance != 100:
        print(f"Did you mean '{closest_name}' instead of '{movie_name}'?")
        movie_name = closest_name

    movie_idx = get_idx_from_title(movie_name)
    distances_cb, indices_cb = knn_cb.kneighbors(
        tfidf_matrix[movie_idx], 
        n_neighbors = num_recommendations + 1
    )

    # Show recommended movies
    print(f'Recommended movies for {movie_name}:')
    for distance, idx in zip(distances_cb.flatten(), indices_cb.flatten()):
        title, year, genres, _ = get_movie_from_idx(idx)
        genres = ', '.join(genres)

        print(f'|----> {title} ({year})')
        print(f'|        Genres: {genres}')

get_content_based_recommendation('nigerland')

Did you mean 'Tigerland' instead of 'nigerland'?
Recommended movies for Tigerland:
|----> Tigerland (2000)
|        Genres: drama
|----> Hamlet (2000)
|        Genres: drama
|----> Groove (2000)
|        Genres: drama
|----> Urbania (2000)
|        Genres: drama
|----> Girlfight (2000)
|        Genres: drama
|----> Beautiful (2000)
|        Genres: comedy, drama


In [311]:
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
# export models
import joblib
joblib.dump(model_knn, 'content_based_model.pkl')

# Model Building (collaborative filtering; user-user filtering; KNN algorithm)
- input user_id & desired number of similar users, and will recommend movies rated by similar users  

In [317]:
# merge datasets
merged = pd.merge(ratings, users, how = 'inner', on = 'user_id')
merged = pd.merge(merged, movies, how = 'inner', on = 'movie_id')
merged.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [318]:
print(f"Unique number of users in the dataset: {merged['user_id'].nunique()}")
print(f"Unique number of movies in the dataset: {merged['movie_id'].nunique()}")
    # why is number of movies smaller?? what happened?
    # are some movies not rated at all?

Unique number of users in the dataset: 6040
Unique number of movies in the dataset: 3706


In [319]:
movies.shape

(3883, 3)

In [320]:
movies[~movies['movie_id'].isin(merged['movie_id'].unique())]

Unnamed: 0,movie_id,title,genres
50,51,Guardian Angel (1994),Action|Drama|Thriller
107,109,Headless Body in Topless Bar (1995),Comedy
113,115,Happiness Is in the Field (1995),Comedy
141,143,Gospa (1995),Drama
281,284,New York Cop (1996),Action|Crime
...,...,...,...
3581,3650,Anguish (Angustia) (1986),Horror
3681,3750,Boricua's Bond (2000),Drama
3759,3829,Mad About Mambo (2000),Comedy|Romance
3786,3856,Autumn Heart (1999),Drama


In [321]:
ratings[ratings['movie_id'] == 51]
    # yes, some movies aren't rated
# hence there are some movies missing in merged dataset

Unnamed: 0,user_id,movie_id,rating,timestamp


In [322]:
from scipy.sparse import csr_matrix

refined = merged.groupby(['user_id', 'movie_id'], as_index = False).agg({'rating': 'mean'})

# get user-item matrix (values = ratings)
user_to_movie = refined.pivot(index = 'user_id', columns='movie_id', values='rating').fillna(0)
user_to_movie_sparse = csr_matrix(user_to_movie.values)

In [None]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn.fit(user_to_movie_sparse)

In [None]:
import numpy as np
from pprint import pprint

def get_similar_users(user, n = 5):
    knn_input = np.asarray([user_to_movie.values[user-1]])
        # get user's row from user-item matrix
    distances, indices = knn.kneighbors(knn_input, n_neighbors = n+1)

    print(" ")
    print("Top", n, "users who are very much similar to the User ", user, "are: ")
    for i in range(1,len(distances[0])):
        print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
        
    return indices.flatten()[1:] + 1, distances.flatten()[1:]

def recommend_n_movies(user_id, n_similar_users, n_movie_recs):
    print(f'Movies seen by User {user_id}:')
    pprint(list(merged[merged['user_id'] == user_id]['title']))

    # get n_similar_users and their cosine similarity distances
    similar_users, distance_list = get_similar_users(user_id, n_similar_users)

    # define weights to ratings by each similar user based on distance
        # -> eliminates chance of decision manipulation by users relatively far from input user
    weightage_list = distance_list / np.sum(distance_list)
    
    # get ratings of all movies from similar users
    mov_ratings_sim_users = user_to_movie.loc[similar_users]
    movies_id_list = user_to_movie.columns
    
    # transform to make it compatible for matrix operations
    weightage_list = weightage_list[:, np.newaxis] + np.zeros(len(movies_id_list))
    
    # apply the weights to user ratings
    new_rating_matrix = weightage_list * mov_ratings_sim_users
    
    # get the mean ratings
    mean_rating_list = new_rating_matrix.sum(axis = 0)

    # get n movie indices with the highest mean
        # FIX: remove movies already seen by user
    highest_score_idx = np.argsort(mean_rating_list)[::-1][:n_movie_recs]
    rec_movie_ids = new_rating_matrix.iloc[:, highest_score_idx].columns
    
    print(" ")
    print(f"Top {n_movie_recs} movie recommendations to user {user_id}:")
    rec_movies = []
    for rec_movie_id in rec_movie_ids:
        movie_title = movies[movies['movie_id'] == rec_movie_id]['title'].values[0]
        rec_movies.append(movie_title)
        print(f"\t{movie_title}")

    return rec_movies

In [None]:
user_id = 596
similar_users = 10
recommended_movies = 10

rec_movies = recommend_n_movies(
    user_id,
    similar_users,
    recommended_movies
)
# FIX: recommends movies already seen by user 

In [None]:
for movie in merged[merged['user_id'] == user_id]['title']:
    if movie in rec_movies:
        print(movie)

# Model Building (item-based; KNN algorithm)
- input movie name & number of recommendations

In [327]:
# create item-user matrix
movie_to_user = refined.pivot(
    index='movie_id',
    columns='user_id',
    values='rating'
).fillna(3)

movie_to_user.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,5.0,5.0,...,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0
2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0
4,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0
5,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,1.0,3.0,3.0,3.0,3.0,3.0


In [328]:
# train model
movie_to_user_sparse = csr_matrix(movie_to_user.values)

knn_movie_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_movie_model.fit(movie_to_user_sparse)

In [329]:
# Fix wrong input by user
from fuzzywuzzy import fuzz

def get_title_from_id(id):
    # convert idx to title
    return movies[movies['movie_id'] == id]['title'].values[0]

def get_id_from_title(title):
    # convert title to idx
    return movies[movies['title'] == title]['movie_id'].values[0]
    
def matching_score(a, b):
    # calculate the Levenshtein distance to find closest title
        # if exactly same score == 100
    return fuzz.ratio(a, b) 

def find_closest_title(title):
    leven_scores = list(enumerate(movies['title'].apply(matching_score, b = title)))
    sorted_lev_scores = sorted(leven_scores, key = lambda x: x[1], reverse = True)

    closest_idx = sorted_lev_scores[0][0]
    closest_movie = movies.loc[closest_idx]
    distance_score = sorted_lev_scores[0][1]

    closest_movie_id = closest_movie['movie_id']
    closest_movie_title = closest_movie['title']
    
    return (closest_movie_id, closest_movie_title, distance_score)

def rec_movies_by_name(movie_name, n_movie_recs = 10):
    closest_id, closest_movie, distance_score = find_closest_title(movie_name)
    if distance_score != 100:
        movie_name = closest_movie
        print(f"Did you mean {movie_name}?")
    
    movie_id = get_id_from_title(movie_name)

    knn_input = np.asarray([movie_to_user.values[movie_id]])

    movie_ids_list = movie_to_user.index
    n = min(len(movie_ids_list), n_movie_recs)
        # if n_movie_recs goes over number of movies, revert to all movies
    distances, indices = knn_movie_model.kneighbors(knn_input, n_neighbors = n + 1) 

    print(f"Top {n} movies which are very similar to movie {movie_name} are: \n")
    for i in range(1, len(distances[0])):
        movie_id = movie_ids_list[indices[0][i]]
        movie_title = get_title_from_id(movie_id)
        print(movie_title)

In [326]:
movie_title = 'willy woka choco'
rec_movies_by_name(movie_title, 20)

Did you mean Willy Wonka and the Chocolate Factory (1971)?
Top 20 movies which are very similar to movie Willy Wonka and the Chocolate Factory (1971) are: 

Century (1993)
Last of the High Kings, The (a.k.a. Summer Fling) (1996)
Another Man's Poison (1952)
I Don't Want to Talk About It (De eso no se habla) (1993)
Proposition, The (1998)
Number Seventeen (1932)
Daens (1992)
Neon Bible, The (1995)
War at Home, The (1996)
Brother's Kiss, A (1997)
Male and Female (1919)
All Things Fair (1996)
Second Best (1994)
Hangmen Also Die (1943)
I'll Never Forget What's 'is Name (1967)
Bells, The (1926)
With Friends Like These... (1998)
Sweet Nothing (1995)
Macao (1952)
Convent, The (Convento, O) (1995)


In [330]:
movie_title = 'willy woka choco'
rec_movies_by_name(movie_title, 20)

Did you mean Willy Wonka and the Chocolate Factory (1971)?
Top 20 movies which are very similar to movie Willy Wonka and the Chocolate Factory (1971) are: 

Sprung (1997)
Century (1993)
Jeanne and the Perfect Guy (Jeanne et le garçon formidable) (1998)
Nueba Yol (1995)
Trippin' (1999)
Separation, The (La Séparation) (1994)
Brothers in Trouble (1995)
Second Best (1994)
Daens (1992)
Hangmen Also Die (1943)
I Don't Want to Talk About It (De eso no se habla) (1993)
Another Man's Poison (1952)
I'll Never Forget What's 'is Name (1967)
With Friends Like These... (1998)
Last Dance (1996)
Mascara (1999)
Ring, The (1927)
Napoleon and Samantha (1972)
Joyriders, The (1999)
Full Speed (1996)


# Model Building (Hybrid recommendation system)