In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

from kmodes.kprototypes import KPrototypes

import time
import pickle

In [2]:
# Drop ratings of previously dropped movies
# ratings = pd.read_csv('ratings.csv')
# ratings = ratings[ratings['movieId'].isin(pd.read_csv('3b movies_GloVe.csv')['id'])]
# ratings.to_csv('2 ratings.csv', index=False)

In [3]:
# Delete duplicates and write to file
# unprepared_movies = pd.read_csv('3b movies_GloVe.csv')
# unprepared_movies.drop_duplicates(inplace=True)
# duplicates = (unprepared_movies['id'].value_counts() > 1).head(13).index
# unprepared_movies = unprepared_movies[unprepared_movies['id'].isin(duplicates) == False]
# unprepared_movies.to_csv('4 movies.csv', index=False)

In [4]:
# Raw inputs. `movies` is replaced by the feature vectors in a later cell, while
# `unprepared_movies` keeps the original columns (id, title, popularity, ...)
# that are needed when building the recommendations.
unprepared_movies = pd.read_csv('4 movies.csv')
movies = pd.read_csv('4 movies.csv')
ratings = pd.read_csv('2 ratings.csv')
# NOTE(review): with pandas' default NA handling a literal "NA" code would be
# read as a missing value - confirm whether country_codes.csv contains one.
country_codes = pd.read_csv('country_codes.csv')

In [5]:
def get_numbers(str):
    """Return the integer numbers from a string as a list of ints.

    The text is expected to look like a serialized list or dict, e.g.
    "[16, 35]" or "[{'id': 16, 'name': 'Animation'}]". Braces and brackets
    are removed, commas act as separators, and every remaining
    whitespace-delimited token that consists only of decimal digits is
    converted with int().

    Input:
    str - The text to scan (the name shadows the builtin; it is kept so that
          existing callers are unaffected)

    Output:
    The integers in order of appearance, an empty list if there are none.
    """
    # Commas become spaces so that "[1,2]" yields 1 and 2 instead of 12.
    cleaned = str.replace(',', ' ')
    for bracket in '{}[]':
        cleaned = cleaned.replace(bracket, '')

    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, for which int() raises ValueError.
    # Any stand-alone all-digit token is returned, including one that is part
    # of a free-text value such as a name.
    return [int(token) for token in cleaned.split() if token.isdecimal()]

def get_country_codes(str, codes=None):
    """Return the codes of the countries from a string.

    Braces, brackets and quotes are removed, commas act as separators, and
    every remaining whitespace-delimited token that is exactly equal to a
    known country code is kept.

    Input:
    str - The text to scan (the name shadows the builtin; it is kept so that
          existing callers are unaffected)
    codes - Optional iterable of valid codes. Defaults to the non-missing
            values of the 'Code' column of the global country_codes table.

    Output:
    The matching codes in order of appearance, an empty list if there are none.
    """
    if codes is None:
        codes = country_codes['Code'].dropna()

    # Exact membership test. A regex/substring test would also accept tokens
    # such as 'U' or '.', can raise on unbalanced regex characters, and has to
    # scan the whole code table for every single token.
    valid_codes = set(codes)

    # Commas become spaces so that "['US','GB']" yields two separate tokens.
    cleaned = str.replace(',', ' ')
    for junk in "{}[]'":
        cleaned = cleaned.replace(junk, '')

    return [token for token in cleaned.split() if token in valid_codes]

def prepare_dataframe(movies):
    """Turn the raw movie table into the feature vectors used for clustering.

    Input:
    movies - DataFrame with at least the columns 'id', 'title', 'adult',
             'genres', 'original_language', 'production_companies' and
             'production_countries'. Every other column is treated as a
             numeric feature. The frame is not modified.

    Output:
    movievectors - DataFrame indexed by the movie id. The columns are, in this
                   order: the numeric features scaled to the (0, 1) interval
                   (labelled 0, 1, 2, ...), the 'adult' column and one 0/1
                   indicator column per genre.
    """
    # Parse the serialized genre lists into lists of ints without touching
    # the caller's frame.
    genre_lists = movies['genres'].apply(get_numbers)

    # Get dummies: one indicator column per distinct genre.
    # production_companies, original languages, production countries are not
    # used, because of huge running time - so they are not converted either.
    genre_matrix = MultiLabelBinarizer().fit_transform(genre_lists)

    # The labels are positional ('0. genre', '1. genre', ...), not genre ids:
    # they only have to be unique so they do not clash with the integer
    # labels of the numeric columns. Column i belongs to the i-th entry of
    # the binarizer's fitted classes_ attribute.
    genres = pd.DataFrame(
        genre_matrix,
        columns=[str(i) + '. genre' for i in range(genre_matrix.shape[1])],
        index=movies.index,
    )

    numeric = movies.drop(
        ['id', 'title', 'adult', 'genres', 'original_language',
         'production_companies', 'production_countries'],
        axis=1,
    )
    categorical = movies[['adult']].join(genres)

    # Scale the numeric values to (0, 1) interval
    scaled = MinMaxScaler().fit_transform(numeric)

    # All parts share the index of `movies`, so the join lines rows up
    # correctly even when that index is not the default 0..n-1 range.
    movievectors = pd.DataFrame(scaled, index=movies.index).join(categorical)
    movievectors.index = movies['id']

    return movievectors

def convert_title_to_index(x, movies_table=None):
    """Return the id of the first movie whose title equals x.

    Input:
    x - Title of the movie
    movies_table - Optional DataFrame with 'title' and 'id' columns to search.
                   Defaults to the global unprepared_movies2.

    Output:
    The 'id' value of the first matching row. Titles are not guaranteed to be
    unique, so this may not be the movie the caller had in mind.

    Raises IndexError if no row has this title.
    """
    if movies_table is None:
        # Looked up at call time: this global is only created in the
        # performance-test section of the notebook.
        movies_table = unprepared_movies2

    matching_ids = movies_table.loc[movies_table['title'] == x, 'id']
    return matching_ids.iloc[0]

In [6]:
# Build the feature vectors from a copy of the raw table (same content as the
# `movies` frame read from '4 movies.csv'), so that this cell can be re-run
# without feeding already prepared data back into prepare_dataframe.
movies = prepare_dataframe(unprepared_movies.copy())

Train model and save it:

In [7]:
# Sampling used to reduce running time
# movies_sample = movies.sample(n=4000, random_state=98)

In [8]:
# start = time.time()
# kproto = KPrototypes(n_clusters=45, max_iter=20, n_jobs=4)
# kproto.fit(movies_sample, categorical=[i for i in range(57,movies_sample.shape[1])])
# end = time.time()
# elapsed = end-start

In [9]:
# elapsed

404.50138235092163

In [10]:
# pickle.dump(kproto, open("kproto_4000.p", 'wb'))

Calculate the clusters of all movies up front to save time when recommending:

In [11]:
# all_movies_clusters = kproto.predict(movies, categorical=[i for i in range(57,movies.shape[1])])
# pickle.dump(all_movies_clusters, open("clusters_kproto_4000.p", 'wb'))

Recommend movies:

In [12]:
# NOTE: unpickling can execute arbitrary code - only load files that were
# produced by this notebook.
# The context managers make sure the file handles are closed again.
with open("kproto_4000.p", "rb") as model_file:
    kproto = pickle.load(model_file)

with open("clusters_kproto_4000.p", "rb") as clusters_file:
    all_movies_clusters = pickle.load(clusters_file)

In [13]:
def recommend_movies(uid, unprepared_movies, movies, ratings, cluster_obj, clusters):
    """
    Suggest unrated movies from the cluster the user rated best on average.

    Input:
    uid - ID of the user
    unprepared_movies - Movie table with 'id', 'title' and 'popularity' columns
    movies - Feature vectors of the movies, indexed by movie id
    ratings - Rating table with 'userId', 'movieId' and 'rating' columns
    cluster_obj - Fitted clustering model, passed on to find_best_matching_cluster
    clusters - Cluster label of every row of movies, in the same order

    Output:
    The titles of the (at most) 20 most popular suggested movies, as a Series
    that keeps the row labels of unprepared_movies

    Error codes:
    -1 - This userID does not exist
    -2 - There are no movies in the current form of movies database that was rated by the user
    """

    # Everything this user has rated; find_best_matching_cluster may modify it
    user_ratings = ratings.loc[ratings['userId'] == uid].drop(columns='userId')
    if len(user_ratings) == 0:
        return -1  # Invalid userId

    best_match = find_best_matching_cluster(uid, movies, user_ratings, cluster_obj)
    if best_match == -2:
        return -2  # There is no rated movie in movies database

    # Candidates are all movies that belong to the best matching cluster
    cluster_movie_ids = movies[clusters == best_match].index
    candidates = unprepared_movies[unprepared_movies['id'].isin(cluster_movie_ids)]

    # Leave out what the user has already watched (rated)
    already_rated = candidates['id'].isin(user_ratings['movieId'])
    candidates = candidates.drop(candidates[already_rated].index)

    # Most popular first, keep the top twenty titles
    ranked = candidates.sort_values('popularity', ascending=False)
    return ranked['title'].head(20)

def find_best_matching_cluster(uid, movies, rated_movies, cluster_obj, first_categorical=57):
    """
    Find the cluster whose movies got the highest mean rating from the user.

    Input:
    uid - ID of the user (not used, kept for the existing callers)
    movies - Feature vectors of the movies, indexed by movie id
    rated_movies - The ratings of the user, with 'movieId' and 'rating' columns.
                   Modified in place: rows whose movie is not in movies are
                   dropped and a 'clusters' column is added.
    cluster_obj - Fitted model with a predict(X, categorical=...) method
    first_categorical - Position of the first categorical column of movies;
                        all columns from there on are passed as categorical

    Output:
    The label of the best rated cluster (as a float), or
    -2 if none of the rated movies is in the movies database
    """
    categorical = list(range(first_categorical, movies.shape[1]))

    # Cluster of every rated movie that could be found
    predicted = []

    # Row labels of the ratings whose movie is not in the movies database,
    # which indicates an inconsistency in the data set
    missing = []

    for label, movie_id in zip(rated_movies.index, rated_movies['movieId']):
        try:
            features = movies.loc[int(movie_id)]
        except (KeyError, ValueError):
            # Unknown movie id (or an id that is not a number): skip this
            # rating. Errors raised by the model itself are not swallowed.
            missing.append(label)
            continue

        prediction = cluster_obj.predict(pd.DataFrame(features).transpose(), categorical=categorical)
        predicted.extend(np.ravel(prediction))

    # Handle inconsistency in DB
    rated_movies.drop(missing, inplace=True)

    # Stored as floats, so the returned label has the same type as before
    rated_movies['clusters'] = np.asarray(predicted, dtype=float)

    if rated_movies.shape[0] == 0:
        return -2 # There is no rated movie in movies database

    # Find the highest rated cluster; idxmax on the Series returns the label
    # directly, no positional lookup needed
    return rated_movies.groupby('clusters')['rating'].mean().idxmax()

In [14]:
# Example: the suggested titles for user 5000, based on all movies and ratings
recommend_movies(5000, unprepared_movies, movies, ratings, kproto, all_movies_clusters)

39955                           Colossal
11512                        Ratatouille
37934                      Sausage Party
19400                 Hotel Transylvania
1324                       Mars Attacks!
2717                    Yellow Submarine
4425                Dream a Little Dream
28425                               Home
11440               Happily N'Ever After
10335                       Corpse Bride
1117                        Delicatessen
13904                         Cold Souls
12292    The Girl Who Leapt Through Time
7034                          Cool World
5329                       Our Man Flint
23938                      The Boxtrolls
7599              Batteries not Included
9326                       The Animatrix
1211                               Akira
2314                     Howard the Duck
Name: title, dtype: object

In [15]:
# Example: a user id without any rating returns the error code -1
recommend_movies(235623464, unprepared_movies, movies, ratings, kproto, all_movies_clusters)

-1

Test performance:

In [115]:
# Sampling: keep a random 70 % of the movies (fixed seed) and restrict the
# feature vectors, the precomputed cluster labels and the ratings to them.
unprepared_movies2 = unprepared_movies.sample(frac=0.7, random_state=98)
sampled_ids = unprepared_movies2['id']

# One mask for both the feature vectors and their cluster labels; this relies
# on all_movies_clusters holding one label per row of movies, in the same order.
in_sample = movies.index.isin(sampled_ids)
movies2 = movies[in_sample]
clusters = np.array(all_movies_clusters)[in_sample]

ratings2 = ratings[ratings['movieId'].isin(sampled_ids)]

In [None]:
# Mean rating the user gave to the suggested movies, one entry per user
averages = []

# Number of both rated and suggested movies
weight = []

# NOTE(review): recommend_movies() removes every suggestion the user rated in
# ratings2, and ratings2 holds all ratings of the sampled movies that the
# suggestions are taken from. With the ids resolved exactly, it looks like
# rated_and_suggested_movies can never be non-empty - confirm whether ratings
# (rather than movies) were meant to be held out.

# Grouping once avoids filtering the whole ratings table for every user;
# sort=False visits the users in order of first appearance.
for uid, personal_ratings in ratings.groupby('userId', sort=False):
    suggestions = recommend_movies(uid, unprepared_movies2, movies2, ratings2, kproto, clusters)

    if isinstance(suggestions, int): # Error code
        continue

    # The suggestions keep the row labels of unprepared_movies2, so the ids
    # can be looked up directly. Going through the title is ambiguous when
    # several movies share a title.
    suggested_ids = unprepared_movies2.loc[suggestions.index, 'id']

    rated_and_suggested_movies = personal_ratings[personal_ratings['movieId'].isin(suggested_ids)]

    if rated_and_suggested_movies.shape[0] != 0:
        weight.append(rated_and_suggested_movies.shape[0])
        averages.append(rated_and_suggested_movies['rating'].mean())

In [137]:
# Show the user id most recently assigned by the evaluation loop
uid

11

True