In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from kmodes.kprototypes import KPrototypes
import pickle

import seaborn as sns
import matplotlib.pyplot as plt
import time

In [2]:
# Input tables, read in this order: movie metadata, the country-code lookup
# consulted by get_country_codes, and the user ratings.
movies, country_codes, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('3a movies_TFIDF.csv', 'country_codes.csv', 'ratings.csv')
)

# Number of movies sampled for training; also part of the model's file name.
n_of_samples = 4000

In [21]:
def get_numbers(str):
    """Return the integers contained in a string-encoded collection.

    Commas, braces and square brackets are treated as separators, so both
    ``"[16, 35]"`` and ``"[16,35]"`` yield ``[16, 35]``. Tokens that are not
    made up purely of decimal digits (quoted keys, names, ...) are ignored.

    Parameters
    ----------
    str : str
        Text such as ``"[16, 35, 10751]"``. The name shadows the built-in
        ``str`` inside this function; it is kept so that existing callers
        (including keyword callers) keep working.

    Returns
    -------
    list of int
        The integers in their order of appearance; empty if there are none.
    """
    text = str
    # Turn delimiters into spaces instead of deleting them: deleting a comma
    # would glue "16,35" together into the single bogus number 1635.
    for delimiter in ',{}[]':
        text = text.replace(delimiter, ' ')
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, on which int() raises ValueError.
    return [int(token) for token in text.split() if token.isdecimal()]

def get_country_codes(str):
    """Return the country codes that appear in a string-encoded collection.

    The text is split into tokens and only tokens that are exactly equal to
    an entry of ``country_codes['Code']`` (the module-level lookup table) are
    kept. Order and duplicates of the input are preserved.

    Parameters
    ----------
    str : str
        Text such as ``"['US', 'GB']"``. The name shadows the built-in
        ``str`` inside this function; it is kept so that existing callers
        keep working.

    Returns
    -------
    list of str
        The recognised codes in their order of appearance.
    """
    text = str.replace("'", "")
    # Delimiters become spaces so that "['US','GB']" is not glued into "USGB".
    for delimiter in ',{}[]':
        text = text.replace(delimiter, ' ')

    # Exact membership in a set replaces Series.str.contains(token), which
    # (a) is a regex *substring* search, so a fragment such as "U" was
    # accepted because it occurs inside "US", (b) could raise on tokens
    # containing regex metacharacters, and (c) rescanned the whole lookup
    # table once per token.
    # NOTE(review): assumes the 'Code' column holds the codes as plain
    # strings - confirm against country_codes.csv.
    known_codes = set(country_codes['Code'].dropna())
    return [token for token in text.split() if token in known_codes]

def prepare_dataframe(movies):
    """Build the feature matrix that is fed to the clustering model.

    The result consists of the min-max scaled numeric columns followed by the
    categorical columns: ``adult`` and one 0/1 indicator column per distinct
    genre id.

    Parameters
    ----------
    movies : pandas.DataFrame
        Raw movie table. It must contain the columns ``id``, ``title``,
        ``adult``, ``genres``, ``original_language``,
        ``production_companies`` and ``production_countries``; every other
        column is treated as numeric and scaled. ``genres`` holds
        string-encoded lists of integer ids (parsed with ``get_numbers``).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``movies``. The scaled numeric columns come first and
        are labelled ``0 .. k-1``; they are followed by ``adult`` and the
        genre indicators ``'0. genre'``, ``'1. genre'``, ... The number in a
        genre label is the column's position, i.e. the index into the sorted
        distinct genre ids (``MultiLabelBinarizer.classes_``), not the genre
        id itself.

    Notes
    -----
    * The input frame is no longer modified; the parsed genres live in a
      local Series.
    * ``production_companies``, ``production_countries`` and
      ``original_language`` are not used as features because of the huge
      running time, so they are no longer parsed and one-hot encoded only to
      be thrown away.
    """
    # Conversion (on a local Series, so the caller's frame stays untouched)
    genre_ids = movies['genres'].apply(get_numbers)

    # Get dummies
    mlb = MultiLabelBinarizer()
    genre_flags = mlb.fit_transform(genre_ids)

    # Unique column names. The labels are positional on purpose: this keeps
    # the names the function has always produced. (The former
    # `columns=mlb.classes` passed the constructor argument, which is None,
    # so pandas fell back to 0..n-1; the fitted ids are in `mlb.classes_`.)
    genre_labels = [str(i) + '. genre' for i in range(genre_flags.shape[1])]

    # Reusing the input index keeps the joins below row-aligned even when
    # `movies` does not carry a default RangeIndex.
    genres = pd.DataFrame(genre_flags, index=movies.index, columns=genre_labels)

    # Everything that is neither an identifier nor categorical is numeric.
    numeric = movies.drop(
        ['id', 'title', 'adult', 'genres', 'original_language',
         'production_companies', 'production_countries'],
        axis=1,
    )
    categorical = movies[['adult']].join(genres)

    # Scale the numeric values to (0, 1) interval
    scaler = MinMaxScaler()
    numeric_scaled = pd.DataFrame(scaler.fit_transform(numeric), index=movies.index)

    movievectors = numeric_scaled.join(categorical)

    return movievectors

In [None]:
# Replace the raw movie table with the feature matrix used for clustering.
# NOTE: `movies` is rebound to the transformed frame, which no longer has a
# 'genres' column, so this cell cannot be run twice in a row - re-run the
# CSV loading cell first.
movies = prepare_dataframe(movies)

1. Train model and save it:

In [22]:
# Train on a random subset of n_of_samples rows to reduce running time;
# the fixed random_state makes the draw repeatable.
movies_sample = movies.sample(n=n_of_samples, random_state=98)

In [None]:
# K-Prototypes
# prepare_dataframe puts every numeric column first and starts the categorical
# part with 'adult', so all columns from 'adult' onwards are categorical.
# Looking the position up avoids a hard-coded offset that silently goes stale
# when the number of numeric features changes.
# NOTE(review): the offset used to be the literal 57 - confirm that it matches
# the position of 'adult' for the current input file.
categorical_idx = list(range(movies_sample.columns.get_loc('adult'), movies_sample.shape[1]))

start1 = time.time()
kproto = KPrototypes(n_clusters=30, max_iter=20, n_jobs=4, random_state=98, verbose=2)
kproto.fit(movies_sample, categorical=categorical_idx)
end1 = time.time()

# Wall-clock training time in seconds (variable name kept as is for any
# later cells that read it).
ellapsed1 = end1-start1

In [None]:
# Persist the fitted model; the sample size is part of the file name so that
# models trained on different sample sizes do not overwrite each other.
with open("kproto_" + str(n_of_samples) + ".pkl", mode="wb") as f:
    pickle.dump(kproto, f)

2. Suggest movies with the pre-trained model:

In [None]:
# Restore the model trained for the current n_of_samples.
# NOTE: unpickling executes arbitrary code - only load files created by the
# training cell of this notebook.
with open("kproto_" + str(n_of_samples) + ".pkl", mode="rb") as f:
    kproto = pickle.load(f)

In [None]:
# userId (as used in the ratings table) of the user to suggest movies for.
uid = 30

In [None]:
# All ratings given by the selected user.
is_selected_user = ratings['userId'].eq(uid)
udf = ratings.loc[is_selected_user]

In [None]:
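# TODO: disabled for now - presumably meant to split the user's ratings into
# train and test parts for evaluation; enable or remove.
#train, test = train_test_split(udf, random_state=98)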

In [24]:
# Inspect the current contents of `movies`.
movies

Unnamed: 0,adult,budget,genres,id,original_language,popularity,production_companies,production_countries,release_year,revenue,...,40. overview component,41. overview component,42. overview component,43. overview component,44. overview component,45. overview component,46. overview component,47. overview component,48. overview component,49. overview component
0,1,30000000,"[16, 35, 10751]",862,en,21.946943,[3],['US'],1995.0,373554033.0,...,0.001646,-0.000552,0.000302,-0.001467,-0.000691,0.005511,0.003888,0.006104,-0.002350,-0.003947
1,1,65000000,"[12, 14, 10751]",8844,en,17.015539,"[559, 2550, 10201]",['US'],1995.0,262797249.0,...,0.006327,-0.030701,0.010057,0.042809,-0.010227,-0.005390,-0.014447,-0.014313,-0.029592,0.021556
2,1,0,"[10749, 35]",15602,en,11.712900,"[6194, 19464]",['US'],1995.0,0.0,...,0.000715,-0.032340,-0.025698,-0.027322,-0.035345,0.012567,0.062807,0.075700,-0.044787,0.000582
3,1,16000000,"[35, 18, 10749]",31357,en,3.859495,[306],['US'],1995.0,81452156.0,...,-0.001595,0.005113,0.001585,-0.015870,-0.002421,0.008802,0.005810,0.000887,-0.005891,-0.008063
4,1,0,[35],11862,en,8.387519,"[5842, 9195]",['US'],1995.0,76578911.0,...,-0.025371,0.038725,0.121786,-0.074007,0.003008,0.060951,0.000244,0.057739,-0.066750,-0.059616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43702,1,0,"[14, 18]",465044,en,0.281008,[],['GB'],2017.0,0.0,...,0.028345,-0.042197,0.051238,0.071462,0.045913,-0.103148,-0.032122,-0.027419,-0.011826,-0.006404
43703,1,0,[99],390747,en,1.728108,"[64282, 80147, 89082, 89083]",['US'],2017.0,0.0,...,-0.022517,-0.019188,0.016580,0.047867,0.018591,0.008839,0.026655,0.000493,0.008404,-0.004815
43704,1,0,"[99, 10402]",76498,en,0.113782,[10220],['US'],1976.0,0.0,...,-0.020916,-0.015862,-0.017053,0.006702,-0.014623,-0.005872,-0.004705,-0.008329,-0.005280,0.006894
43705,1,0,[99],413049,en,1.172089,"[37432, 86453]",['US'],2016.0,0.0,...,-0.005142,0.001733,0.020521,0.010041,0.013891,-0.001235,0.010618,0.001785,-0.013161,0.017537


In [None]:
"""
Input: userId
Output: movieId
"""