In [15]:
import numpy as np
import pandas as pd 
import os
import math
import time
import urllib
import zipfile
from numpy.linalg import svd
import matplotlib.pyplot as plt 

import pyspark
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import countDistinct, col, lit
from pyspark.sql.types import StructType, StructField, IntegerType

# Build our Spark Session and Context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
# read in dataframes
pd_ratings = pd.read_csv('../data/movies/ratings.csv')
pd_movies = pd.read_csv('../data/movies/movies.csv')

# Convert a Pandas df to a Spark df
ratings_df = spark.createDataFrame(pd_ratings) 

# For titles later
movies_df = spark.read.csv('../data/movies/movies.csv',header=True)

# # Convert a Spark df to a Pandas df
# pandas_df = spark_df.toPandas()

# Cut out timestamp column
ratings_df = ratings_df.drop(ratings_df['timestamp'])

In [127]:
pd_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [157]:
pd_ratings['movieId'].nunique()

9066

In [160]:
pd_ratings.pivot(index='userId', columns='movieId', values='rating').head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [138]:
(pd_ratings['userId']).nunique()

671

In [153]:
len(pd_movies)

9125

In [143]:
pd_ratings.mean(axis=0)

userId       3.470113e+02
movieId      1.254866e+04
rating       3.543608e+00
timestamp    1.129639e+09
dtype: float64

In [3]:
# To convert predictions matrix into a more interpretable form
def convert_movie_id(pd_ratings, pd_movies):
    for x in range(1,len(pd_movies)):
        pd_ratings.replace({'movieId': x}, pd_movies['title'][x-1],inplace=True)
    return pd_ratings

In [4]:
# convert_movie_id(pd_ratings, pd_movies)
# pd_ratings.head()

In [5]:
# train-test split
train, test = ratings_df.randomSplit([.8, .20], seed=42)

In [6]:
factor_model = ALS(
    itemCol='movieId',
    userCol='userId',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy='drop',
    rank=20)

recommender = factor_model.fit(train)

[Row(userId=1, movieId=31, rating=2.5),
 Row(userId=1, movieId=1172, rating=4.0),
 Row(userId=1, movieId=1263, rating=2.0),
 Row(userId=1, movieId=1287, rating=2.0),
 Row(userId=1, movieId=1293, rating=2.0)]

In [7]:
# Fit model
model_start_time=time.time()
ratings=factor_model.fit(train)
stop_train_time=time.time()
print(stop_train_time-model_start_time)

4.779560565948486


In [8]:
# Make predictions
predict=ratings.transform(test)
transform_test_time=time.time()
print(transform_test_time-stop_train_time)

0.0736396312713623


In [27]:
# Convert a Spark df to a Pandas df
predict_df = predict.toPandas()

In [28]:
predict_df.head()

Unnamed: 0,userId,movieId,rating,prediction
0,588,471,3.0,3.651058
1,350,471,3.0,3.470955
2,306,471,3.0,3.482509
3,491,471,3.0,4.067975
4,299,471,4.5,4.222585


In [32]:
movie_df = predict_df.pivot(index='userId', columns='movieId', values='rating')

movie_cols = movie_df.columns
movie_cols_list = list(movie_cols)
len(movie_cols_list)

title_list = []
for x in range(1,len(movie_cols_list)+1):
    title_list.append(pd_movies['title'][x-1])

movie_df.columns = title_list

In [99]:
# clean up data frame
movie_df.columns = [col.lower().replace(' ', '_') for col in movie_df.columns]
movie_df.fillna(0, inplace = True)

In [117]:
mat = movie_df.values
movies = movie_df.columns
names = movie_df.index

# Compute SVD
U, sigma, VT = svd(mat)

# do 3 topics...for now 
k = 20
topics = ['latent_topic_{}'.format(i) for i in range(k)]

# Keep top k concepts for comparison
U = U[:,:k]
sigma = sigma[:k]
VT = VT[:k,:]

# Make pretty
U, sigma, VT = (np.around(x,2) for x in (U,sigma,VT))
U = pd.DataFrame(U, index = names, columns = topics)
VT = pd.DataFrame(VT, index = topics, columns = movies)

print('\nMatrix U: people-topic')
print(U)
print('\nMatrix S: singular values')
print(sigma)
print('\nMatrix V: topic-movies')
print(VT)


Matrix U: people-topic
        latent_topic_0  latent_topic_1  latent_topic_2  latent_topic_3  \
userId                                                                   
1                -0.00           -0.00            0.00           -0.00   
2                -0.01            0.00           -0.00           -0.00   
3                -0.00            0.00           -0.00           -0.00   
4                -0.04            0.01           -0.01            0.01   
5                -0.02            0.01           -0.01            0.02   
...                ...             ...             ...             ...   
667              -0.01            0.00            0.00            0.01   
668              -0.01            0.01           -0.00            0.01   
669              -0.01           -0.01            0.00           -0.00   
670              -0.01            0.00            0.00            0.00   
671              -0.04            0.02            0.01            0.01   

        laten

In [118]:
title_list = []
for x in range(1,len(movie_cols_list)+1):
    title_list.append(pd_movies['title'][x-1])


In [119]:
movie_df.head()

Unnamed: 0_level_0,toy_story_(1995),jumanji_(1995),grumpier_old_men_(1995),waiting_to_exhale_(1995),father_of_the_bride_part_ii_(1995),heat_(1995),sabrina_(1995),tom_and_huck_(1995),sudden_death_(1995),goldeneye_(1995),...,sunshine_state_(2002),hey_arnold!_the_movie_(2002),mr._deeds_(2002),lovely_&_amazing_(2001),look_who's_talking_now_(1993),like_mike_(2002),men_in_black_ii_(a.k.a._miib)_(a.k.a._mib_2)_(2002),"powerpuff_girls,_the_(2002)","crocodile_hunter:_collision_course,_the_(2002)",reign_of_fire_(2002)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
# Let's try using NMF instead....
k = 20 # number of topics
from sklearn.decomposition import NMF
nmf = NMF(n_components = k)
nmf.fit(mat)

W = nmf.transform(mat)
H = nmf.components_

W = pd.DataFrame(W, index = names, columns = topics)
H = pd.DataFrame(H, index = topics, columns = movies)

W,H = (np.around(x,2) for x in (W, H))

# this shows the components 
print(W.head(30), '\n\n', H.head(k))

        latent_topic_0  latent_topic_1  latent_topic_2  latent_topic_3  \
userId                                                                   
1                 0.00            0.02            0.00            0.01   
2                 0.00            0.00            0.00            0.04   
3                 0.03            0.00            0.00            0.02   
4                 0.00            0.00            0.00            0.06   
5                 0.00            0.06            0.00            0.00   
6                 0.00            0.02            0.04            0.00   
7                 0.00            0.02            0.07            0.00   
8                 0.00            0.05            0.00            0.00   
9                 0.00            0.00            0.00            0.00   
10                0.00            0.00            0.08            0.01   
11                0.00            0.06            0.00            0.00   
12                0.04            0.00

In [122]:
# Top 10 movies in topic 0
for i in range(20):
    tpic = i
    num_movies = 10
    top_movies = H.iloc[tpic].sort_values(ascending=False).index[:num_movies]
    print('Topic {}:'.format(i), top_movies)

Topic 0: Index(['schindler's_list_(1993)', 'meet_me_in_st._louis_(1944)',
       'coneheads_(1993)', 'angel_on_my_shoulder_(1946)', 'top_hat_(1935)',
       'pinocchio_(1940)', 'ninotchka_(1939)', 'soylent_green_(1973)',
       'hoodlum_(1997)', 'song_of_the_little_road_(pather_panchali)_(1955)'],
      dtype='object')
Topic 1: Index(['thin_line_between_love_and_hate,_a_(1996)', 'to_catch_a_thief_(1955)',
       'preacher's_wife,_the_(1996)', 'great_mouse_detective,_the_(1986)',
       'flower_of_my_secret,_the_(la_flor_de_mi_secreto)_(1995)',
       'vampire_in_brooklyn_(1995)', 'hearts_and_minds_(1996)',
       'kidnapped_(1960)', 'michael_collins_(1996)',
       'american_in_paris,_an_(1951)'],
      dtype='object')
Topic 2: Index(['seven_(a.k.a._se7en)_(1995)', 'getaway,_the_(1994)',
       'candyman:_farewell_to_the_flesh_(1995)',
       'love_in_the_afternoon_(1957)', 'repo_man_(1984)',
       'meet_me_in_st._louis_(1944)', 'cinderella_(1950)',
       'd2:_the_mighty_ducks_(1994)

In [123]:
W.iloc[1]

latent_topic_0     0.00
latent_topic_1     0.00
latent_topic_2     0.00
latent_topic_3     0.04
latent_topic_4     0.00
latent_topic_5     0.00
latent_topic_6     0.00
latent_topic_7     0.01
latent_topic_8     0.00
latent_topic_9     0.06
latent_topic_10    0.04
latent_topic_11    0.02
latent_topic_12    0.00
latent_topic_13    0.00
latent_topic_14    0.00
latent_topic_15    0.06
latent_topic_16    0.00
latent_topic_17    0.01
latent_topic_18    0.00
latent_topic_19    0.00
Name: 2, dtype: float64