In [15]:
import numpy as np
import pandas as pd 
import os
import math
import time
import urllib
import zipfile
from numpy.linalg import svd
import matplotlib.pyplot as plt 

import pyspark
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import countDistinct, col, lit
from pyspark.sql.types import StructType, StructField, IntegerType

# Build our Spark Session and Context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
# read in dataframes
pd_ratings = pd.read_csv('../data/movies/ratings.csv')
pd_movies = pd.read_csv('../data/movies/movies.csv')

# Convert a Pandas df to a Spark df
ratings_df = spark.createDataFrame(pd_ratings) 

# For titles later
movies_df = spark.read.csv('../data/movies/movies.csv',header=True)

# # Convert a Spark df to a Pandas df
# pandas_df = spark_df.toPandas()

# Cut out timestamp column
ratings_df = ratings_df.drop(ratings_df['timestamp'])

In [3]:
# To convert predictions matrix into a more interpretable form
def convert_movie_id(pd_ratings, pd_movies):
    for x in range(1,len(pd_movies)):
        pd_ratings.replace({'movieId': x}, pd_movies['title'][x-1],inplace=True)
    return pd_ratings

In [4]:
# convert_movie_id(pd_ratings, pd_movies)
# pd_ratings.head()

In [5]:
# train-test split
train, test = ratings_df.randomSplit([.8, .20], seed=42)

In [6]:
factor_model = ALS(
    itemCol='movieId',
    userCol='userId',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy='drop',
    rank=20)

recommender = factor_model.fit(train)

In [7]:
# Fit model
model_start_time=time.time()
ratings=factor_model.fit(train)
stop_train_time=time.time()
print(stop_train_time-model_start_time)

4.779560565948486


In [8]:
# Make predictions
predict=ratings.transform(test)
transform_test_time=time.time()
print(transform_test_time-stop_train_time)

0.0736396312713623


In [27]:
# Convert a Spark df to a Pandas df
predict_df = predict.toPandas()

In [28]:
predict_df.head()

Unnamed: 0,userId,movieId,rating,prediction
0,588,471,3.0,3.651058
1,350,471,3.0,3.470955
2,306,471,3.0,3.482509
3,491,471,3.0,4.067975
4,299,471,4.5,4.222585


In [32]:
movie_df = predict_df.pivot(index='userId', columns='movieId', values='rating')

movie_cols = movie_df.columns
movie_cols_list = list(movie_cols)
len(movie_cols_list)

title_list = []
for x in range(1,len(movie_cols_list)+1):
    title_list.append(pd_movies['title'][x-1])

movie_df.columns = title_list

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Sunshine State (2002),Hey Arnold! The Movie (2002),Mr. Deeds (2002),Lovely & Amazing (2001),Look Who's Talking Now (1993),Like Mike (2002),Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002),"Powerpuff Girls, The (2002)","Crocodile Hunter: Collision Course, The (2002)",Reign of Fire (2002)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,3.0,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [88]:
title_list = []
for x in range(1,len(movie_cols_list)+1):
    title_list.append(pd_movies['title'][x-1])


['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two

In [38]:
movie_df.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,140174,142448,142488,142997,148626,149406,152081,156607,157296,160438
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [36]:
# clean up data frame
movie_df.columns = [col.replace(' ', '_') for col in movie_df.columns]
movie_df.fillna(0, inplace = True)
movie_df.set_index('name', inplace = True)

AttributeError: 'int' object has no attribute 'replace'