# MODEL 1: ALS Estimator Using Spark

### Step 1 : Set up the environment / necessary housekeeping

In [22]:
!pip install --upgrade pip
!pip install -q findspark

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.0.2)


In [23]:
#install spark
import pyspark
 # get a spark context
sc = pyspark.SparkContext.getOrCreate()
print(sc)
# get a spark session
spark = pyspark.sql.SparkSession.builder.getOrCreate()
print(spark)
spark.version

<SparkContext master=local[*] appName=pyspark-shell>
<pyspark.sql.session.SparkSession object at 0x7f0b47595198>


'2.3.4'

In [24]:
#import findspark
#findspark.init("spark-2.4.5-bin-hadoop2.7")

### Step 2 : Load data

In [25]:
#download the relevant dataset directly from grouplens
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#unzip the movielens zip file
!unzip -o ml-latest-small.zip

--2020-04-28 13:50:48--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.2’


2020-04-28 13:50:48 (6.42 MB/s) - ‘ml-latest-small.zip.2’ saved [978202/978202]

Archive:  ml-latest-small.zip
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [26]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import *
import pandas as pd
import numpy as np

In [27]:
# create a SparkSession
spark = SparkSession.builder.getOrCreate()  

data_movie=pd.read_csv("ml-latest-small/movies.csv")
df_movie=spark.createDataFrame(data_movie)
df_movie.show()

data_ratings=pd.read_csv("ml-latest-small/ratings.csv")
df_ratings=spark.createDataFrame(data_ratings)
df_ratings.show()

data_tags=pd.read_csv("ml-latest-small/tags.csv")
df_tags=spark.createDataFrame(data_tags)
df_tags.show()

df_ratings.createOrReplaceTempView('ratings') 
df_movie.createOrReplaceTempView('movie') 
df_tags.createOrReplaceTempView('tags') 

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [28]:
# split into test and training set
(training, test) = df_ratings.randomSplit([0.8, 0.2]) 
#show columns of training dataset
training.printSchema() 
#show the size of training and test dataset
print(training.count()) 
print(test.count())

root
 |-- userId: long (nullable = true)
 |-- movieId: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

80588
20248


### Step 3 : Create a baseline

In [29]:
#get the average rating and show the result
SQL1 = 'SELECT AVG(rating) FROM ratings'
row = spark.sql(SQL1).collect()[0] 

#access Row as a map 
meanRating = row['avg(rating)'] 
print('meanRating',meanRating)

#get the se as rdd
se_rdd = test.rdd.map(lambda row: Row(se = pow(row['rating']-meanRating,2)) ) 
se_df = spark.createDataFrame(se_rdd) 
se_df.createOrReplaceTempView('se')
print('se_df',se_df)

#calculate the average se as rmse
SQL2 = 'SELECT AVG(se) FROM se'
row = spark.sql(SQL2).collect()[0]
meanSE = row['avg(se)'] # access Row as a map 
print('RMSE',pow(meanSE,0.5))

meanRating 3.501556983616962
se_df DataFrame[se: double]
RMSE 1.0477444165970882


### Step 4 : train ALS estimator and perform cross validation

In [30]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=3, rank=10, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")

#Set the parameter grid
paramGrid = ParamGridBuilder() \
  .addGrid(als.regParam, [0.03,0.1,0.3]) \
  .addGrid(als.rank, [3,10,30]).build()

regEval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

#Start the cross validation and fit on the training dataset 
crossVal = CrossValidator(estimator=als, estimatorParamMaps=paramGrid, evaluator=regEval)
cvModel = crossVal.fit(training)

In [31]:
# Show the metrics form the CrossValidation
print(cvModel.avgMetrics) 

# Gives us the parameter combinations
print(cvModel.getEstimatorParamMaps()) 
paramMap = list(zip(cvModel.getEstimatorParamMaps(),cvModel.avgMetrics))

# Print the parameter that gives us the smallest rmse
paramMin = min(paramMap, key=lambda x: x[1])
print(paramMin)

[1.0425361595819607, 1.120488711370013, 1.2083832440684281, 0.9475788147872286, 0.9535126910421409, 0.9436016488068664, 0.9388136990938469, 0.9482871322438656, 0.9565371915485148]
[{Param(parent='ALS_43ada2e1d01b56419307', name='regParam', doc='regularization parameter (>= 0).'): 0.03, Param(parent='ALS_43ada2e1d01b56419307', name='rank', doc='rank of the factorization'): 3}, {Param(parent='ALS_43ada2e1d01b56419307', name='regParam', doc='regularization parameter (>= 0).'): 0.03, Param(parent='ALS_43ada2e1d01b56419307', name='rank', doc='rank of the factorization'): 10}, {Param(parent='ALS_43ada2e1d01b56419307', name='regParam', doc='regularization parameter (>= 0).'): 0.03, Param(parent='ALS_43ada2e1d01b56419307', name='rank', doc='rank of the factorization'): 30}, {Param(parent='ALS_43ada2e1d01b56419307', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_43ada2e1d01b56419307', name='rank', doc='rank of the factorization'): 3}, {Param(parent='ALS_43ada2e

In [32]:
#Implement the als model with the best parameter
als = ALS(maxIter=3, rank=3, regParam=0.3, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [33]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("RMSE = " + str(rmse))

RMSE = 0.9208259381006187


In [34]:
#Get the prediction result
predictions.show()
predictions.createOrReplaceTempView('predictions') 

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|    91|    471|   1.0|1112713817|  3.420579|
|   409|    471|   3.0| 967912821| 3.6859088|
|    57|    471|   3.0| 969753604| 3.3666618|
|   217|    471|   2.0| 955943727| 2.9410295|
|   136|    471|   4.0| 832450058| 3.3150916|
|   171|    471|   3.0| 866905683| 4.3078547|
|   273|    471|   5.0| 835861348|   3.90487|
|   216|    471|   3.0| 975212641| 3.3883548|
|   469|    471|   5.0| 965425364| 3.5002356|
|   426|    471|   5.0|1451081135| 3.3723133|
|   492|    833|   4.0| 863976674| 2.0843596|
|   463|   1088|   3.5|1145460096| 3.4565172|
|   599|   1088|   2.5|1498515232| 2.6232772|
|   221|   1088|   3.0|1111178147| 3.4920092|
|    68|   1088|   3.5|1158534614| 3.0066447|
|   517|   1088|   1.0|1487958398| 2.2101922|
|    19|   1238|   3.0| 965705784| 3.1612732|
|   156|   1238|   4.0| 946799272| 3.9335265|
|   469|   1238|   5.0| 965425364|

### Step 5 : Movie recommendation results

In [35]:
# Get top 5 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()
# Get top 5 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[3567, 5.126542]...|
|   463|[[25947, 5.370457...|
|   496|[[3567, 4.7320633...|
|   148|[[3567, 4.909178]...|
|   540|[[3567, 5.8491163...|
|   392|[[3567, 4.7341595...|
|   243|[[40491, 6.575885...|
|    31|[[6818, 5.632965]...|
|   516|[[3567, 5.20283],...|
|   580|[[25947, 4.971461...|
|   251|[[3567, 6.2263536...|
|   451|[[40491, 5.669374...|
|    85|[[3567, 5.364459]...|
|   137|[[3567, 5.3103585...|
|    65|[[3567, 5.3338385...|
|   458|[[40491, 6.331005...|
|   481|[[3567, 4.654888]...|
|    53|[[40491, 7.361403...|
|   255|[[3567, 3.836444]...|
|   588|[[3567, 4.773695]...|
+------+--------------------+
only showing top 20 rows



In [36]:
# Get top 5 movie recommendations for a specified set of users
users = df_ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 5)

# Get top 5 user recommendations for a specified set of movies
movies = df_ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 5)

In [37]:
df_rec=userRecs.toPandas()
df_rec[df_rec.userId==198].recommendations

523    [(3567, 5.105193138122559), (6818, 5.100881099...
Name: recommendations, dtype: object

# Surprise

In [1]:
!pip install scikit-surprise
import surprise
from surprise import Reader, Dataset



In [10]:

data_ratings = data_ratings[['userId', 'movieId', 'rating', 'timestamp']]
data_ratings = data_ratings.iloc[:, :-1]
reader = Reader()
data = Dataset.load_from_df(data_ratings[['userId', 'movieId', 'rating']], reader)

In [12]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.2)

In [13]:
from surprise import SVD, accuracy
alg_svd = SVD()
alg_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f0b44758cf8>

In [15]:
pred_svd= alg_svd.test(testset)
pred_svd

[Prediction(uid=424, iid=54503, r_ui=4.0, est=3.8650737211165316, details={'was_impossible': False}),
 Prediction(uid=603, iid=156, r_ui=2.0, est=3.4555683174045337, details={'was_impossible': False}),
 Prediction(uid=493, iid=1214, r_ui=2.0, est=4.093950837057857, details={'was_impossible': False}),
 Prediction(uid=357, iid=74458, r_ui=4.0, est=4.248652933115986, details={'was_impossible': False}),
 Prediction(uid=599, iid=1307, r_ui=3.0, est=3.4255894185149813, details={'was_impossible': False}),
 Prediction(uid=21, iid=114818, r_ui=1.0, est=3.186473081561313, details={'was_impossible': False}),
 Prediction(uid=153, iid=6377, r_ui=1.0, est=2.6819338692676884, details={'was_impossible': False}),
 Prediction(uid=208, iid=198, r_ui=2.0, est=3.2681263591649152, details={'was_impossible': False}),
 Prediction(uid=474, iid=5707, r_ui=3.0, est=3.0482276496725045, details={'was_impossible': False}),
 Prediction(uid=122, iid=3949, r_ui=5.0, est=4.849815889508664, details={'was_impossible': Fa

In [17]:
from surprise import accuracy
accuracy.rmse(pred_svd)

RMSE: 0.8624


0.8624259575760455

# Model 2 : SageMaker's Factorization Machines

In [39]:
#necessary imports
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer
from sagemaker.amazon.amazon_estimator import get_image_uri

from scipy.sparse import lil_matrix
import boto3, io, os

In [40]:
#download data from grouplens
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip
#shuffle the data
%cd ml-100k
!shuf ua.base -o ua.base.shuffled

--2020-04-28 14:06:19--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2020-04-28 14:06:20 (8.23 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

Archive:  ml-100k.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating:

In [41]:
#load training data
train_movie_ratings= pd.read_csv('ua.base.shuffled', sep='\t', 
                                       index_col=False, names=['user_id' , 'movie_id' , 'rating'])
#load test data
test_movie_ratings= pd.read_csv('ua.test', sep='\t', 
                                      index_col=False, names=['user_id' , 'movie_id' , 'rating'])
#train_movie_ratings.head()
#test_movie_ratings.head()

In [42]:
n_users= train_movie_ratings['user_id'].max()
n_movies=train_movie_ratings['movie_id'].max()
n_features=n_users+n_movies

n_test_ratings=len(test_movie_ratings.index)
n_train_ratings=len(train_movie_ratings.index)

In [43]:
print (" number of users: ", n_users)
print (" number of movies: ", n_movies)
print (" Training Count: ", n_train_ratings)
print (" Test Count: ", n_test_ratings)
print (" Features (number of users + number of movies): ", n_features)

 number of users:  943
 number of movies:  1682
 Training Count:  90570
 Test Count:  9430
 Features (number of users + number of movies):  2625


In [44]:
#fm input
def loadDataset(data, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    y = []
    line=0
    for index, row in data.iterrows():
            X[line,row['user_id']-1] = 1
            X[line, n_users+(row['movie_id']-1)] = 1
            if int(row['rating']) >= 4:
                y.append(1)
            else:
                y.append(0)
            line=line+1

    y=np.array(y).astype('float32')            
    return X,y

X_train, y_train = loadDataset(train_movie_ratings, n_train_ratings, n_features)
X_test, y_test = loadDataset(test_movie_ratings, n_test_ratings, n_features)

In [45]:
print(X_test.shape)
print(y_test.shape)
assert X_test.shape  == (n_test_ratings, n_features)
assert y_test.shape  == (n_test_ratings, )
zero_labels = np.count_nonzero(y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, n_test_ratings-zero_labels))

(9430, 2625)
(9430,)
Test labels: 5469 zeros, 3961 ones


In [46]:
#Convert to Protobuf format for saving to S3
bucket = 'movierecom'
prefix = 'fm'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [47]:
def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, y=None):
    Pbuf = io.BytesIO()
    if d_type == "sparse":
        smac.write_spmatrix_to_sparse_tensor(Pbuf, X, labels=y)
    else:
        smac.write_numpy_to_dense_tensor(Pbuf, X, labels=y)
        
    Pbuf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(Pbuf)
    return 's3://{}/{}'.format(bucket,obj)
    
fm_train_data_path = writeDatasetToProtobuf(X_train, bucket, train_prefix, train_key, "sparse", y_train)    
fm_test_data_path  = writeDatasetToProtobuf(X_test, bucket, test_prefix, test_key, "sparse", y_test)    
  
print ("Training data S3 path: ",fm_train_data_path)
print ("Test data S3 path: ",fm_test_data_path)
print ("FM model output S3 path: {}".format(output_prefix))

Training data S3 path:  s3://movierecom/fm/train/train.protobuf
Test data S3 path:  s3://movierecom/fm/test/test.protobuf
FM model output S3 path: s3://movierecom/fm/output


In [48]:
fm = sagemaker.estimator.Estimator(get_image_uri(boto3.Session().region_name, "factorization-machines"),
                                   get_execution_role(), 
                                   train_instance_count=1, 
                                   train_instance_type='ml.m5.large',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

In [49]:
fm.set_hyperparameters(feature_dim=n_features,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=50)

In [50]:
fm.fit({'train': fm_train_data_path, 'test': fm_test_data_path})

2020-04-28 14:06:55 Starting - Starting the training job...
2020-04-28 14:06:56 Starting - Launching requested ML instances...
2020-04-28 14:07:54 Starting - Preparing the instances for training......
2020-04-28 14:08:40 Downloading - Downloading input data...
2020-04-28 14:09:18 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[04/28/2020 14:09:33 INFO 140478314145600] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', 


2020-04-28 14:09:32 Training - Training image download completed. Training in progress.[34m[2020-04-28 14:09:44.764] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 22, "duration": 893, "num_examples": 91, "num_bytes": 5796480}[0m
[34m[04/28/2020 14:09:44 INFO 140478314145600] #quality_metric: host=algo-1, epoch=10, train binary_classification_accuracy <score>=0.709252747253[0m
[34m[04/28/2020 14:09:44 INFO 140478314145600] #quality_metric: host=algo-1, epoch=10, train binary_classification_cross_entropy <loss>=0.610058519301[0m
[34m[04/28/2020 14:09:44 INFO 140478314145600] #quality_metric: host=algo-1, epoch=10, train binary_f_1.000 <score>=0.763163077143[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 896.2259292602539, "sum": 896.2259292602539, "min": 896.2259292602539}}, "EndTime": 1588082984.764859, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 158808298

[34m[2020-04-28 14:09:54.673] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 44, "duration": 893, "num_examples": 91, "num_bytes": 5796480}[0m
[34m[04/28/2020 14:09:54 INFO 140478314145600] #quality_metric: host=algo-1, epoch=21, train binary_classification_accuracy <score>=0.732340659341[0m
[34m[04/28/2020 14:09:54 INFO 140478314145600] #quality_metric: host=algo-1, epoch=21, train binary_classification_cross_entropy <loss>=0.573220805745[0m
[34m[04/28/2020 14:09:54 INFO 140478314145600] #quality_metric: host=algo-1, epoch=21, train binary_f_1.000 <score>=0.770401093463[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 895.8101272583008, "sum": 895.8101272583008, "min": 895.8101272583008}}, "EndTime": 1588082994.674255, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1588082993.777332}
[0m
[34m[04/28/2020 14:09:54 INFO 140478314145600] #progress_metric: host=al

[34m[2020-04-28 14:10:04.443] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 66, "duration": 886, "num_examples": 91, "num_bytes": 5796480}[0m
[34m[04/28/2020 14:10:04 INFO 140478314145600] #quality_metric: host=algo-1, epoch=32, train binary_classification_accuracy <score>=0.73821978022[0m
[34m[04/28/2020 14:10:04 INFO 140478314145600] #quality_metric: host=algo-1, epoch=32, train binary_classification_cross_entropy <loss>=0.554142837021[0m
[34m[04/28/2020 14:10:04 INFO 140478314145600] #quality_metric: host=algo-1, epoch=32, train binary_f_1.000 <score>=0.77219959072[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 888.3960247039795, "sum": 888.3960247039795, "min": 888.3960247039795}}, "EndTime": 1588083004.443954, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1588083003.554575}
[0m
[34m[04/28/2020 14:10:04 INFO 140478314145600] #progress_metric: host=algo


2020-04-28 14:10:24 Uploading - Uploading generated training model[34m[2020-04-28 14:10:09.896] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 78, "duration": 896, "num_examples": 91, "num_bytes": 5796480}[0m
[34m[04/28/2020 14:10:09 INFO 140478314145600] #quality_metric: host=algo-1, epoch=38, train binary_classification_accuracy <score>=0.739956043956[0m
[34m[04/28/2020 14:10:09 INFO 140478314145600] #quality_metric: host=algo-1, epoch=38, train binary_classification_cross_entropy <loss>=0.54694677399[0m
[34m[04/28/2020 14:10:09 INFO 140478314145600] #quality_metric: host=algo-1, epoch=38, train binary_f_1.000 <score>=0.772776156092[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 899.2660045623779, "sum": 899.2660045623779, "min": 899.2660045623779}}, "EndTime": 1588083009.897833, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1588083008.997622}
[0m
[34m[0


2020-04-28 14:10:31 Completed - Training job completed
Training seconds: 111
Billable seconds: 111


### Hyperparameter tuning

In [51]:
# use the best hyperparameters that give us the highest binary classification accuracy
fm.set_hyperparameters(feature_dim=n_features,
                      predictor_type='binary_classifier',
                      mini_batch_size=359,
                      num_factors=64,
                      epochs=403)

### Deploy the model and predict

In [None]:
#deploy fm model and get the prediction 
fm_predictor = fm.deploy(initial_instance_count=1,
                         instance_type='ml.t2.medium')

------------!

### Show prediction for a particular user using endpoint 

In [72]:
import json
from sagemaker.predictor import json_deserializer

def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

In [73]:
#try to predict a single record
prediction = X_test[1000].toarray()
result = fm_predictor.predict(prediction)

print(y_test[1000])
print(result)

0.0
{'predictions': [{'score': 0.5855739712715149, 'predicted_label': 1.0}]}


In [74]:
#try to predict a set of records(100)
predictions = []
for array in np.array_split(X_test[1000:1100].toarray(), 1):
    result = fm_predictor.predict(array)
    predictions += [r['predicted_label'] for r in result['predictions']]

predictions = np.array(predictions)
#predictions.shape

In [75]:
#show confusion matrix
pd.crosstab(y_test[1000:1100], predictions, rownames=['actuals'], colnames=['predictions'])

predictions,0.0,1.0
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,49,12
1.0,22,17


In [None]:
# get the recommendation result for each user

In [104]:
def GetUserSparseMatrix(user_id):
    loc_low=user_id*10
    loc_high=loc_low+10
    print(X_test[loc_low:loc_high])

In [105]:
def GetUserRecIndex(user_id):
    loc_low=user_id*10
    loc_high=loc_low+10
    
    predictions = []
    for array in np.array_split(X_test[loc_low:loc_high].toarray(), 1):
        result = fm_predictor.predict(array)
        predictions += [r['predicted_label'] for r in result['predictions']]
    predictions = np.array(predictions)
   
    pred=result["predictions"]
    for i in pred:
        i["num"]=pred.index(i)
    print(pred)
   
    # fiter records with predicted_label being 1.filter lambda function 
    pred_ft= filter(lambda dic: dic["predicted_label"]==1, pred)
    pred_filter=list(pred_ft)
    #get the top 5 movie recommendaions index for each user
    pred_filter.sort(key=lambda x: x['score'], reverse=True)
    top5_num=pred_filter[:5]
    agg_num=[i["num"] for i in top5_num]
    return agg_num  

In [106]:
user_list=[198,11,314,184,163,710,881,504,267,653]

In [107]:
#Get the movie recommendations for userid 198
GetUserSparseMatrix(198)
GetUserRecIndex(198)

  (0, 198)	1.0
  (0, 1163)	1.0
  (1, 198)	1.0
  (1, 1184)	1.0
  (2, 198)	1.0
  (2, 1200)	1.0
  (3, 198)	1.0
  (3, 1210)	1.0
  (4, 198)	1.0
  (4, 1236)	1.0
  (5, 198)	1.0
  (5, 1264)	1.0
  (6, 198)	1.0
  (6, 1266)	1.0
  (7, 198)	1.0
  (7, 1415)	1.0
  (8, 198)	1.0
  (8, 1620)	1.0
  (9, 198)	1.0
  (9, 1930)	1.0
[{'score': 0.5624204874038696, 'predicted_label': 1.0, 'num': 0}, {'score': 0.6367053985595703, 'predicted_label': 1.0, 'num': 1}, {'score': 0.5564891695976257, 'predicted_label': 1.0, 'num': 2}, {'score': 0.6409008502960205, 'predicted_label': 1.0, 'num': 3}, {'score': 0.21699607372283936, 'predicted_label': 0.0, 'num': 4}, {'score': 0.16096925735473633, 'predicted_label': 0.0, 'num': 5}, {'score': 0.38068827986717224, 'predicted_label': 0.0, 'num': 6}, {'score': 0.33846110105514526, 'predicted_label': 0.0, 'num': 7}, {'score': 0.10655821859836578, 'predicted_label': 0.0, 'num': 8}, {'score': 0.19063179194927216, 'predicted_label': 0.0, 'num': 9}]


[3, 1, 0, 2]

In [111]:
#Get the movie recommendations for userid 198
GetUserSparseMatrix(11)
GetUserRecIndex(11)

  (0, 11)	1.0
  (0, 1024)	1.0
  (1, 11)	1.0
  (1, 1038)	1.0
  (2, 11)	1.0
  (2, 1039)	1.0
  (3, 11)	1.0
  (3, 1074)	1.0
  (4, 11)	1.0
  (4, 1085)	1.0
  (5, 11)	1.0
  (5, 1114)	1.0
  (6, 11)	1.0
  (6, 1146)	1.0
  (7, 11)	1.0
  (7, 1242)	1.0
  (8, 11)	1.0
  (8, 1413)	1.0
  (9, 11)	1.0
  (9, 1677)	1.0
[{'score': 0.7292884588241577, 'predicted_label': 1.0, 'num': 0}, {'score': 0.8354696035385132, 'predicted_label': 1.0, 'num': 1}, {'score': 0.7305694222450256, 'predicted_label': 1.0, 'num': 2}, {'score': 0.8095301389694214, 'predicted_label': 1.0, 'num': 3}, {'score': 0.6869974732398987, 'predicted_label': 1.0, 'num': 4}, {'score': 0.881394624710083, 'predicted_label': 1.0, 'num': 5}, {'score': 0.838738203048706, 'predicted_label': 1.0, 'num': 6}, {'score': 0.7037547826766968, 'predicted_label': 1.0, 'num': 7}, {'score': 0.6795014142990112, 'predicted_label': 1.0, 'num': 8}, {'score': 0.7477938532829285, 'predicted_label': 1.0, 'num': 9}]


[5, 6, 1, 3, 9]

In [114]:
#Get the movie recommendations for userid 314
GetUserSparseMatrix(314)
GetUserRecIndex(314)

  (0, 314)	1.0
  (0, 950)	1.0
  (1, 314)	1.0
  (1, 955)	1.0
  (2, 314)	1.0
  (2, 959)	1.0
  (3, 314)	1.0
  (3, 1040)	1.0
  (4, 314)	1.0
  (4, 1127)	1.0
  (5, 314)	1.0
  (5, 1215)	1.0
  (6, 314)	1.0
  (6, 1247)	1.0
  (7, 314)	1.0
  (7, 1373)	1.0
  (8, 314)	1.0
  (8, 1593)	1.0
  (9, 314)	1.0
  (9, 1599)	1.0
[{'score': 0.7389967441558838, 'predicted_label': 1.0, 'num': 0}, {'score': 0.5338139533996582, 'predicted_label': 1.0, 'num': 1}, {'score': 0.40915682911872864, 'predicted_label': 0.0, 'num': 2}, {'score': 0.9366235136985779, 'predicted_label': 1.0, 'num': 3}, {'score': 0.8401138186454773, 'predicted_label': 1.0, 'num': 4}, {'score': 0.510281503200531, 'predicted_label': 1.0, 'num': 5}, {'score': 0.6082781553268433, 'predicted_label': 1.0, 'num': 6}, {'score': 0.5747615694999695, 'predicted_label': 1.0, 'num': 7}, {'score': 0.8331494927406311, 'predicted_label': 1.0, 'num': 8}, {'score': 0.9067280292510986, 'predicted_label': 1.0, 'num': 9}]


[3, 9, 4, 8, 0]

[{'score': 0.5756932497024536, 'predicted_label': 1.0, 'num': 0},
 {'score': 0.852996289730072, 'predicted_label': 1.0, 'num': 1},
 {'score': 0.7887502312660217, 'predicted_label': 1.0, 'num': 2},
 {'score': 0.5380157828330994, 'predicted_label': 1.0, 'num': 3},
 {'score': 0.6953046917915344, 'predicted_label': 1.0, 'num': 4},
 {'score': 0.7003229260444641, 'predicted_label': 1.0, 'num': 5},
 {'score': 0.4947049021720886, 'predicted_label': 0.0, 'num': 6},
 {'score': 0.5453704595565796, 'predicted_label': 1.0, 'num': 7},
 {'score': 0.26833316683769226, 'predicted_label': 0.0, 'num': 8},
 {'score': 0.47330421209335327, 'predicted_label': 0.0, 'num': 9}]

In [117]:
#Get the movie recommendations for userid 184
GetUserSparseMatrix(184)
GetUserRecIndex(184)

  (0, 184)	1.0
  (0, 965)	1.0
  (1, 184)	1.0
  (1, 1056)	1.0
  (2, 184)	1.0
  (2, 1069)	1.0
  (3, 184)	1.0
  (3, 1141)	1.0
  (4, 184)	1.0
  (4, 1158)	1.0
  (5, 184)	1.0
  (5, 1179)	1.0
  (6, 184)	1.0
  (6, 1211)	1.0
  (7, 184)	1.0
  (7, 1229)	1.0
  (8, 184)	1.0
  (8, 1470)	1.0
  (9, 184)	1.0
  (9, 1682)	1.0
[{'score': 0.8078263998031616, 'predicted_label': 1.0, 'num': 0}, {'score': 0.8506773710250854, 'predicted_label': 1.0, 'num': 1}, {'score': 0.9115203619003296, 'predicted_label': 1.0, 'num': 2}, {'score': 0.869451105594635, 'predicted_label': 1.0, 'num': 3}, {'score': 0.7822884321212769, 'predicted_label': 1.0, 'num': 4}, {'score': 0.7223358750343323, 'predicted_label': 1.0, 'num': 5}, {'score': 0.792839765548706, 'predicted_label': 1.0, 'num': 6}, {'score': 0.6182292699813843, 'predicted_label': 1.0, 'num': 7}, {'score': 0.7695117592811584, 'predicted_label': 1.0, 'num': 8}, {'score': 0.5357170701026917, 'predicted_label': 1.0, 'num': 9}]


[2, 3, 1, 0, 6]

In [120]:
#Get the movie recommendations for userid 163
GetUserSparseMatrix(163)
GetUserRecIndex(163)

  (0, 163)	1.0
  (0, 1059)	1.0
  (1, 163)	1.0
  (1, 1223)	1.0
  (2, 163)	1.0
  (2, 1241)	1.0
  (3, 163)	1.0
  (3, 1348)	1.0
  (4, 163)	1.0
  (4, 1400)	1.0
  (5, 163)	1.0
  (5, 1539)	1.0
  (6, 163)	1.0
  (6, 1562)	1.0
  (7, 163)	1.0
  (7, 1627)	1.0
  (8, 163)	1.0
  (8, 1631)	1.0
  (9, 163)	1.0
  (9, 1872)	1.0
[{'score': 0.7355425953865051, 'predicted_label': 1.0, 'num': 0}, {'score': 0.6559515595436096, 'predicted_label': 1.0, 'num': 1}, {'score': 0.5307829976081848, 'predicted_label': 1.0, 'num': 2}, {'score': 0.4976571202278137, 'predicted_label': 0.0, 'num': 3}, {'score': 0.547878086566925, 'predicted_label': 1.0, 'num': 4}, {'score': 0.6790823936462402, 'predicted_label': 1.0, 'num': 5}, {'score': 0.5190690159797668, 'predicted_label': 1.0, 'num': 6}, {'score': 0.5941677093505859, 'predicted_label': 1.0, 'num': 7}, {'score': 0.550855278968811, 'predicted_label': 1.0, 'num': 8}, {'score': 0.5395510196685791, 'predicted_label': 1.0, 'num': 9}]


[0, 5, 1, 7, 8]

In [121]:
#Get the movie recommendations for userid 710
GetUserSparseMatrix(710)
GetUserRecIndex(710)

  (0, 710)	1.0
  (0, 1006)	1.0
  (1, 710)	1.0
  (1, 1037)	1.0
  (2, 710)	1.0
  (2, 1146)	1.0
  (3, 710)	1.0
  (3, 1211)	1.0
  (4, 710)	1.0
  (4, 1285)	1.0
  (5, 710)	1.0
  (5, 1363)	1.0
  (6, 710)	1.0
  (6, 1594)	1.0
  (7, 710)	1.0
  (7, 1626)	1.0
  (8, 710)	1.0
  (8, 1662)	1.0
  (9, 710)	1.0
  (9, 2016)	1.0
[{'score': 0.9315847754478455, 'predicted_label': 1.0, 'num': 0}, {'score': 0.6610450148582458, 'predicted_label': 1.0, 'num': 1}, {'score': 0.730004072189331, 'predicted_label': 1.0, 'num': 2}, {'score': 0.823795735836029, 'predicted_label': 1.0, 'num': 3}, {'score': 0.30516061186790466, 'predicted_label': 0.0, 'num': 4}, {'score': 0.6099883913993835, 'predicted_label': 1.0, 'num': 5}, {'score': 0.6844464540481567, 'predicted_label': 1.0, 'num': 6}, {'score': 0.6452544331550598, 'predicted_label': 1.0, 'num': 7}, {'score': 0.2455539107322693, 'predicted_label': 0.0, 'num': 8}, {'score': 0.2784949839115143, 'predicted_label': 0.0, 'num': 9}]


[0, 3, 2, 6, 1]

In [130]:
#Get the movie recommendations for userid 881
GetUserSparseMatrix(881)
GetUserRecIndex(881)

  (0, 881)	1.0
  (0, 1013)	1.0
  (1, 881)	1.0
  (1, 1082)	1.0
  (2, 881)	1.0
  (2, 1115)	1.0
  (3, 881)	1.0
  (3, 1128)	1.0
  (4, 881)	1.0
  (4, 1135)	1.0
  (5, 881)	1.0
  (5, 1146)	1.0
  (6, 881)	1.0
  (6, 1157)	1.0
  (7, 881)	1.0
  (7, 1351)	1.0
  (8, 881)	1.0
  (8, 1457)	1.0
  (9, 881)	1.0
  (9, 1558)	1.0
[{'score': 0.7156787514686584, 'predicted_label': 1.0, 'num': 0}, {'score': 0.42107969522476196, 'predicted_label': 0.0, 'num': 1}, {'score': 0.8723925948143005, 'predicted_label': 1.0, 'num': 2}, {'score': 0.7373551726341248, 'predicted_label': 1.0, 'num': 3}, {'score': 0.8172374963760376, 'predicted_label': 1.0, 'num': 4}, {'score': 0.8774584531784058, 'predicted_label': 1.0, 'num': 5}, {'score': 0.7977723479270935, 'predicted_label': 1.0, 'num': 6}, {'score': 0.30738526582717896, 'predicted_label': 0.0, 'num': 7}, {'score': 0.8935927152633667, 'predicted_label': 1.0, 'num': 8}, {'score': 0.5588003396987915, 'predicted_label': 1.0, 'num': 9}]


[8, 5, 2, 4, 6]

In [133]:
#Get the movie recommendations for userid 504
GetUserSparseMatrix(504)
GetUserRecIndex(504)

  (0, 504)	1.0
  (0, 1041)	1.0
  (1, 504)	1.0
  (1, 1082)	1.0
  (2, 504)	1.0
  (2, 1129)	1.0
  (3, 504)	1.0
  (3, 1135)	1.0
  (4, 504)	1.0
  (4, 1187)	1.0
  (5, 504)	1.0
  (5, 1440)	1.0
  (6, 504)	1.0
  (6, 1533)	1.0
  (7, 504)	1.0
  (7, 1655)	1.0
  (8, 504)	1.0
  (8, 1690)	1.0
  (9, 504)	1.0
  (9, 1931)	1.0
[{'score': 0.41778478026390076, 'predicted_label': 0.0, 'num': 0}, {'score': 0.4515100121498108, 'predicted_label': 0.0, 'num': 1}, {'score': 0.39301714301109314, 'predicted_label': 0.0, 'num': 2}, {'score': 0.41481295228004456, 'predicted_label': 0.0, 'num': 3}, {'score': 0.457653284072876, 'predicted_label': 0.0, 'num': 4}, {'score': 0.40774717926979065, 'predicted_label': 0.0, 'num': 5}, {'score': 0.4591212570667267, 'predicted_label': 0.0, 'num': 6}, {'score': 0.43700599670410156, 'predicted_label': 0.0, 'num': 7}, {'score': 0.5131455063819885, 'predicted_label': 1.0, 'num': 8}, {'score': 0.4213246703147888, 'predicted_label': 0.0, 'num': 9}]


[8]

In [135]:
#Get the movie recommendations for userid 267
GetUserSparseMatrix(267)
GetUserRecIndex(267)

  (0, 267)	1.0
  (0, 944)	1.0
  (1, 267)	1.0
  (1, 1004)	1.0
  (2, 267)	1.0
  (2, 1087)	1.0
  (3, 267)	1.0
  (3, 1089)	1.0
  (4, 267)	1.0
  (4, 1159)	1.0
  (5, 267)	1.0
  (5, 1173)	1.0
  (6, 267)	1.0
  (6, 1378)	1.0
  (7, 267)	1.0
  (7, 1766)	1.0
  (8, 267)	1.0
  (8, 1872)	1.0
  (9, 267)	1.0
  (9, 1977)	1.0
[{'score': 0.1136145293712616, 'predicted_label': 0.0, 'num': 0}, {'score': 0.11715386062860489, 'predicted_label': 0.0, 'num': 1}, {'score': 0.07176440954208374, 'predicted_label': 0.0, 'num': 2}, {'score': 0.18795707821846008, 'predicted_label': 0.0, 'num': 3}, {'score': 0.17907488346099854, 'predicted_label': 0.0, 'num': 4}, {'score': 0.05686260014772415, 'predicted_label': 0.0, 'num': 5}, {'score': 0.4036732316017151, 'predicted_label': 0.0, 'num': 6}, {'score': 0.1378893405199051, 'predicted_label': 0.0, 'num': 7}, {'score': 0.03702300414443016, 'predicted_label': 0.0, 'num': 8}, {'score': 0.13583563268184662, 'predicted_label': 0.0, 'num': 9}]


[]

In [137]:
#Get the movie recommendations for userid 653
GetUserSparseMatrix(653)
GetUserRecIndex(653)

  (0, 653)	1.0
  (0, 946)	1.0
  (1, 653)	1.0
  (1, 1008)	1.0
  (2, 653)	1.0
  (2, 1164)	1.0
  (3, 653)	1.0
  (3, 1192)	1.0
  (4, 653)	1.0
  (4, 1200)	1.0
  (5, 653)	1.0
  (5, 1260)	1.0
  (6, 653)	1.0
  (6, 1365)	1.0
  (7, 653)	1.0
  (7, 1488)	1.0
  (8, 653)	1.0
  (8, 1500)	1.0
  (9, 653)	1.0
  (9, 1763)	1.0
[{'score': 0.5937554836273193, 'predicted_label': 1.0, 'num': 0}, {'score': 0.5891273021697998, 'predicted_label': 1.0, 'num': 1}, {'score': 0.7062790393829346, 'predicted_label': 1.0, 'num': 2}, {'score': 0.534187912940979, 'predicted_label': 1.0, 'num': 3}, {'score': 0.7313146591186523, 'predicted_label': 1.0, 'num': 4}, {'score': 0.8397791981697083, 'predicted_label': 1.0, 'num': 5}, {'score': 0.7516844868659973, 'predicted_label': 1.0, 'num': 6}, {'score': 0.4163616895675659, 'predicted_label': 0.0, 'num': 7}, {'score': 0.5472491979598999, 'predicted_label': 1.0, 'num': 8}, {'score': 0.37141311168670654, 'predicted_label': 0.0, 'num': 9}]


[5, 6, 4, 2, 0]

# test endpoint

In [None]:
#import sagemaker

#sagemaker.Session().delete_endpoint(fm_predictor.endpoint)

# Model 3 : Surprise