In [1]:
import pandas as pd
import random
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

from utils import tools, data_preprocessing
from MF_algorithms import eALS

In [2]:
# Locate the local Spark installation before importing pyspark.
# Per the findspark documentation, init() makes pyspark importable by
# adding it to sys.path, so it has to run before `import pyspark`.
import findspark
findspark.init()
import pyspark

# Last expression of the cell: display the Spark home that findspark found.
# (An earlier duplicate call whose result was discarded has been removed.)
findspark.find()

'/usr/local/opt/apache-spark/libexec'

In [3]:
# Start (or reuse) a local Spark context and a SparkSession on top of it.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Master, application name and driver host all live on one SparkConf so the
# same object can be handed to both getOrCreate() calls below.
conf = (
    SparkConf()
    .setMaster('local')
    .setAppName('myAppName')
    .set('spark.driver.host', '127.0.0.1')
)

# getOrCreate() returns the already-running context when this cell is
# re-executed; constructing SparkContext(...) directly raises on a second run
# because only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

# The builder reuses the active SparkContext created above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# Data preprocessing
# data_preprocessing is given the raw ratings file and the CSV path that is
# loaded below -- presumably it writes the CSV at data_path; confirm in utils.
data_path = "data/data_yelp.csv"
data = data_preprocessing("data/yelp_rating.txt", data_path)

# Creating RDD
# Read the CSV with a header row and inferred column types.
rating_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .load(data_path)
)

rating_rdd = rating_df.rdd
# collect() brings every row back to the driver as a Python list, which is
# what the rating-matrix helper below is given.
rating_list = rating_rdd.collect()
rating_df.show()

# Building Rating matrix
# The helper is re-imported under a private alias so that this cell can be
# re-run: `tools = tools(...)` rebinds the name `tools` to the instance, and a
# second execution would then try to call the instance instead of utils.tools.
# `tools` still ends up bound to the instance, as later cells expect.
from utils import tools as _tools_factory
tools = _tools_factory(rating_list, data_path)
R = tools.set_data_matrix()

+------+-------+-------+
|rating|user_id|item_id|
+------+-------+-------+
|   4.0|      0|      0|
|   4.0|      0|      1|
|   4.0|      0|      2|
|   2.0|      0|      3|
|   3.0|      0|      4|
|   2.0|      0|      5|
|   4.0|      0|      6|
|   4.0|      0|      7|
|   3.0|      0|      8|
|   4.0|      0|      9|
|   3.0|      0|     10|
|   4.0|      0|     11|
|   2.0|      0|     12|
|   2.0|      0|     13|
|   5.0|      0|     14|
|   4.0|      0|     15|
|   4.0|      0|     16|
|   2.0|      0|     17|
|   3.0|      0|     18|
|   2.0|      0|     19|
+------+-------+-------+
only showing top 20 rows



In [5]:
# Fit the eALS matrix-factorization model on the rating matrix R.
# fit() is handed the iteration budget and the SparkContext and returns two
# objects, kept as U and V (presumably the user and item factors -- confirm
# in MF_algorithms). The run logs the loss at iterations 1, 6, ..., 51.
max_iterations = 51
MF = eALS(R)
U, V = MF.fit(max_iterations, sc)

Iteration: 1 Loss: 8589.820411207815
Iteration: 6 Loss: 1235.1393791023465
Iteration: 11 Loss: 842.7330110936747
Iteration: 16 Loss: 653.7169362825622
Iteration: 21 Loss: 539.9239614754521
Iteration: 26 Loss: 462.5856323224889
Iteration: 31 Loss: 405.5263087262868
Iteration: 36 Loss: 361.182980401021
Iteration: 41 Loss: 325.51336226055884
Iteration: 46 Loss: 296.1056192215371
Iteration: 51 Loss: 271.38479738051495


In [10]:
# Score the factorization: tools.rmse(U, V) is called on the helper instance
# built in the rating-matrix cell, and the result is reported to 5 decimals.
RMSE = tools.rmse(U, V)
rmse_message = "RMSE of predicted R and the true R of %.5f" % RMSE
print(rmse_message)

RMSE of predicted R and the true R of 0.00815
