In [1]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
movies=pd.read_csv('../data/movies/movies.csv')
pd_ratings= pd.read_csv('../data/movies/ratings.csv')
pd_ratings=pd_ratings.drop('timestamp', axis=1)
tags=pd.read_csv('../data/movies/tags.csv')
links=pd.read_csv('../data/movies/movies.csv')

In [3]:
pd_ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

 Setup a SparkSession
spark = SparkSession.builder.getOrCreate()
...

# Convert a Pandas DF to a Spark DF
spark_df = spark.createDataFrame(pandas_df) 

# Convert a Spark DF to a Pandas DF
pandas_df = spark_df.toPandas()

In [4]:
spark = SparkSession.builder.getOrCreate()
spark_ratings= spark.createDataFrame(pd_ratings) 
train, test = spark_ratings.randomSplit([0.8, 0.2], seed=42)

In [5]:
factor_model = ALS(
    itemCol='movieId',
    userCol='userId',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy='drop',
    rank=20) 

In [6]:
ratings=factor_model.fit(train)

In [7]:
test.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   2105|   4.0|
|     1|   2294|   2.0|
|     2|    186|   3.0|
|     2|    300|   3.0|
|     2|    314|   4.0|
|     2|    319|   1.0|
|     2|    364|   3.0|
|     2|    372|   3.0|
|     2|    508|   4.0|
|     2|    550|   3.0|
|     2|    552|   3.0|
|     3|    267|   3.0|
|     3|   2318|   4.0|
|     3|   5349|   3.0|
|     3|   7153|   2.5|
|     3|   7361|   3.0|
|     3|  27369|   3.5|
+------+-------+------+
only showing top 20 rows



In [8]:
predict=ratings.transform(test)

predictions_df = predict.toPandas()
train= train.toPandas()
predictions_df = predict.toPandas().fillna(train['rating'].mean())

test_pd=test.toPandas()
predictions_df['squared_error'] = (predictions_df['rating'] - predictions_df['prediction'])**2
#print (predictions_df)


In [9]:
np.sqrt(sum(predictions_df['squared_error']) / len(predictions_df))

0.9034773158546282

In [10]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predict)

In [11]:
print (rmse)

0.9034773158546396


In [12]:
userRecs = ratings.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = ratings.recommendForAllItems(10)

In [13]:
best_movies=userRecs.toPandas()

In [14]:
best_movies.head()

Unnamed: 0,userId,recommendations
0,471,"[(83411, 5.036449909210205), (67504, 5.0364499..."
1,463,"[(83411, 5.122406005859375), (67504, 5.1224060..."
2,496,"[(76173, 5.291072368621826), (59684, 5.2908868..."
3,148,"[(83411, 5.813085556030273), (67504, 5.8130855..."
4,540,"[(2071, 5.177724361419678), (76173, 5.15821218..."


baselines.py  Untitled1.ipynb  Untitled.ipynb


'/home/jovyan/work/Documents/EDA/recommender-case-study/src'