In [6]:
import pandas as pd
import pyspark
import time
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

seconds = time.time()

In [7]:
movies=pd.read_csv('../data/movies/movies.csv')
pd_ratings= pd.read_csv('../data/movies/ratings.csv')
pd_ratings=pd_ratings.drop('timestamp', axis=1)
tags=pd.read_csv('../data/movies/tags.csv')
links=pd.read_csv('../data/movies/movies.csv')

In [8]:
pd_ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

 Setup a SparkSession
spark = SparkSession.builder.getOrCreate()
...

# Convert a Pandas DF to a Spark DF
spark_df = spark.createDataFrame(pandas_df) 

# Convert a Spark DF to a Pandas DF
pandas_df = spark_df.toPandas()

In [9]:
spark = SparkSession.builder.getOrCreate()
spark_ratings= spark.createDataFrame(pd_ratings) 
train, test = spark_ratings.randomSplit([0.8, 0.2], seed=42)

In [10]:
factor_model = ALS(
    itemCol='movieId',
    userCol='userId',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy='drop',
    rank=20) 

In [11]:
ratings=factor_model.fit(train)

In [12]:
test.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   2105|   4.0|
|     1|   2294|   2.0|
|     2|    186|   3.0|
|     2|    300|   3.0|
|     2|    314|   4.0|
|     2|    319|   1.0|
|     2|    364|   3.0|
|     2|    372|   3.0|
|     2|    508|   4.0|
|     2|    550|   3.0|
|     2|    552|   3.0|
|     3|    267|   3.0|
|     3|   2318|   4.0|
|     3|   5349|   3.0|
|     3|   7153|   2.5|
|     3|   7361|   3.0|
|     3|  27369|   3.5|
+------+-------+------+
only showing top 20 rows



In [18]:
predict=ratings.transform(test)

predictions_df = predict.toPandas()
train= train.toPandas()
predictions_df = predict.toPandas().fillna(train['rating'].mean())

test_pd=test.toPandas()
predictions_df['squared_error'] = (predictions_df['rating'] - predictions_df['prediction'])**2
#print (predictions_df)

In [19]:
np.sqrt(sum(predictions_df['squared_error']) / len(predictions_df))

0.905366100045955

In [20]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predict)

In [21]:
print (rmse)

0.9053661000459691


In [22]:
def top_x(n):
    counter=np.linspace(1,n, num=n)
    top_choices=[]
    for place in range(1,n+1):
        if place == 1:
            choice = '1st'
        if place == 2: 
            choice = '2nd'
        if place == 3:  
            choice = '3rd'
        else:
            choice = str(place) + 'th'
        top_choices.append(choice)
    return top_choices


In [23]:
n=10
col_names=top_x(n)
userRecs = ratings.recommendForAllUsers(n)
# Generate top 10 user recommendations for each movie
movieRecs = ratings.recommendForAllItems(n)
col_names

['1th', '2th', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [47]:
best_movies=userRecs.toPandas()
best_movies.recommendations[0]

[Row(movieId=83411, rating=4.8572258949279785),
 Row(movieId=67504, rating=4.8572258949279785),
 Row(movieId=83318, rating=4.8572258949279785),
 Row(movieId=59684, rating=4.756404399871826),
 Row(movieId=31435, rating=4.756404399871826),
 Row(movieId=3414, rating=4.695611953735352),
 Row(movieId=3038, rating=4.569250106811523),
 Row(movieId=134130, rating=4.555432319641113),
 Row(movieId=76173, rating=4.537815570831299),
 Row(movieId=8132, rating=4.528042316436768)]

In [39]:
recs.head()

Unnamed: 0,0,1
0,83411,4.857226
1,67504,4.857226
2,83318,4.857226
3,59684,4.756404
4,31435,4.756404


In [45]:
recs = best_movies.recommendations[0]
recs = pd.DataFrame(recs)
recs.columns = ['movieId','ratings']

for x in range(1,len(pd_movies)):
    recs.replace({'movieId': x}, pd_movies['title'][x-1],inplace=True)

Unnamed: 0,movieId,ratings
0,83411,4.857226
1,67504,4.857226
2,83318,4.857226
3,59684,4.756404
4,31435,4.756404


In [16]:
#df[['b1', 'b2']] = pd.DataFrame(df['b'].tolist(), index=df.index)

ranked=pd.DataFrame(best_movies['recommendations'].tolist(), index=best_movies.index)

#best_movies[['Best','2nd,','3rd']]=pd.DataFrame(best_movies['recommendations'], index=best_movies.index)