In [1]:
import os
import sys

In [2]:
# Configure the environment PySpark reads at start-up: submit arguments
# (in-memory catalog), the Python interpreter to use, and the Spark client
# installation directory.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

# Guard against an empty SPARK_HOME. It cannot trigger while the assignment
# above is in place; it is kept so the cell still fails loudly if that
# assignment is ever removed or blanked.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the Spark-bundled pyspark and py4j packages importable ahead of anything
# else on sys.path. NOTE(review): the py4j archive name is version-pinned and
# presumably has to match the installed Spark client — confirm on upgrade.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell start-up script in this namespace. The file is
# read inside a `with` block so the handle is closed before the script runs
# (the previous `exec(open(...).read())` left it open), and the source is
# compiled with its real path so tracebacks point at shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as f

In [4]:
# Explicit column types for the ratings CSV files: two integer ids and a float rating.
logs_schema = (
    StructType()
    .add("userId", IntegerType())
    .add("movieId", IntegerType())
    .add("rating", FloatType())
)

In [5]:
# Read the train and test CSVs with the same settings: explicit schema,
# comma separator, first line treated as a header.
train_data = (
    spark.read
    .format("csv")
    .schema(logs_schema)
    .options(sep=',', header='true')
    .load("/labs/lab09data/train.csv")
)
test_data = (
    spark.read
    .format("csv")
    .schema(logs_schema)
    .options(sep=',', header='true')
    .load("/labs/lab09data/test.csv")
)

In [6]:
# Baseline rating: subtracted from ratings when the user/film offsets are
# computed and added back when the final prediction is assembled.
# NOTE(review): hard-coded value; presumably the mean rating of the training
# set — confirm, or recompute it from train_data.
average_rating = 3.52186

In [7]:
# Per-user offset from the baseline rating:
#   user_predictor = 1 / (count(movieId) + 10) * sum(rating - average_rating)
# The +10 in the denominator damps the offset for users with few ratings.
user_rating_count = f.count(f.col("movieId"))
user_residual_sum = f.sum(f.col("rating") - average_rating)
base_user_predictor = (
    train_data
    .groupBy("userId")
    .agg(((1 / (user_rating_count + 10)) * user_residual_sum).alias('user_predictor'))
)

In [8]:
# Attach each user's offset to every training row (inner join on userId).
films_data_with_user_predictor = train_data.join(
    base_user_predictor,
    on=['userId'],
    how='inner',
)

In [9]:
# Per-film offset, computed on what is left of each rating after removing the
# user's offset and the baseline:
#   film_predictor = 1 / (count(userId) + 25) * sum(rating - user_predictor - average_rating)
# The +25 in the denominator damps the offset for films with few ratings.
film_rating_count = f.count(f.col("userId"))
film_residual_sum = f.sum(
    f.col("rating") - f.col("user_predictor") - average_rating
)
base_film_predictor = (
    films_data_with_user_predictor
    .groupBy("movieId")
    .agg(((1 / (film_rating_count + 25)) * film_residual_sum).alias('film_predictor'))
)

In [10]:
# Look up the user and film offsets for every test row. Left joins keep test
# rows whose user or film never appeared in the training data; those rows come
# back with a NULL offset.
#
# A missing offset must contribute nothing, so NULLs are replaced with 0.0.
# The previous fill value was average_rating, which the final prediction
# (user_predictor + film_predictor + average_rating) then counted a second
# time for each missing offset, inflating cold-start predictions. The fill is
# also limited to the two offset columns so no other column of the test data
# is altered.
combined = (
    test_data
    .join(base_user_predictor, on='userId', how='left')
    .join(base_film_predictor, on='movieId', how='left')
    .fillna(0.0, subset=['user_predictor', 'film_predictor'])
)

In [11]:
# Final prediction = user offset + film offset + baseline rating, collected to
# the driver as a pandas DataFrame with columns userId, movieId, rating.
predicted_rating = f.col("user_predictor") + f.col("film_predictor") + average_rating
result_df = (
    combined
    .select("userId", "movieId", predicted_rating.alias('rating'))
    .toPandas()
)

In [12]:
# Write the predictions ordered by (userId, movieId), without the pandas index column.
sorted_result = result_df.sort_values(by=['userId', 'movieId'])
sorted_result.to_csv("lab09.csv", index=False)

In [13]:
# Shut down the Spark context once the results have been written to disk.
# NOTE(review): `sc` is not assigned in any cell of this notebook; presumably
# it is created by the exec'd pyspark/shell.py start-up script — confirm.
sc.stop()