In [1]:
import os
import sys

In [2]:
# Configure the environment PySpark reads at start-up: submit arguments (with an
# in-memory SQL catalog), the Python interpreter to use, and the Spark install.
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put the pyspark package and the py4j archive under SPARK_HOME at the front
# of the import path.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute pyspark's shell.py in this namespace (its banner reports that the
# session is available as `spark`). The script is read inside a `with` block so
# the file handle is closed deterministically, and compiled with its real path
# so tracebacks point at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_script:
    exec(compile(shell_script.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.sql.types import *
from pyspark.sql import Window
import pyspark.sql.functions as f
import json

In [4]:
# Explicit schema for the ratings file: three integer columns followed by a
# string column holding the time field.
films_schema = (
    StructType()
    .add("user_id", IntegerType())
    .add("film_id", IntegerType())
    .add("score", IntegerType())
    .add("time", StringType())
)

In [5]:
# Load the tab-separated ratings file using the explicit schema.
films_data = spark.read.csv(
    "/labs/lab06data/ml-100k/u.data",
    schema=films_schema,
    sep="\t",
)

In [6]:
# Global mean rating: sum of all scores divided by the count of film_id values,
# pulled back to the driver as a plain Python number.
average_rating = (
    films_data
    .agg((f.sum(films_data.score) / f.count(films_data.film_id)).alias("average_rating"))
    .first()["average_rating"]
)

In [7]:
# Per-user baseline: 1 / (n_ratings + 10) * sum(score - global mean).
# The +10 in the denominator damps the offset for users with few ratings.
base_user_predictor = (
    films_data
    .groupBy("user_id")
    .agg(
        (
            1 / (f.count(f.col("film_id")) + 10)
            * f.sum(f.col("score") - average_rating)
        ).alias('user_predictor')
    )
)

In [8]:
# Attach each user's baseline to every one of that user's rating rows.
films_data_with_user_predictor = films_data.join(
    base_user_predictor,
    on=['user_id'],
    how='inner',
)

In [9]:
# Per-film baseline: 1 / (n_ratings + 25) * sum(score - user baseline - global mean).
base_film_predictor = (
    films_data_with_user_predictor
    .groupBy("film_id")
    .agg(
        (
            1 / (f.count(f.col("user_id")) + 25)
            * f.sum(f.col("score") - f.col("user_predictor") - average_rating)
        ).alias('film_predictor')
    )
)

In [10]:
# Baseline prediction for every (film, user) pair, rated or not:
# user baseline + film baseline + global mean.
predictors_combined_all = (
    base_film_predictor
    .crossJoin(base_user_predictor)
    .withColumn(
        "total_predictor",
        f.col("user_predictor") + f.col("film_predictor") + average_rating,
    )
)

In [11]:
# Same baseline prediction, but only for the (film, user) pairs that have a rating row.
predictors_combined = (
    films_data_with_user_predictor
    .join(base_film_predictor, on=['film_id'], how='inner')
    .withColumn(
        "total_predictor",
        f.col("user_predictor") + f.col("film_predictor") + average_rating,
    )
)

In [12]:
# Residual of each observed rating: actual score minus the baseline prediction.
predictors_relative = predictors_combined.select(
    "film_id",
    "user_id",
    (f.col("score") - f.col("total_predictor")).alias('relative_predictor'),
)

In [13]:
# Copy of the residual table with every column suffixed "2", so it can be
# joined against tables that carry the unsuffixed column names.
predictors_relative_2 = predictors_relative.select(
    *[
        predictors_relative[column_name].alias(column_name + "2")
        for column_name in ("film_id", "user_id", "relative_predictor")
    ]
)

In [14]:
# Right join keeps every (film, user) pair from the full grid; pairs with no
# matching residual row come back null and are then filled with 0 by na.fill.
predictors_relative_all = (
    predictors_relative_2
    .join(
        predictors_combined_all,
        on=(
            (predictors_relative_2.user_id2 == predictors_combined_all.user_id)
            & (predictors_relative_2.film_id2 == predictors_combined_all.film_id)
        ),
        how='right',
    )
    .select(
        "film_id",
        'film_predictor',
        'user_id',
        'user_predictor',
        'total_predictor',
        predictors_relative_2.relative_predictor2.alias('relative_predictor'),
    )
    .na.fill(0)
)

In [15]:
# "2"-suffixed copy of the zero-filled residual grid, used as the right-hand
# side of the film-to-film self-join.
predictors_relative_all_2 = predictors_relative_all.select(
    *[
        predictors_relative_all[column_name].alias(column_name + "2")
        for column_name in ("film_id", "user_id", "relative_predictor")
    ]
)

In [16]:
# Pair up, per user, the residuals of every two distinct films.
films_joined = predictors_relative_all.join(
    predictors_relative_all_2,
    on=(
        (predictors_relative_all.user_id == predictors_relative_all_2.user_id2)
        & (predictors_relative_all.film_id != predictors_relative_all_2.film_id2)
    ),
    how='inner',
)

In [17]:
# Cosine-style similarity between each ordered film pair over the per-user residuals:
#   sum(a * b) / (sum(a^2) * sum(b^2)) ** 0.5
# Cached because later cells reuse this table.
films_distance = (
    films_joined
    .groupBy('film_id', 'film_id2')
    .agg(
        (
            f.sum(f.col('relative_predictor') * f.col('relative_predictor2'))
            / f.pow(
                f.sum(f.pow(f.col('relative_predictor'), 2))
                * f.sum(f.pow(f.col('relative_predictor2'), 2)),
                0.5,
            )
        ).alias("distance")
    )
    .cache()
)

In [18]:
# Id of the user for whom the top-10 lists below are computed.
user_id: int = 804

In [19]:
# Film ids the target user has already rated. Collected straight from Spark
# rather than via pandas, so the list holds native Python ints on any pandas
# version (py4j cannot convert numpy integers when they are passed to `isin`).
scored_films = [
    row.film_id
    for row in films_data.filter(films_data.user_id == user_id).select(films_data.film_id).collect()
]

In [20]:
# Keep only similarity rows whose left-hand film the user has not rated yet.
films_distance_not_scored = films_distance.filter(
    ~films_distance.film_id.isin(scored_films)
)

In [21]:
# Per-film window: neighbours ordered by similarity (highest first), with
# film_id2 as the tie-breaker.
film_window = (
    Window
    .partitionBy(films_distance_not_scored['film_id'])
    .orderBy(
        films_distance_not_scored['distance'].desc(),
        films_distance_not_scored['film_id2'],
    )
)

In [22]:
# Number each film's neighbours in window order and keep the first 30.
neareast_films = (
    films_distance_not_scored
    .withColumn('rank', f.row_number().over(film_window))
    .filter('rank < 31')
)

In [23]:
# Baseline predictions for the target user only, one row per film.
predictors_combined_all_cur_user = predictors_combined_all.where(
    predictors_combined_all.user_id == user_id
)

In [24]:
# Attach the target user's own residuals to the neighbour films; the inner join
# drops neighbours for which this user has no residual row.
neareast_films_with_predictors = neareast_films.join(
    predictors_relative_2.where(predictors_relative_2.user_id2 == user_id),
    on=['film_id2'],
    how='inner',
)

In [25]:
# Similarity-weighted average of the user's residuals over each film's
# neighbours: sum(distance * residual) / sum(|distance|), then joined with the
# user's baseline prediction for that film.
almost_final = (
    neareast_films_with_predictors
    .groupBy('film_id')
    .agg(
        (
            f.sum(f.col('distance') * f.col('relative_predictor2'))
            / f.sum(f.abs(f.col('distance')))
        ).alias("score")
    )
    .join(predictors_combined_all_cur_user, on='film_id', how='inner')
)

In [26]:
# Final prediction = neighbourhood correction + baseline; keep the 10 highest.
# film_id is a secondary sort key so that the cut is deterministic when
# several films share the same final_score (the neighbour window uses the
# same kind of id tie-break).
final_1 = (
    almost_final
    .select(
        almost_final.film_id,
        (almost_final.score + almost_final.total_predictor).alias("final_score"),
    )
    .orderBy(['final_score', 'film_id'], ascending=[False, True])
    .limit(10)
)

In [27]:
# Materialise the top-10 film ids on the driver in ranked order. Rows are
# collected directly (no pandas round-trip), so the ids are native Python
# ints and stay JSON-serialisable regardless of the installed pandas version.
predicators_top10 = [row.film_id for row in final_1.collect()]

In [28]:
# Same weighted average as `almost_final`, restricted to neighbours whose
# similarity is non-negative.
almost_final2 = (
    neareast_films_with_predictors
    .filter(f.col('distance') >= 0)
    .groupBy('film_id')
    .agg(
        (
            f.sum(f.col('distance') * f.col('relative_predictor2'))
            / f.sum(f.abs(f.col('distance')))
        ).alias("score")
    )
    .join(predictors_combined_all_cur_user, on='film_id', how='inner')
)

In [29]:
# Final prediction from the non-negative-similarity neighbours; keep the 10
# highest. film_id is a secondary sort key so that the cut is deterministic
# when several films share the same final_score.
final_2 = (
    almost_final2
    .select(
        almost_final2.film_id,
        (almost_final2.score + almost_final2.total_predictor).alias("final_score"),
    )
    .orderBy(['final_score', 'film_id'], ascending=[False, True])
    .limit(10)
)

In [30]:
# Materialise the second top-10 list on the driver in ranked order. Rows are
# collected directly (no pandas round-trip), so the ids are native Python
# ints and stay JSON-serialisable regardless of the installed pandas version.
predicators_positive_top10 = [row.film_id for row in final_2.collect()]

In [31]:
# Container for the values written to the output JSON file.
result = dict()

In [32]:
# Fill in the three output fields in one step (same keys, same insertion order).
result.update({
    'average_rating': average_rating,
    'predicators_top10': predicators_top10,
    'predicators_positive_top10': predicators_positive_top10,
})

In [33]:
# Serialise the result dict and write it to lab08s.json in the working directory.
with open("lab08s.json", 'w') as out_file:
    out_file.write(json.dumps(result))

In [34]:
# Shut Spark down once the results are written.
# NOTE(review): `sc` is not defined in this notebook; presumably it is the
# SparkContext created by the pyspark shell.py bootstrap — confirm.
sc.stop()