In [1]:
import os
import sys

In [2]:
# Configure the environment PySpark reads at start-up: submit arguments,
# the Python interpreter for workers, and the Spark installation to use.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute the PySpark shell bootstrap in the notebook namespace; its banner
# reports that the session is then available as `spark`.
# The file is opened in a `with` block so the handle is closed deterministically,
# and the source is compiled with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
import json

In [4]:
# Explicit schema for the ratings file: three integer columns followed by
# a `time` column that is kept as a plain string.
films_schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("user_id", IntegerType),
        ("film_id", IntegerType),
        ("score", IntegerType),
        ("time", StringType),
    )
])

In [5]:
# Read the tab-separated ratings file using the explicit schema above.
films_data = (
    spark.read
    .format('csv')
    .schema(films_schema)
    .option("sep", "\t")
    .load("/labs/lab06data/ml-100k/u.data")
)

In [6]:
# Number of rating rows per user, in a column named "count".
# NOTE(review): no later cell in this notebook reads `users_amount`.
users_amount = (
    films_data
    .groupBy("user_id")
    .agg(f.count(f.lit(1)).alias("count"))
)

In [7]:
# Ratings per user: number of non-null scores divided by the number of
# distinct users, pulled back to the driver as a plain Python value.
average_user_ratings = (
    films_data
    .agg((f.count("score") / f.countDistinct("user_id")).alias("average_user_ratings"))
    .first()["average_user_ratings"]
)

In [8]:
# Ratings per film: number of non-null scores divided by the number of
# distinct films, pulled back to the driver as a plain Python value.
average_film_ratings = (
    films_data
    .agg((f.count("score") / f.countDistinct("film_id")).alias("average_film_ratings"))
    .first()["average_film_ratings"]
)

In [9]:
# Share of the user x film grid that actually holds a score:
# count(score) / (distinct films * distinct users).
completeness = (
    films_data
    .agg(
        (
            f.count("score")
            / (f.countDistinct("film_id") * f.countDistinct("user_id"))
        ).alias("completeness")
    )
    .first()["completeness"]
)

In [10]:
# Target user: later cells compare every other user against this one and
# build the film recommendations for it.
user_id = 804

In [11]:
# Mean score given by each user.
# f.avg divides by the number of non-null scores, whereas the previous
# sum(score) / count(*) also counted rows whose score failed to parse (null)
# in the denominator and so pulled the mean down.
user_avg_score = films_data.groupBy("user_id").agg(f.avg("score").alias("avg_score"))

In [12]:
# Attach each user's mean score to every one of that user's rating rows
# (inner join on user_id, which is the default join type).
films_data_avg_score = films_data.join(user_avg_score, "user_id")

In [13]:
# Centre every score on the rating user's own mean: rel_score = score - avg_score.
films_rel_score = films_data_avg_score.select(
    "user_id",
    "film_id",
    (f.col("score") - f.col("avg_score")).alias("rel_score"),
)

In [14]:
# Centred ratings that belong to the target user only.
my_user_data = films_rel_score.where(f.col("user_id") == user_id)

In [15]:
# Centred ratings of everybody except the target user. The user and score
# columns get a "2" suffix so they stay distinct after joining with
# `my_user_data` on film_id.
another_users_data = (
    films_rel_score
    .where(f.col("user_id") != user_id)
    .select(
        f.col("user_id").alias("user_id2"),
        "film_id",
        f.col("rel_score").alias("rel_score2"),
    )
)

In [16]:
# One row per (other user, film) where the film was rated by both the target
# user and that other user (inner join on film_id, the default join type).
films_joined = my_user_data.join(another_users_data, "film_id")

In [17]:
@f.pandas_udf(DoubleType())
def get_score(amounts):
    """Scale a count into a correction factor capped at 1.

    Args:
        amounts: pandas Series holding one numeric value per row of the batch
            (the caller passes the per-user `film_amount` count).

    Returns:
        pandas Series of floats equal to ``amounts / 50``, limited to at most 1.
    """
    # Vectorised Series arithmetic instead of a per-element Python lambda:
    # a pandas UDF receives a whole batch precisely so that it can avoid
    # row-by-row work.
    return (amounts / 50).clip(upper=1.0)

In [18]:
# Per other user, over the films shared with the target user:
#   dist        = sum(rel_score * rel_score2)
#                 / (sum(rel_score^2) * sum(rel_score2^2)) ^ 0.5
#   film_amount = number of shared films
#   correction  = get_score(film_amount)
user_scores = (
    films_joined
    .groupBy("user_id2")
    .agg(
        (
            f.sum(f.col("rel_score") * f.col("rel_score2"))
            / f.pow(
                f.sum(f.pow(f.col("rel_score"), 2)) * f.sum(f.pow(f.col("rel_score2"), 2)),
                0.5,
            )
        ).alias("dist"),
        f.count("user_id2").alias("film_amount"),
    )
    .withColumn("correction", get_score("film_amount"))
)

In [19]:
# Final similarity = dist * correction; keep the 30 highest-scoring users.
# The result is cached because it is consumed twice further down (converted
# to pandas for the neighbour list, and joined back for film scoring).
# Without the cache the whole pipeline is recomputed for the second use, and
# with tied scores at the cut-off the two evaluations may not pick the same
# 30 users.
user_final_scores = (
    user_scores
    .select(user_scores.user_id2, (user_scores.dist * user_scores.correction).alias("final_score"))
    .orderBy('final_score', ascending=False)
    .limit(30)
    .cache()
)

In [20]:
# Bring the (at most 30) top-scoring users to the driver as a pandas DataFrame.
top30_users = user_final_scores.toPandas()

In [21]:
# Ids of the selected neighbours, in the order produced by the ranking above.
pearson_neighbours = top30_users["user_id2"].tolist()

In [22]:
# Every centred rating made by one of the selected neighbours, together with
# that neighbour's final similarity score (inner join on user_id2).
final_score_with_rel_score = user_final_scores.join(another_users_data, "user_id2")

In [23]:
# Per film: similarity-weighted mean of the neighbours' centred scores,
#   film_score = sum(final_score * rel_score2) / sum(|final_score|),
# sorted by film_score and then film_id, both descending.
films_scores = (
    final_score_with_rel_score
    .groupBy("film_id")
    .agg(
        (
            f.sum(f.col("final_score") * f.col("rel_score2"))
            / f.sum(f.abs(f.col("final_score")))
        ).alias("film_score")
    )
    .orderBy(f.col("film_score").desc(), f.col("film_id").desc())
)

In [24]:
# Film ids the target user has already rated, collected on the driver.
cur_user_films = my_user_data.toPandas()["film_id"].tolist()

In [25]:
# First ten films in the ranking that the target user has not rated yet.
pearson_top10 = (
    films_scores
    .where(~f.col("film_id").isin(cur_user_films))
    .limit(10)
    .toPandas()["film_id"]
    .tolist()
)

In [26]:
# Everything that goes into the output file, keyed by metric name.
result = dict(
    average_film_ratings=average_film_ratings,
    average_user_ratings=average_user_ratings,
    completeness=completeness,
    pearson_neighbours=pearson_neighbours,
    pearson_top10=pearson_top10,
)

In [27]:
# Serialise the collected metrics to lab08.json in the working directory.
serialized_result = json.dumps(result)
with open("lab08.json", 'w') as output_file:
    output_file.write(serialized_result)

In [28]:
# Shut down the Spark context now that the results have been written.
# NOTE(review): `sc` is not defined in any cell here — presumably created by the
# shell.py bootstrap executed at the top of the notebook; confirm.
sc.stop()