In [2]:
import os
os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 2.7.6 (default, Mar 22 2014 22:59:56)
SparkSession available as 'spark'.


In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import seaborn as sns



In [5]:
import numpy as np
import pandas as pd
import time
import json

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, MapType
import pyspark.sql.functions as F

In [6]:
from operator import add
from pyspark.sql.types import *

# Load data

In [7]:
train = spark.read.load("/recsys/lab12data/train.csv", 
                        format="com.databricks.spark.csv", 
                        delimiter=",", 
                        header=True,
                        schema=StructType([StructField("userId",  IntegerType()),
                                           StructField("movieId", IntegerType()),
                                           StructField("rating",  FloatType())]))

In [8]:
train.count()

10531564

In [9]:
# rerun eval notebook with full data set

In [10]:
strain = train

# Baseline with Basic predictiors approach

In [11]:
smooth_buser = 5
smooth_bitem = 10

In [12]:
avg_rating = strain.agg({"rating":"mean"}).collect()[0]

In [13]:
avg_rating = avg_rating[0]
print 'avg rating', avg_rating

avg rating 3.52186384662


In [14]:
buser = strain.groupBy("userId").agg(F.sum(F.col("rating")).alias("sum_ratings"), 
                                     F.count(F.col("rating")).alias("n_user_ratings"))

In [15]:
buser.show(2)

+------+-----------+--------------+
|userId|sum_ratings|n_user_ratings|
+------+-----------+--------------+
|   148|      408.0|           109|
|   463|       20.0|             4|
+------+-----------+--------------+
only showing top 2 rows



In [16]:
x = buser.select("userId", "n_user_ratings", 
                 ((F.col("sum_ratings") - F.lit(avg_rating)*F.col("n_user_ratings"))  / (F.col("n_user_ratings") + F.lit(smooth_buser))).alias("bpu"))

In [17]:
x.show(2)

+------+--------------+-------------------+
|userId|n_user_ratings|                bpu|
+------+--------------+-------------------+
|155959|            60|0.17212568004307688|
|156017|             8| 0.2557760943876923|
+------+--------------+-------------------+
only showing top 2 rows



In [18]:
buser = x

In [19]:
strain.registerTempTable("strain")

In [20]:
buser.registerTempTable("buser")

In [21]:
bitem = spark.sql("""
            select
                t.movieId, sum(t.rating - %g - bu.bpu) as sum_r_normalised, count(*) n_item_ratings
            from strain t
            join buser bu on t.userId == bu.userId 
            group by t.movieId
""" % (avg_rating,))

In [22]:
bitem.show(2)

+-------+------------------+--------------+
|movieId|  sum_r_normalised|n_item_ratings|
+-------+------------------+--------------+
|  11141|4175.3882885569665|          7943|
|  14450| 6078.451993887126|         16762|
+-------+------------------+--------------+
only showing top 2 rows



In [23]:
x = bitem.select("movieId", "n_item_ratings", 
                 ( F.col("sum_r_normalised")  / (F.col("n_item_ratings") + F.lit(smooth_bitem))).alias("bpi"))

In [24]:
x.show(2)

+-------+--------------+------------------+
|movieId|n_item_ratings|               bpi|
+-------+--------------+------------------+
|  14450|         16762|0.3624166464278035|
|   7880|          4105|0.0889908644020725|
+-------+--------------+------------------+
only showing top 2 rows



In [25]:
bitem = x

In [26]:
bitem.registerTempTable("bitem")

## Build prediction for strain set

In [27]:
q = spark.sql("""
        select 
            t.*,
            bu.n_user_ratings,
            bi.n_item_ratings,
            (%g + bu.bpu + bi.bpi) as predicted_rating
        from strain t
        join buser bu on t.userId = bu.userId 
        join bitem bi on t.movieId = bi.movieId
""" % (avg_rating, ))

In [28]:
strain_predictions = q

In [29]:
def calc_rmse(df):
    mse = df.select(F.pow(F.col("predicted_rating") - F.col("rating"), F.lit(2)).alias("err"))\
            .agg({"err":"mean"}).collect()[0][0]
    return np.sqrt(mse)    

In [30]:
rmse = calc_rmse(strain_predictions)

In [31]:
print 'strain RMSE', rmse

strain RMSE 0.867715165859


In [48]:
def cut_1to5(x):
    if x < 1.:
        return 1.0
    if x > 5:
        return 5.0
    return x

udf_cut_1to5 = F.udf(cut_1to5, FloatType())


In [50]:
q = strain_predictions.withColumn("predicted_rating", udf_cut_1to5(F.col("predicted_rating")))

In [51]:
q.show(2)

+------+-------+------+--------------+--------------+----------------+
|userId|movieId|rating|n_user_ratings|n_item_ratings|predicted_rating|
+------+-------+------+--------------+--------------+----------------+
|109268|    148|   3.0|          1454|             2|          2.7021|
| 75052|    148|   3.0|           359|             2|       3.4331956|
+------+-------+------+--------------+--------------+----------------+
only showing top 2 rows



In [52]:
print 'strain RMSE with cut1t5', calc_rmse(q)

strain RMSE with cut1t5 0.867640756856


## Build prediction for test set

In [33]:
test = spark.read.load("/recsys/lab12data/test.csv", 
                        format="com.databricks.spark.csv", 
                        delimiter=",", 
                        header=True,
                        schema=StructType([StructField("userId",  IntegerType()),
                                           StructField("movieId", IntegerType())]))

In [34]:
test.count()

10531564

In [35]:
stest = test

In [36]:
stest.registerTempTable("stest")

In [53]:
q = spark.sql("""
        select 
            t.*,
            bu.n_user_ratings,
            bi.n_item_ratings,
            (%g + coalesce(bu.bpu,0.0) + coalesce(bi.bpi,0.0)) as predicted_rating
        from stest t
        left join buser bu on t.userId = bu.userId 
        left join bitem bi on t.movieId = bi.movieId
""" % (avg_rating, ))

In [54]:
q.show(2)

+------+-------+--------------+--------------+-----------------+
|userId|movieId|n_user_ratings|n_item_ratings| predicted_rating|
+------+-------+--------------+--------------+-----------------+
|178586|    148|          2665|             2|3.227893303449813|
|155572|    148|          1238|             2| 3.61245960840419|
+------+-------+--------------+--------------+-----------------+
only showing top 2 rows



In [55]:
q.filter("predicted_rating is null").count()

0

In [56]:
qs = q.select("userId", "movieId", udf_cut_1to5(F.col("predicted_rating")).alias("rating")).orderBy("userId", "movieId")

In [57]:
test.show(2)

+------+-------+
|userId|movieId|
+------+-------+
|     1|   1414|
|     1|   2346|
+------+-------+
only showing top 2 rows



In [61]:
qs.show(2)

+------+-------+--------+
|userId|movieId|  rating|
+------+-------+--------+
|     1|   1414|4.116101|
|     1|   2346|4.011121|
+------+-------+--------+
only showing top 2 rows



In [62]:
#qs.write.save("lab_12", format="com.databricks.spark.csv")

In [63]:
res = qs.toPandas()

In [64]:
res.head()

Unnamed: 0,userId,movieId,rating
0,1,1414,4.116101
1,1,2346,4.011121
2,1,5278,3.153542
3,1,9303,3.851648
4,1,11817,4.501127


In [65]:
res.to_csv("lab_12.csv", index=False)