In [1]:
if True:
    import os
    os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'
    execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))
    import os
    import sys
else:
    os.environ['SPARK_HOME'] = '/usr/lib/spark'
    sys.path.insert(0, '/usr/lib/spark/python/lib/py4j-0.9-src.zip')
    sys.path.insert(0, '/usr/lib/spark/python/')
    sys.path.insert(0, '/usr/local/lib64/python2.7/site-packages')
    sys.path.insert(0,'/usr/local/lib/python2.7/site-packages')
    
    from pyspark import SparkContext
    from pyspark.sql import SQLContext, HiveContext
    
    try: sc = SparkContext()
    except: None    
    sqlc = SQLContext(sc)
    spark = sqlc

In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import seaborn as sns

In [5]:
import numpy as np
import pandas as pd
import time
import json

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, MapType
import pyspark.sql.functions as F

In [6]:
from operator import add
from pyspark.sql.types import *

In [7]:
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel
from datetime import datetime

# Load data

In [9]:
strain_predictions = spark.read.parquet("lab_12/cache/strain_bp")

In [17]:
strain_predictions = strain_predictions.select("userId", "movieId", "rating")
strain_predictions.show(2)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|170647|     31|   4.5|
|198447|     31|   2.0|
+------+-------+------+
only showing top 2 rows



In [11]:
stest_predictions = spark.read.parquet("lab_12/cache/stest_bp")

In [12]:
stest_predictions.show(2)

+------+-------+------+--------------+--------------+-----------+-----------+-----------------+-------------------+
|userId|movieId|rating|n_user_ratings|n_item_ratings|nr_user_cat|nr_item_cat|        bp_rating|      bp_rating_err|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+-------------------+
|227044|     31|   3.5|          20.0|         109.0|  unr_06_50|    inr_101|4.036272701087659|-0.5362727010876593|
| 38648|     31|   4.0|         225.0|         109.0|     unr_51|    inr_101|4.524626216198095|-0.5246262161980946|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+-------------------+
only showing top 2 rows



In [19]:
strain_predictions.registerTempTable("strain")

In [20]:
stest_predictions.registerTempTable("stest")

# Load similarity matrix

In [14]:
def convert_i2i_tupple(line):
    r = line.split(',')
    return (int(r[0]), int(r[1]), float(r[2]), int(r[3]))

def load_i2i_sim():
    raw = sc.textFile("lab_12/i2itop100.csv")\
            .filter(lambda x: not x.startswith("item"))\
            .map(convert_i2i_tupple)
    return raw.toDF(schema=StructType([StructField("item1",  IntegerType()),
                                       StructField("item2", IntegerType()),
                                       StructField("sim",  FloatType()),
                                       StructField("sim_rank",  IntegerType())])) 
    return raw

In [15]:
i2i = load_i2i_sim()

In [16]:
i2i.show(2)

+-----+-----+----------+--------+
|item1|item2|       sim|sim_rank|
+-----+-----+----------+--------+
|    4| 7815| 0.8488138|       0|
|    4|24898|0.83807796|       1|
+-----+-----+----------+--------+
only showing top 2 rows



In [18]:
i2i.registerTempTable("i2i_sim")

# Make predictions on test

In [82]:
q = spark.sql("""
        select
          pr.userId,
          pr.movieId,
          sum(kr.rating * s.sim) /  sum(s.sim) as predicted_rating,
          count(*) as n_used_ratings
        from stest pr
        join i2i_sim s on pr.movieId = s.item1 and s.sim_rank <= 50
        join strain kr on pr.userId = kr.userId and kr.movieId = s.item2
        group by pr.userId, pr.movieId
""")

In [83]:
q.show(2)

+------+-------+-----------------+--------------+
|userId|movieId| predicted_rating|n_used_ratings|
+------+-------+-----------------+--------------+
|  2407|   1288|3.555305428659686|             9|
|  5984|  11339|4.333561980799907|             3|
+------+-------+-----------------+--------------+
only showing top 2 rows



In [84]:
predicted_on_test = q

In [85]:
predicted_on_test.registerTempTable("predicted_test")

In [86]:
eval_data = spark.sql("""
        select
          tbase.*,
          coalesce(pt.n_used_ratings, 0) as n_used_item_ratings,
          coalesce(predicted_rating, bp_rating) as predicted_rating,
          if(pt.n_used_ratings is not null, 1, 0) as has_i2i_rating
        from stest tbase
        left join predicted_test  pt on tbase.userId = pt.userId and tbase.movieId = pt.movieId
""")

In [87]:
eval_data.cache()

DataFrame[userId: int, movieId: int, rating: float, n_user_ratings: decimal(21,1), n_item_ratings: decimal(21,1), nr_user_cat: string, nr_item_cat: string, bp_rating: double, bp_rating_err: double, n_used_item_ratings: bigint, predicted_rating: double, has_i2i_rating: int]

In [88]:
eval_data.show(2)

+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--------------+
|userId|movieId|rating|n_user_ratings|n_item_ratings|nr_user_cat|nr_item_cat|        bp_rating|     bp_rating_err|n_used_item_ratings|  predicted_rating|has_i2i_rating|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--------------+
|    28|   4711|   3.0|         244.0|        7697.0|     unr_51|    inr_101|3.564750454655524|-0.564750454655524|                  4| 3.372809845588842|             1|
|    40|  13667|   3.0|         129.0|        2423.0|     unr_51|    inr_101|3.778930846940237|-0.778930846940237|                  2|2.4987220990293846|             1|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--

In [89]:
msedf=eval_data.select(F.pow(F.col("rating")-F.col("predicted_rating"), 2).alias("e2"))\
               .agg(F.avg(F.col("e2")).alias("mse"))

In [90]:
print 'rmse on all test set:', np.sqrt(msedf.collect()[0][0])

rmse on all test set: 0.92366072949


In [91]:
msedf_i2ionly=eval_data.filter(F.col("has_i2i_rating")==1) \
                       .select(F.pow(F.col("rating")-F.col("predicted_rating"), 2).alias("e2"))\
                       .agg(F.avg(F.col("e2")).alias("mse"))

In [92]:
print 'rmse on records with i2i predictions:', np.sqrt(msedf_i2ionly.collect()[0][0])

rmse on records with i2i predictions: 0.903226886361


In [93]:
msedf_i2ionly_t3=eval_data.filter(F.col("n_used_item_ratings")>=3) \
                       .select(F.pow(F.col("rating")-F.col("predicted_rating"), 2).alias("e2"))\
                       .agg(F.avg(F.col("e2")).alias("mse"))

In [94]:
print 'rmse on records with at least 3 similar i2i predictions:', np.sqrt(msedf_i2ionly_t3.collect()[0][0])

rmse on records with at least 3 similar i2i predictions: 0.772533851324


In [95]:
qg = eval_data.withColumn("e2", F.pow(F.col("rating")-F.col("predicted_rating"), 2))\
             .groupBy(["nr_user_cat", "nr_item_cat"])\
             .agg(F.avg(F.col("e2")).alias("mse"), 
                  F.count(F.col("e2")).alias("n_examples"),
                  F.sum(F.col("has_i2i_rating")).alias("has_i2i_rating"),
                  F.avg(F.col("n_used_item_ratings")).alias("avg_used_item_ratings"))

In [96]:
epg = qg.toPandas()

In [97]:
epg['r_examples'] = np.round(epg['n_examples'] * 100. /  epg['n_examples'].sum(), 2)
epg['rmse'] = np.sqrt(epg['mse'])
epg['r_has_i2i_rating'] = np.round(epg['has_i2i_rating'] * 100. /  epg['n_examples'], 2)

In [98]:
epg.sort_values(by='r_examples', ascending=False)

Unnamed: 0,nr_user_cat,nr_item_cat,mse,n_examples,has_i2i_rating,avg_used_item_ratings,r_examples,rmse,r_has_i2i_rating
0,unr_51,inr_101,0.805691,2016442,1480306,2.180289,63.81,0.897603,73.41
8,unr_06_50,inr_101,0.926577,900019,317092,0.572015,28.48,0.962589,35.23
3,unr_01_05,inr_101,1.105219,122519,7303,0.063762,3.88,1.051294,5.96
4,unr_51,inr_11_100,0.785619,88496,58958,2.315268,2.8,0.886352,66.62
11,unr_51,inr_01_10,0.683332,12069,7988,3.352639,0.38,0.826639,66.19
9,unr_06_50,inr_11_100,1.104292,11725,2342,0.299531,0.37,1.050853,19.97
13,unr_0,inr_101,1.312901,3962,0,0.0,0.13,1.145819,0.0
1,unr_51,inr_0,1.00206,1610,1406,15.06087,0.05,1.001029,87.33
6,unr_01_05,inr_11_100,1.405741,1572,33,0.021628,0.05,1.185639,2.1
5,unr_06_50,inr_01_10,1.255837,1130,213,0.302655,0.04,1.120642,18.85


In [99]:
print eval_data.filter(F.col("n_used_item_ratings")>=3).count()
print eval_data.count()

759878
3159915


In [72]:
eval_data.show(2)

+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--------------+
|userId|movieId|rating|n_user_ratings|n_item_ratings|nr_user_cat|nr_item_cat|        bp_rating|     bp_rating_err|n_used_item_ratings|  predicted_rating|has_i2i_rating|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--------------+
|    28|   4711|   3.0|         244.0|        7697.0|     unr_51|    inr_101|3.564750454655524|-0.564750454655524|                  4| 3.372809845588842|             1|
|    40|  13667|   3.0|         129.0|        2423.0|     unr_51|    inr_101|3.778930846940237|-0.778930846940237|                  2|2.4987220990293846|             1|
+------+-------+------+--------------+--------------+-----------+-----------+-----------------+------------------+-------------------+------------------+--

# Save for ensemble cheks

In [100]:
eval_data = eval_data.withColumnRenamed("predicted_rating", 'predicted_i2i_rating')

In [101]:
eval_data = eval_data.coalesce(8)

In [102]:
eval_data.write.parquet("lab_12/cache/stest_bp_i2i_top50", mode='overwrite')