In [1]:
# Spark bootstrap for the notebook. Written for a Python 2 kernel (uses `execfile`).
# Imports live above the branch so that BOTH branches can use `os` / `sys`.
import os
import sys

# True:  run PySpark's bundled shell.py, which creates `sc` and `spark`.
# False: build the contexts by hand against the install under /usr/lib/spark.
USE_PYSPARK_SHELL = True

if USE_PYSPARK_SHELL:
    # Set before shell.py runs so the Spark it launches is asked for the spark-csv package.
    os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'
    execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))
else:
    os.environ['SPARK_HOME'] = '/usr/lib/spark'
    # Each insert(0, ...) puts its path in front, so the last one listed is searched first.
    sys.path.insert(0, '/usr/lib/spark/python/lib/py4j-0.9-src.zip')
    sys.path.insert(0, '/usr/lib/spark/python/')
    sys.path.insert(0, '/usr/local/lib64/python2.7/site-packages')
    sys.path.insert(0, '/usr/local/lib/python2.7/site-packages')

    # Imported only after sys.path has been extended so the pyspark package resolves.
    from pyspark import SparkContext
    from pyspark.sql import SQLContext, HiveContext

    # Reuse an already-running context when the cell is re-executed. Unlike the
    # previous bare `except`, a genuine start-up failure is now raised here
    # instead of surfacing later as a NameError on `sc`.
    sc = SparkContext.getOrCreate()
    sqlc = SQLContext(sc)
    # Later cells address the SQL entry point as `spark` in both modes.
    spark = sqlc

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 2.7.6 (default, Mar 22 2014 22:59:56)
SparkSession available as 'spark'.


In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import seaborn as sns



In [4]:
import numpy as np
import pandas as pd
import time
import json

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, MapType
import pyspark.sql.functions as F

In [5]:
from operator import add
from pyspark.sql.types import *

In [6]:
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel
from datetime import datetime

# Load data

In [7]:
# Read the cached training set (parquet) into a Spark DataFrame.
strain_predictions = spark.read.parquet("lab_12/cache/full_train_bp")

In [8]:
# Keep only the (userId, movieId, rating) columns; the name is rebound to the narrowed frame.
strain_predictions = strain_predictions.select("userId", "movieId", "rating")
# Preview two rows to confirm the columns.
strain_predictions.show(2)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
| 20833|     31|   4.5|
|201833|     31|   3.5|
+------+-------+------+
only showing top 2 rows



In [9]:
# Read the cached test set (parquet) into a Spark DataFrame.
stest_predictions = spark.read.parquet("lab_12/cache/full_test_bp")

In [10]:
# Preview two rows of the test set to see which columns it carries.
stest_predictions.show(2)

+------+-------+--------------+--------------+-------------------+
|userId|movieId|n_user_ratings|n_item_ratings|bp_predicted_rating|
+------+-------+--------------+--------------+-------------------+
|223036|     31|          1376|           161|  3.548151868862599|
|  1051|     31|          1108|           161| 2.1218463332398656|
+------+-------+--------------+--------------+-------------------+
only showing top 2 rows



In [11]:
# Expose the training ratings to Spark SQL under the table name `strain`.
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView — consider switching if Spark 1.x support is not needed.
strain_predictions.registerTempTable("strain")

In [12]:
# Expose the test pairs to Spark SQL under the table name `stest`.
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView — consider switching if Spark 1.x support is not needed.
stest_predictions.registerTempTable("stest")

# Load similarity matrix

In [13]:
def convert_i2i_tupple(line):
    """Parse one comma-separated similarity record into a typed 4-tuple.

    The first four fields of ``line`` are cast, in order, to
    ``(int, int, float, int)``. Any fields beyond the fourth are ignored.

    Raises:
        IndexError: if ``line`` has fewer than four comma-separated fields.
        ValueError: if a field cannot be converted to its numeric type.
    """
    fields = line.split(',')
    first_item = int(fields[0])
    second_item = int(fields[1])
    similarity = float(fields[2])
    rank = int(fields[3])
    return (first_item, second_item, similarity, rank)

def load_i2i_sim(path="lab_12/i2itop100.csv"):
    """Load the item-to-item similarity CSV into a Spark DataFrame.

    Args:
        path: Location of the CSV text file. Defaults to the file this
            notebook has always read, so existing no-argument calls are
            unaffected.

    Returns:
        A DataFrame with columns ``item1`` (int), ``item2`` (int),
        ``sim`` (float) and ``sim_rank`` (int).

    Relies on the notebook-level ``sc`` SparkContext and on
    ``convert_i2i_tupple`` for per-line parsing.
    """
    schema = StructType([StructField("item1", IntegerType()),
                         StructField("item2", IntegerType()),
                         StructField("sim", FloatType()),
                         StructField("sim_rank", IntegerType())])
    rows = (sc.textFile(path)
              # Skip the header: any line that begins with "item" is dropped.
              .filter(lambda x: not x.startswith("item"))
              .map(convert_i2i_tupple))
    # The former second `return raw` was unreachable and has been removed.
    return rows.toDF(schema=schema)

In [14]:
# Build the item-to-item similarity DataFrame (item1, item2, sim, sim_rank).
i2i = load_i2i_sim()

In [16]:
# Preview two similarity rows to check parsing and column types.
i2i.show(2)

+-----+-----+----------+--------+
|item1|item2|       sim|sim_rank|
+-----+-----+----------+--------+
|    4| 7815| 0.8488138|       0|
|    4|24898|0.83807796|       1|
+-----+-----+----------+--------+
only showing top 2 rows



In [20]:
# Expose the similarity rows to Spark SQL under the table name `i2i_sim`.
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView — consider switching if Spark 1.x support is not needed.
i2i.registerTempTable("i2i_sim")

# Make predictions on test

In [21]:
# Item-to-item prediction for each (userId, movieId) pair in `stest`:
#   1. take the target movie's neighbours from `i2i_sim` (sim_rank <= 100),
#   2. keep the neighbours that the same user rated in `strain`,
#   3. predict the similarity-weighted mean of those ratings and count how
#      many ratings contributed (n_used_ratings).
# Both joins are inner joins, so a test pair with no rated neighbour yields no row.
# NOTE(review): the i2i preview shows sim_rank starting at 0, so `<= 100` admits
# ranks 0..100 (101 values) — confirm the intended cutoff.
# NOTE(review): there is no guard for sum(s.sim) being zero or for negative
# similarities — confirm the similarity file only holds positive values.
q = spark.sql("""
        select
          pr.userId,
          pr.movieId,
          sum(kr.rating * s.sim) /  sum(s.sim) as predicted_rating,
          count(*) as n_used_ratings
        from stest pr
        join i2i_sim s on pr.movieId = s.item1 and s.sim_rank <= 100
        join strain kr on pr.userId = kr.userId and kr.movieId = s.item2
        group by pr.userId, pr.movieId
""")

In [22]:
# Preview two rows of the item-to-item predictions.
q.show(2)

+------+-------+-----------------+--------------+
|userId|movieId| predicted_rating|n_used_ratings|
+------+-------+-----------------+--------------+
|   586|  10665|2.748026606639374|             8|
|  1065|  21942|4.353618564178993|            10|
+------+-------+-----------------+--------------+
only showing top 2 rows



In [23]:
# Give the query result a descriptive name; this is the same DataFrame object as `q`.
predicted_on_test = q

In [24]:
# Expose the item-to-item predictions to Spark SQL under the table name `predicted_test`.
# NOTE(review): registerTempTable is deprecated as of Spark 2.0 in favour of
# createOrReplaceTempView — consider switching if Spark 1.x support is not needed.
predicted_on_test.registerTempTable("predicted_test")

In [26]:
# Build the evaluation frame: every row of `stest` plus the item-to-item
# prediction where one exists. The left join keeps test pairs that have no row
# in `predicted_test`; for those pairs:
#   n_used_item_ratings  = 0
#   predicted_i2i_rating = bp_predicted_rating   (fallback value)
#   has_i2i_rating       = 0
# NOTE(review): has_i2i_rating is derived from n_used_ratings, not from
# predicted_rating, so a matched row whose predicted_rating is NULL would be
# flagged 1 while still using the fallback rating — confirm whether that can occur.
eval_data = spark.sql("""
        select
          tbase.*,
          coalesce(pt.n_used_ratings, 0) as n_used_item_ratings,
          coalesce(predicted_rating, bp_predicted_rating) as predicted_i2i_rating,
          if(pt.n_used_ratings is not null, 1, 0) as has_i2i_rating
        from stest tbase
        left join predicted_test  pt on tbase.userId = pt.userId and tbase.movieId = pt.movieId
""")

In [27]:
# Mark eval_data for caching; it is used again by several later cells.
eval_data.cache()

DataFrame[userId: int, movieId: int, n_user_ratings: bigint, n_item_ratings: bigint, bp_predicted_rating: double, n_used_item_ratings: bigint, predicted_i2i_rating: double, has_i2i_rating: int]

In [28]:
# Preview two rows of the combined evaluation frame.
eval_data.show(2)

+------+-------+--------------+--------------+-------------------+-------------------+--------------------+--------------+
|userId|movieId|n_user_ratings|n_item_ratings|bp_predicted_rating|n_used_item_ratings|predicted_i2i_rating|has_i2i_rating|
+------+-------+--------------+--------------+-------------------+-------------------+--------------------+--------------+
|    28|   2111|           335|            37| 3.1485874853613227|                  4|  3.2485096794471175|             1|
|    30|  22037|            28|         32123| 3.1463153925797798|                  3|    4.66988878407485|             1|
+------+-------+--------------+--------------+-------------------+-------------------+--------------------+--------------+
only showing top 2 rows



# Save for ensemble checks

In [29]:
# Coalesce to 8 partitions before exporting; the name is rebound to the coalesced frame.
eval_data = eval_data.coalesce(8)

In [30]:
#eval_data.write.parquet("lab_12/cache/full_test_bp_i2i_top100", mode='overwrite')

# Export as csv for submission

In [31]:
# Pull the submission columns into a local pandas DataFrame.
# NOTE: toPandas() brings every row into this process; the recorded shape of
# the result is (10531564, 4), so this needs a correspondingly large amount of memory.
res_pd = eval_data.select("userId", "movieId", "bp_predicted_rating", "predicted_i2i_rating").toPandas()

In [32]:
#res_pd.to_csv("/data/home/taras.svirsky/lab12/res/full_test_bp_i2i_top100.csv", index=False)

In [37]:
# Sanity check: (rows, columns) of the frame prepared for export.
res_pd.shape

(10531564, 4)