In [7]:
import os
import pyspark

from pyspark import SparkContext
from pyspark.mllib.evaluation import RankingMetrics

sc = SparkContext()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[2]) created by __init__ at <ipython-input-1-15ea84b661dc>:7 

In [8]:
import itertools
import pandas as pd

In [24]:
taste_file = os.path.join('.','data','subset_train_taste_profile.csv')
taste_raw_data = sc.textFile(taste_file)
taste_raw_data_header = taste_raw_data.take(1)[0]
# Remove first row as header, split each row into token
taste_data = taste_raw_data.filter(lambda line: line!=taste_raw_data_header)\
            .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),int(tokens[2]))).cache()

In [25]:
test_file = os.path.join('.','data','subset_test_taste_profile.csv')
test_raw_data = sc.textFile(test_file)
test_raw_data_header = test_raw_data.take(1)[0]
# Remove first row as header, split each row into token
test_data = test_raw_data.filter(lambda line: line!=test_raw_data_header)\
            .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),int(tokens[2]))).cache()

In [26]:
taste_data.count(), test_data.count()

(2538996, 2552008)

In [27]:
# Model selection (ALS parameter tuning)
from pyspark.mllib.recommendation import ALS
import math

seed = 5L
iterations = 10
regularization_parameter = 0.1
rank = 8
errors = 0
err = 0
tolerance = 0.02

model = ALS.trainImplicit(taste_data, rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)

In [None]:
# Evaluate model with Mean average precision
# Prepare prediction (recommendation)
k = 500


userRecommended = model.recommendProductsForUsers(k)
user_reco = userRecommended.map(lambda x: (x[0], [r.product for r in x[1]]))

# Labels data
#user_songs = taste_data.map(lambda x: (x[0], x[1])).groupByKey().mapValues(list)
user_songs = test_data.map(lambda x: (x[0], x[1])).groupByKey().mapValues(list)

predictionAndLabels = user_reco.join(user_songs)
test_predictionAndLabels = predictionAndLabels.map(lambda x: x[1])

metrics = RankingMetrics(test_predictionAndLabels)
print metrics.meanAveragePrecision

In [14]:
0.236833220292

0.236833220292

In [15]:
ranks = [8, 12, 20, 40]
regularization_parameters  = [0.1, 1.0, 10.0]
iterations = [10, 20]

for rank, regularization_param, numIter in itertools.product(ranks, regularization_parameters, iterations):
    print rank, regularization_param, numIter

8 0.1 10
8 0.1 20
8 1.0 10
8 1.0 20
8 10.0 10
8 10.0 20
12 0.1 10
12 0.1 20
12 1.0 10
12 1.0 20
12 10.0 10
12 10.0 20


In [16]:
#ranks = [8, 12, 20, 40]
#regularization_parameters  = [0.1, 1.0, 10.0]
#iterations = [10, 20]
result_list = []
ranks = [8, 12]
regularization_parameters  = [0.1]
iterations = [10]
k = 500

for rank, regularization_param, numIter in itertools.product(ranks, regularization_parameters, iterations):
    model = ALS.trainImplicit(taste_data, rank, iterations=numIter, lambda_=regularization_param)
    userRecommended = model.recommendProductsForUsers(k)
    user_reco = userRecommended.map(lambda x: (x[0], [r.product for r in x[1]]))

    # Labels data
    user_songs = test_data.map(lambda x: (x[0], x[1])).groupByKey().mapValues(list)

    predictionAndLabels = user_reco.join(user_songs)
    test_predictionAndLabels = predictionAndLabels.map(lambda x: x[1])

    metrics = RankingMetrics(test_predictionAndLabels)
    result_list.append((rank, regularization_param, numIter, metrics.meanAveragePrecision, metrics.precisionAt(k), metrics.ndcgAt(k)))
    #print "Rank: " + str(rank)
    #print "Lambda: " + str(regularization_param)
    #print "Iteration: " + str(numIter)
    #print "Mean Average Precision: " + str(metrics.meanAveragePrecision)


In [18]:
result = pd.DataFrame(result_list, columns=['rank','lambda','iteration','map','precision_k','ndcg_k'])

In [17]:
result_list

[(8, 0.1, 10, 0.0008288677125334717, 0.00515624073525052, 0.02057194518240263),
 (12,
  0.1,
  10,
  0.0010158785493724215,
  0.00558790394307738,
  0.02325952648639946)]

In [20]:
result.to_csv('./data/cross_val_result.csv', index=False)

In [23]:
import csv

with open('./data/test2.csv', "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerows(result_list)