In [1]:
import os
import sys
# Extra arguments for the PySpark launcher: pull in the spark-csv package and start the shell.
os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'

# Fail early with a readable message instead of a TypeError on `None + "/python"`.
spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise EnvironmentError("SPARK_HOME is not set; point it at the Spark installation before running this notebook")

# Make the bundled pyspark package importable, then run Spark's interactive shell bootstrap,
# which creates the `spark` session and `sc` context used by the rest of the notebook.
sys.path.insert(0, os.path.join(spark_home, "python"))
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.6 (default, Oct 26 2016 20:30:19)
SparkSession available as 'spark'.


In [21]:
import pandas as pd
import numpy as np

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

## Data Load

In [4]:
# Explicit schema for the (user, item, purchase) rating files, so Spark does not have to infer types.
ratings_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", DoubleType(), True),
])

# Full training set as a DataFrame.
train = spark.read.csv("/labs/lab10data/lab10_train.csv", header=True, schema=ratings_schema)

In [6]:
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|     0.0|
|   1654|  89249|     0.0|
|   1654|  99982|     0.0|
|   1654|  89901|     0.0|
|   1654| 100504|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [5]:
# Labelled hold-out set, read with the same (user_id, item_id, purchase) schema as the
# training file. Note the path is relative to the notebook's working directory.
true = spark.read.csv("lab10_true.csv", header=True, schema=StructType(
                            [StructField("user_id", IntegerType(), True),
                             StructField("item_id", IntegerType(), True),
                             StructField("purchase", DoubleType(), True)]))

In [15]:
true.show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|     0.0|
|   1654|  93629|     0.0|
|   1654|   9980|     0.0|
|   1654|  95099|     0.0|
|   1654|  11265|     0.0|
|   1654|  88896|     0.0|
|   1654|  67740|     0.0|
|   1654|  74271|     0.0|
|   1654|  99871|     0.0|
|   1654|  78570|     0.0|
|   1654|  71942|     0.0|
|   1654|  74367|     0.0|
|   1654|  98628|     0.0|
|   1654|  95887|     0.0|
|   1654|  77795|     0.0|
|   1654|  75152|     0.0|
|   1654|  74905|     0.0|
|   1654|   9068|     0.0|
|   1654|  72954|     0.0|
|   1654| 102431|     0.0|
+-------+-------+--------+
only showing top 20 rows



In [18]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Load the training ratings as an RDD of mllib `Rating(user, product, rating)` objects.
train_lines = sc.textFile("/labs/lab10data/lab10_train.csv")
train_header = train_lines.first()

# RDD transformations are lazy and the filter function is serialized only when an action runs.
# Binding the header as a default argument freezes THIS file's header inside the function,
# so reassigning a global header variable in another cell cannot change what gets filtered out.
train_rdd = train_lines.filter(lambda x, header=train_header: x != header) \
                       .map(lambda l: l.split(',')) \
                       .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

def normalize_predictions(pred, coef=0.0007):
    """Clip small scores to zero.

    Returns 0.0 when `pred` is strictly below the cutoff `coef`;
    otherwise returns `pred` unchanged.
    """
    return 0.0 if pred < coef else pred

In [7]:
# Load the labelled hold-out ratings as an RDD of mllib `Rating(user, product, rating)` objects.
test_lines = sc.textFile("lab10_true.csv")
test_header = test_lines.first()

# The filter is evaluated lazily, so the header is bound as a default argument: the function
# keeps comparing against this file's header even if a global header variable is reassigned
# by another cell before an action runs.
test_rdd = test_lines.filter(lambda x, header=test_header: x != header) \
                     .map(lambda l: l.split(',')) \
                     .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
test_rdd.take(5)

[Rating(user=1654, product=94814, rating=0.0),
 Rating(user=1654, product=93629, rating=0.0),
 Rating(user=1654, product=9980, rating=0.0),
 Rating(user=1654, product=95099, rating=0.0),
 Rating(user=1654, product=11265, rating=0.0)]

In [8]:
# Hold-out pairs without the label: keep only (user, product) so they can be passed to predictAll.
unlabeled_test_rdd = test_rdd.map(lambda x: (x[0], x[1])) # holdout

## Train-Validation Split

In [41]:
# Three independent 95% / 5% train/validation splits of the same ratings RDD.
# Only the seed differs between them, and the seed determines which rows land in validation.
split_weights = [95, 5]
tr_rdd0, labeled_val_rdd0 = train_rdd.randomSplit(split_weights, seed=100)
tr_rdd1, labeled_val_rdd1 = train_rdd.randomSplit(split_weights, seed=1000)
tr_rdd2, labeled_val_rdd2 = train_rdd.randomSplit(split_weights, seed=2)


def _user_item_pair(rating):
    """Drop the label: keep only the (user, product) part of a rating."""
    return (rating[0], rating[1])


# Unlabelled validation pairs, one RDD per split.
unlabeled_val_rdd0 = labeled_val_rdd0.map(_user_item_pair)
unlabeled_val_rdd1 = labeled_val_rdd1.map(_user_item_pair)
unlabeled_val_rdd2 = labeled_val_rdd2.map(_user_item_pair)

In [10]:
# Training part of split #0 as a DataFrame (replaces the full-file `train` DataFrame loaded earlier).
# The split is named explicitly: a bare `tr_rdd` is not defined by any cell above and would only
# exist as a variable leaked from a later cell, which breaks Restart & Run All.
train = tr_rdd0.toDF(schema=StructType([StructField("user_id", IntegerType(), True),
                           StructField("item_id", IntegerType(), True),
                           StructField("purchase", DoubleType(), True)]))

In [11]:
# Validation part of split #0 as a DataFrame.
# The split is named explicitly: a bare `labeled_val_rdd` is not defined by any cell above and
# would only exist as a loop variable leaked from a later cell, which breaks Restart & Run All.
valid = labeled_val_rdd0.toDF(schema=StructType([StructField("user_id", IntegerType(), True),
                           StructField("item_id", IntegerType(), True),
                           StructField("purchase", DoubleType(), True)]))

In [38]:
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|     0.0|
|   1654|  89249|     0.0|
|   1654|  99982|     0.0|
|   1654|  89901|     0.0|
|   1654| 100504|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [41]:
train.count()

4528926

In [39]:
valid.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  66187|     0.0|
|   1654|  84350|     0.0|
|   1654|  83584|     0.0|
|   1654| 100715|     0.0|
|   1654|  74660|     0.0|
+-------+-------+--------+
only showing top 5 rows



## ALS RDD

### Grid Search before training looked like this:

```python
%%time
seed = 2L
iterations = [10,15,25]
regularization_parameters = [0.01, 0.1, 0.5]
ranks = [3,5,8,10]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for it in iterations:
    for rank in ranks:
        for reg in regularization_parameters:
            model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=it, lambda_=reg)
            predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
            rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

            pred_labels = rates_and_preds.map(lambda r: (normalize_predictions(r[1][1]),r[1][0]))
            metrics = BinaryClassificationMetrics(pred_labels)

            auc = metrics.areaUnderROC

            print 'Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'.format(it, rank, reg, auc)

            error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

            print 'For rank %s the RMSE is %s' % (rank, error)
            print ''
```   
         
#### Output:

```
Iter: 10, rank: 3, lambda: 0.01 --- ROC AUC: 0.731639934709
For rank 3 the RMSE is 0.0460338345603

Iter: 10, rank: 3, lambda: 0.1 --- ROC AUC: 0.734514795168
For rank 3 the RMSE is 0.0460816121302

Iter: 10, rank: 3, lambda: 0.5 --- ROC AUC: 0.777328607903
For rank 3 the RMSE is 0.0464224929801

Iter: 10, rank: 5, lambda: 0.01 --- ROC AUC: 0.729200432866
For rank 5 the RMSE is 0.0460663663214

Iter: 10, rank: 5, lambda: 0.1 --- ROC AUC: 0.737563830091
For rank 5 the RMSE is 0.0460465162737

Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.782990899661
For rank 5 the RMSE is 0.0463852176211

Iter: 10, rank: 8, lambda: 0.01 --- ROC AUC: 0.729080948902
For rank 8 the RMSE is 0.0461674935539

Iter: 10, rank: 8, lambda: 0.1 --- ROC AUC: 0.74310150625
For rank 8 the RMSE is 0.0460820215917

Iter: 10, rank: 8, lambda: 0.5 --- ROC AUC: 0.777392459025
For rank 8 the RMSE is 0.0463953593337

Iter: 10, rank: 10, lambda: 0.01 --- ROC AUC: 0.727449790168
For rank 10 the RMSE is 0.0462061898834

Iter: 10, rank: 10, lambda: 0.1 --- ROC AUC: 0.745124098603
For rank 10 the RMSE is 0.0460618873368

Iter: 10, rank: 10, lambda: 0.5 --- ROC AUC: 0.773666927634
For rank 10 the RMSE is 0.0463838665147

Iter: 15, rank: 3, lambda: 0.01 --- ROC AUC: 0.733625236548
For rank 3 the RMSE is 0.0460081756296

Iter: 15, rank: 3, lambda: 0.1 --- ROC AUC: 0.741792649635
For rank 3 the RMSE is 0.0460386549206

Iter: 15, rank: 3, lambda: 0.5 --- ROC AUC: 0.776499441223
For rank 3 the RMSE is 0.0464180265494

Iter: 15, rank: 5, lambda: 0.01 --- ROC AUC: 0.72852852113
For rank 5 the RMSE is 0.046075054415

Iter: 15, rank: 5, lambda: 0.1 --- ROC AUC: 0.741907337317
For rank 5 the RMSE is 0.0460478751819

Iter: 15, rank: 5, lambda: 0.5 --- ROC AUC: 0.776469790115
For rank 5 the RMSE is 0.0463898646245

Iter: 15, rank: 8, lambda: 0.01 --- ROC AUC: 0.727741522873
For rank 8 the RMSE is 0.0461247319256

Iter: 15, rank: 8, lambda: 0.1 --- ROC AUC: 0.748216767839
For rank 8 the RMSE is 0.046062139977

Iter: 15, rank: 8, lambda: 0.5 --- ROC AUC: 0.776448318741
For rank 8 the RMSE is 0.0463956123179

Iter: 15, rank: 10, lambda: 0.01 --- ROC AUC: 0.719892506267
For rank 10 the RMSE is 0.0462075234346

Iter: 15, rank: 10, lambda: 0.1 --- ROC AUC: 0.739951904031
For rank 10 the RMSE is 0.0460849725144

Iter: 15, rank: 10, lambda: 0.5 --- ROC AUC: 0.774371417949
For rank 10 the RMSE is 0.0463880383276

Iter: 25, rank: 3, lambda: 0.01 --- ROC AUC: 0.735054175047
For rank 3 the RMSE is 0.0460087478259

Iter: 25, rank: 3, lambda: 0.1 --- ROC AUC: 0.743852812528
For rank 3 the RMSE is 0.0460401572356

Iter: 25, rank: 3, lambda: 0.5 --- ROC AUC: 0.765828539125
For rank 3 the RMSE is 0.0464590431011

Iter: 25, rank: 5, lambda: 0.01 --- ROC AUC: 0.73399945951
For rank 5 the RMSE is 0.0460800511798

Iter: 25, rank: 5, lambda: 0.1 --- ROC AUC: 0.744554115978
For rank 5 the RMSE is 0.0460531775825

Iter: 25, rank: 5, lambda: 0.5 --- ROC AUC: 0.776661188388
For rank 5 the RMSE is 0.0464027118638

Iter: 25, rank: 8, lambda: 0.01 --- ROC AUC: 0.732513689155
For rank 8 the RMSE is 0.0460869728764

Iter: 25, rank: 8, lambda: 0.1 --- ROC AUC: 0.756796014108
For rank 8 the RMSE is 0.04601012765

Iter: 25, rank: 8, lambda: 0.5 --- ROC AUC: 0.776545226641
For rank 8 the RMSE is 0.04639385306

Iter: 25, rank: 10, lambda: 0.01 --- ROC AUC: 0.714334654793
For rank 10 the RMSE is 0.0461831036839

Iter: 25, rank: 10, lambda: 0.1 --- ROC AUC: 0.746096587359
For rank 10 the RMSE is 0.0460914581093

Iter: 25, rank: 10, lambda: 0.5 --- ROC AUC: 0.775843093541
For rank 10 the RMSE is 0.0463948288111

CPU times: user 7.14 s, sys: 2.55 s, total: 9.69 s
Wall time: 1h 36min 19s
```

In [101]:
# Training the best model
#
# Hyper-parameters are taken from the grid search above. One implicit-feedback ALS model is
# trained per train/validation split, each with its own seed, and scored by ROC AUC on the
# validation part of that same split.

seed = [10,100,1000]
iterations = 10
regularization_parameter = 0.5
rank = 5

als_train0, als_train1, als_train2 = [
    ALS.trainImplicit(tr_rdd, rank, seed=s, iterations=iterations, lambda_=regularization_parameter)
    for s, tr_rdd in zip(seed, [tr_rdd0, tr_rdd1, tr_rdd2])
]

als_models = [als_train0, als_train1, als_train2]
labeled_val_splits = [labeled_val_rdd0, labeled_val_rdd1, labeled_val_rdd2]
unlabeled_val_splits = [unlabeled_val_rdd0, unlabeled_val_rdd1, unlabeled_val_rdd2]

for i, (als_train, labeled_val_rdd, unlabeled_val_rdd) in enumerate(
        zip(als_models, labeled_val_splits, unlabeled_val_splits)):
    # Key predictions and labels by (user, product) so they can be joined.
    predictions_val = als_train.predictAll(unlabeled_val_rdd).map(lambda r: ((r[0], r[1]), r[2]))
    labels_val = labeled_val_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
    labels_predictions_val = labels_val.join(predictions_val)

    # BinaryClassificationMetrics expects (score, label) pairs.
    pred_labels = labels_predictions_val.map(lambda r: (r[1][1],r[1][0]))
    metrics = BinaryClassificationMetrics(pred_labels)
    auc = metrics.areaUnderROC

    print('ALS #: {}, Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'
          .format(i, iterations, rank, regularization_parameter, auc))

ALS #: 0, Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.831064888722
ALS #: 1, Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.825141751005
ALS #: 2, Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.805082927758


In [104]:
# Final test: ALS #0
# Score the hold-out pairs with the model trained on split #0 and report ROC AUC.

# Predictions keyed by (user, product).
test_scores = als_train0.predictAll(unlabeled_test_rdd)
predictions_test = test_scores.map(lambda r: ((r[0], r[1]), r[2]))

# True labels keyed the same way, then joined with the predictions.
test_labels = test_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
labels_predictions_test = test_labels.join(predictions_test)

# BinaryClassificationMetrics expects (score, label) pairs.
pred_labels_test = labels_predictions_test.map(lambda r: (r[1][1],r[1][0]))
metrics = BinaryClassificationMetrics(pred_labels_test)
auc_test = metrics.areaUnderROC

print('Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'.format(iterations, rank, regularization_parameter, auc_test))

Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.823397319514


In [102]:
# Final test: ALS #1
# Score the hold-out pairs with the model trained on split #1 and report ROC AUC.

# Predictions keyed by (user, product) so they can be joined with the labels.
predictions_test = als_train1.predictAll(unlabeled_test_rdd).map(lambda r: ((r[0], r[1]), r[2]))

# (user, product) -> (label, prediction); only pairs present on both sides survive the join.
labels_predictions_test = test_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions_test)

# Reorder to (score, label), the pair order passed to BinaryClassificationMetrics.
pred_labels_test = labels_predictions_test.map(lambda r: (r[1][1],r[1][0]))

metrics = BinaryClassificationMetrics(pred_labels_test)

auc_test = metrics.areaUnderROC

print 'Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'.format(iterations, rank, regularization_parameter, auc_test)

Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.814314117531


In [103]:
# Final test: ALS #2
# Score the hold-out pairs with the model trained on split #2 and report ROC AUC.

# Predictions keyed by (user, product) so they can be joined with the labels.
predictions_test = als_train2.predictAll(unlabeled_test_rdd).map(lambda r: ((r[0], r[1]), r[2]))

# (user, product) -> (label, prediction); only pairs present on both sides survive the join.
labels_predictions_test = test_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions_test)

# Reorder to (score, label), the pair order passed to BinaryClassificationMetrics.
pred_labels_test = labels_predictions_test.map(lambda r: (r[1][1],r[1][0]))

metrics = BinaryClassificationMetrics(pred_labels_test)

auc_test = metrics.areaUnderROC

print 'Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'.format(iterations, rank, regularization_parameter, auc_test)

Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.80361851881


In [105]:
# Bagging
# Combine the three ALS models by summing their scores for every hold-out (user, product) pair.
# The scores are summed, not averaged, so `preds` is on three times the scale of a single model.

# Start every pair at 0.0, keyed by (user, product).
preds = unlabeled_test_rdd.map(lambda r: ((r[0], r[1]), 0.0))

for i, als_train in enumerate([als_train0, als_train1, als_train2]):
    model_preds = als_train.predictAll(unlabeled_test_rdd).map(lambda r: ((r[0], r[1]), r[2]))

    # Inner join: a pair stays in `preds` only if this model returned a prediction for it.
    preds = preds.join(model_preds).map(lambda r: (r[0], float(r[1][0]) + float(r[1][1])))

In [106]:
# ROC AUC of the bagged (summed) scores on the hold-out labels.
# (user, product) -> (label, summed score)
labels_predictions_test = test_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(preds)

# Reorder to (score, label), the pair order passed to BinaryClassificationMetrics.
pred_labels_test = labels_predictions_test.map(lambda r: (r[1][1],r[1][0]))

metrics = BinaryClassificationMetrics(pred_labels_test)

auc_test = metrics.areaUnderROC

print 'Iter: {}, rank: {}, lambda: {} --- ROC AUC: {}'.format(iterations, rank, regularization_parameter, auc_test)

Iter: 10, rank: 5, lambda: 0.5 --- ROC AUC: 0.820694695724


In [None]:
# Leaderboard overfitting

In [107]:
# Sweep the zero-out cutoff of normalize_predictions over [0.0001, 0.001] and report the
# hold-out ROC AUC of the bagged scores for every cutoff.
preds.cache()

# The (label, score) join does not depend on the cutoff, so it is built once and cached
# instead of being recomputed in each of the ten iterations.
labels_predictions_test = test_rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(preds)
labels_predictions_test.cache()

thresholds = np.linspace(0.0001, 0.001, 10, dtype='float32')
for th in thresholds:
    # Scores below `th` become 0.0; the AUC is evaluated inside the iteration, so the lambda
    # sees the current value of `th`.
    pred_labels_test = labels_predictions_test.map(lambda r: (normalize_predictions(r[1][1], coef=th), r[1][0]))
    metrics = BinaryClassificationMetrics(pred_labels_test)
    auc_test = metrics.areaUnderROC
    print('Th: {} --- ROC AUC: {}'.format(th, auc_test))

Th: 9.99999974738e-05 --- ROC AUC: 0.802632181711
Th: 0.000199999994948 --- ROC AUC: 0.804973112584
Th: 0.000300000014249 --- ROC AUC: 0.806815220176
Th: 0.000399999989895 --- ROC AUC: 0.808580293232
Th: 0.000500000023749 --- ROC AUC: 0.810006263879
Th: 0.000600000028498 --- ROC AUC: 0.81084930304
Th: 0.000699999975041 --- ROC AUC: 0.811466458439
Th: 0.00079999997979 --- ROC AUC: 0.812480345742
Th: 0.00089999998454 --- ROC AUC: 0.813469812561
Th: 0.0010000000475 --- ROC AUC: 0.813855603908


In [108]:
# Same sweep over a wider cutoff range, [0.001, 0.005].
# Reuses `labels_predictions_test` ((label, bagged score) pairs) built by an earlier cell.
thresholds = np.linspace(0.001, 0.005, 10, dtype='float32')
for th in thresholds:
    # Zero out scores below `th`, then compute ROC AUC on (score, label) pairs.
    pred_labels_test = labels_predictions_test.map(lambda r: (normalize_predictions(r[1][1], coef=th), r[1][0]))
    metrics = BinaryClassificationMetrics(pred_labels_test)
    auc_test = metrics.areaUnderROC
    print 'Th: {} --- ROC AUC: {}'.format(th, auc_test)

Th: 0.0010000000475 --- ROC AUC: 0.813855603908
Th: 0.00144444440957 --- ROC AUC: 0.815476906776
Th: 0.00188888888806 --- ROC AUC: 0.816979569086
Th: 0.00233333325014 --- ROC AUC: 0.817385465429
Th: 0.00277777784504 --- ROC AUC: 0.817350554678
Th: 0.00322222220711 --- ROC AUC: 0.8169363357
Th: 0.00366666656919 --- ROC AUC: 0.816029337946
Th: 0.00411111116409 --- ROC AUC: 0.815498556857
Th: 0.00455555552617 --- ROC AUC: 0.813430678387
Th: 0.00499999988824 --- ROC AUC: 0.812065229264


## Gradient Boosting on other features

In [110]:
# Stage 1. Processing items dataset into items_genres and items_year

from pyspark.ml.feature import CountVectorizer, RegexTokenizer, HashingTF, VectorAssembler
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector

# UDF converting a value to int while passing nulls through.
# NOTE(review): not referenced by any cell visible here; the cast below does the conversion.
year_as_int = udf(lambda v: int(v) if v is not None else None, IntegerType())

# Item catalogue: tab-separated file with a header row; only id, genres and year are kept.
items = spark.read.csv("/labs/lab10data/lab10_items.csv", header=True, sep='\t')
items_genres_years = items.select('item_id', 'genres', 'year')
# Placeholders for missing values: '_' for genres, '1899' for year.
items_genres_years = items_genres_years.na.fill({'genres': u'_'})
items_genres_years = items_genres_years.na.fill({'year': u'1899'})
# The year column is cast from its CSV string form to an integer.
items_genres_years = items_genres_years.withColumn('year', items_genres_years.year.cast(IntegerType()))

# With gaps=False the pattern describes the tokens themselves (runs of '_' or Cyrillic letters)
# rather than the separators; case is preserved.
tokenizer = RegexTokenizer(inputCol="genres", outputCol="genre_tokens", gaps=False, pattern=ur"[_А-Яа-яёЁ]+", toLowercase=False)

items_genres_years_tk = tokenizer.transform(items_genres_years)

In [111]:
# Turn each item's genre tokens into a token-count vector; the vocabulary is learned from
# the same items table it is then applied to.
cv = CountVectorizer(inputCol="genre_tokens", outputCol="genre_vector")
cv_model = cv.fit(items_genres_years_tk)
items_features_vec = cv_model.transform(items_genres_years_tk)

In [112]:
items_features_vec.show(5)

+-------+-------+----+------------+---------------+
|item_id| genres|year|genre_tokens|   genre_vector|
+-------+-------+----+------------+---------------+
|  65667|Эротика|2013|   [Эротика]|(96,[22],[1.0])|
|  65669|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65668|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65671|Эротика|2011|   [Эротика]|(96,[22],[1.0])|
|  65670|Эротика|2010|   [Эротика]|(96,[22],[1.0])|
+-------+-------+----+------------+---------------+
only showing top 5 rows



In [113]:
# History vector
# First we reduce our dataset to 1000 most popular films
from pyspark.sql.functions import monotonically_increasing_id

# Number of training rows per item (every row counts, whatever its purchase value).
items_count = train.groupBy('item_id').count().withColumnRenamed('count', 'item_count')
# Keep the 1000 most frequent items and give each one a numeric row id.
# NOTE(review): monotonically_increasing_id() guarantees increasing, unique ids but not
# consecutive ones. item_row_id is later used as a matrix column index, so the 0..999 numbering
# presumably relies on the limited result living in a single partition — confirm, or switch to
# row_number() over an ordered window.
items_desc_count = items_count.orderBy(items_count.item_count.desc()).limit(1000) \
                              .withColumn("item_row_id", monotonically_increasing_id())

# Cached because it is joined against several times below.
items_desc_count.cache()

DataFrame[item_id: int, item_count: bigint, item_row_id: bigint]

In [121]:
items_desc_count.count()

1000

In [150]:
items_desc_count.show(5)

+-------+----------+-----------+
|item_id|item_count|item_row_id|
+-------+----------+-----------+
|  98971|      1351|          0|
|  71740|      1349|          1|
|   5117|      1348|          2|
|   7618|      1348|          3|
|  98069|      1348|          4|
+-------+----------+-----------+
only showing top 5 rows



In [114]:
# Restrict the training rows to the top-1000 items (inner join) and replace item_id with the
# compact item_row_id.
train_truncated = train.join(items_desc_count, 'item_id', 'inner').select('user_id', 'item_row_id', 'purchase')
train_truncated.cache()

DataFrame[user_id: int, item_row_id: bigint, purchase: double]

In [115]:
# Restrict the validation rows to the top-1000 items, mirroring train_truncated: same inner
# join, same (user_id, item_row_id, purchase) columns.
valid_truncated = valid.join(items_desc_count, 'item_id', 'inner').select('user_id', 'item_row_id', 'purchase')
# Cache the frame built in this cell (the original cached train_truncated a second time).
valid_truncated.cache()

DataFrame[user_id: int, item_row_id: bigint, purchase: double]

In [116]:
# Now we build up a history vector: one sparse row per user whose columns are the top-1000
# items (item_row_id) and whose values are the purchase flags.

# Imported here so `from_ml` works without relying on an import made by a later cell.
from pyspark.mllib.linalg import Vectors, VectorUDT as MLlibVectorUDT

# mllib vector -> ml vector (nulls pass through).
as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())

# ml vector -> mllib vector (nulls pass through). The return type is declared explicitly;
# without it the UDF would default to a string column.
from_ml = udf(lambda v: Vectors.fromML(v) if v is not None else None, MLlibVectorUDT())

# (user_id, item_row_id, purchase) triples become entries of a distributed coordinate matrix.
train_entries = train_truncated.rdd.map(lambda r: MatrixEntry(r[0], r[1], r[2]))

train_matrix = CoordinateMatrix(train_entries)

# One indexed row per user; the row index is the user id.
train_row_mat_i = train_matrix.toIndexedRowMatrix()

# Back to a DataFrame with ml-style vectors so it can be joined and assembled with ml features.
train_mat_df = train_row_mat_i.rows.toDF().withColumnRenamed('index', 'user_id') \
                                    .withColumnRenamed('vector', 'history_vec') \
                                    .withColumn("history_vec", as_ml("history_vec"))

In [117]:
train_mat_df.show(10)

+-------+--------------------+
|user_id|         history_vec|
+-------+--------------------+
| 922400|(1000,[1,2,3,5,7,...|
| 940600|(1000,[0,1,2,3,5,...|
| 866400|(1000,[0,1,3,4,5,...|
| 857000|(1000,[1,2,4,5,7,...|
| 899200|(1000,[0,1,3,4,5,...|
| 879401|(1000,[0,1,2,3,5,...|
| 749801|(1000,[0,1,5,6,7,...|
| 792601|(1000,[0,1,2,3,5,...|
| 905401|(1000,[0,2,4,5,7,...|
| 905201|(1000,[1,3,4,5,6,...|
+-------+--------------------+
only showing top 10 rows



In [118]:
# Joining everything together
#
# Every (user, item) row gets the user's history vector (joined on user_id) and the item's
# genre vector and year (joined on item_id). The joins are left joins, so all rating rows are
# kept; a user or item missing from the lookup table ends up with nulls in those columns.

train_joined = train.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, 'item_id', 'left') \
                    .select('user_id', 'item_id', 'history_vec', 'genre_vector', 'year', 'purchase')
        
valid_joined = valid.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, 'item_id', 'left') \
                    .select('user_id', 'item_id', 'history_vec', 'genre_vector', 'year', 'purchase')

# The hold-out set uses the history vectors built from the training rows.
test_joined = true.join(train_mat_df, 'user_id', 'left') \
                    .join(items_features_vec, 'item_id', 'left') \
                    .select('user_id', 'item_id', 'history_vec', 'genre_vector', 'year', 'purchase')
        

train_joined.cache()

DataFrame[user_id: int, item_id: int, history_vec: vector, genre_vector: vector, year: int, purchase: double]

In [119]:
# Building a single vector
# Concatenate the history vector, the genre vector and the year into one "features" column.
# The same assembler is applied to the train, validation and test frames.

vec_assembler = VectorAssembler(inputCols=["history_vec", "genre_vector", 'year'], outputCol="features")
# Variant without the year feature, kept for reference:
# vec_assembler = VectorAssembler(inputCols=["history_vec", "genre_vector"], outputCol="features")
train_joined = vec_assembler.transform(train_joined)
valid_joined = vec_assembler.transform(valid_joined)
test_joined = vec_assembler.transform(test_joined)

In [120]:
# Reduce each frame to the two columns a model needs: `label` (renamed from purchase) and `features`.
train_fm_dataset = train_joined.select('purchase', 'features') \
                               .withColumnRenamed('purchase', 'label')
valid_fm_dataset = valid_joined.select('purchase', 'features') \
                               .withColumnRenamed('purchase', 'label')
    
test_fm_dataset = test_joined.select('purchase', 'features') \
                               .withColumnRenamed('purchase', 'label')

In [None]:
train_fm_dataset.where(train_fm_dataset.features.isNull()).count()

0

In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Gradient-boosted trees fitted as a regressor on the purchase label.
# The prediction column is named "rawPrediction", the column name the
# BinaryClassificationEvaluator cells below read by default, so the regression output can be
# scored as a ranking.
best_gbt = GBTRegressor(maxMemoryInMB=10240, 
                        maxIter=50,
                        subsamplingRate=1.0,
                        maxDepth=9,
                        featuresCol="features",
                        labelCol="label",
                        predictionCol="rawPrediction")
fitted_gbt = best_gbt.fit(train_fm_dataset)

In [None]:
transformed = fitted_gbt.transform(valid_joined)

In [None]:
# Score the validation predictions. `transformed` comes from valid_joined, where the label
# column is still called "purchase"; the score column is left at the evaluator's default.
evaluator = BinaryClassificationEvaluator(labelCol="purchase")
score = evaluator.evaluate(transformed)
score

0.8422141757567635

In [None]:
## subsampling: 1.0; maxDepth: 5
# 10 iter: 0.7671101003103943
# 15 iter: 0.7810317725957958
# 30 iter: 0.7999755632153878

## subsampling: 1.0; maxDepth: 6
# 30 iter: 0.811694625573447

## subsampling: 1.0; maxDepth: 7
# 30 iter: 0.8278831951589867

## subsampling: 1.0; maxDepth: 8
# 30 iter: 0.822975757925702
# 40 iter: 0.836309864699397
# 50 iter: 0.8366130295710614

## subsampling: 1.0; maxDepth: 9
# 50 iter: 0.8425874387927447
# 60 iter: 0.8400094072324349 (year as float)

## subsampling: 1.0; maxDepth: 10
# 60 iter: 0.8345177147366496

In [127]:
postprocess = udf(lambda v: normalize_predictions(v), DoubleType())

In [125]:
transformed_test = fitted_gbt.transform(test_joined)

In [128]:
# Zero out test predictions below normalize_predictions' default cutoff (via the `postprocess`
# UDF) and rename purchase -> label so the evaluator's default label column applies.
transformed_test = transformed_test.withColumn('rawPrediction', postprocess(transformed_test['rawPrediction'])) \
                                   .withColumnRenamed('purchase', 'label')

In [129]:
transformed_test.show()

+-------+-------+--------------------+--------------------+----+-----+--------------------+--------------------+
|user_id|item_id|         history_vec|        genre_vector|year|label|            features|       rawPrediction|
+-------+-------+--------------------+--------------------+----+-----+--------------------+--------------------+
| 782482|  10129|(1000,[0,1,2,3,4,...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[22,457,508...|0.002464763166527495|
| 892108|  10129|(1000,[0,2,3,4,5,...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[187,201,22...|0.013839999679402254|
| 892140|  10129|(1000,[0,4,5,6,7,...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[1000,1001,...|7.037651117619899E-4|
| 879985|  10129|(1000,[0,4,6,9,12...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[715,780,10...|7.037651117619899E-4|
| 905741|  10129|(1000,[1,3,4,5,6,...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[339,701,77...|  0.0080496050143013|
| 927035|  10129|(1000,[0,1,2,3,5,...|(96,[0,1,4,39],[1...|2013|  0.0|(1097,[1000,1001,...|7.037

In [130]:
# Score the post-processed test predictions using the evaluator's default column names
# (the frame was renamed to carry `label` and `rawPrediction`).
evaluator = BinaryClassificationEvaluator()
score2 = evaluator.evaluate(transformed_test)
score2

0.8471244793902535

In [None]:
## subsampling: 1.0; maxDepth: 8
# 30 iter: 0.8249368397566905 -- better than on validation!
# 40 iter: 0.838257894132567

## subsampling: 1.0; maxDepth: 9
# 50 iter: 0.8460460801596608
# 60 iter: 0.8427943930567464 (year as float)

In [334]:
# Export the submission: one (user_id, item_id, purchase) row per hold-out pair, sorted by
# user and item, written as a single CSV part file with a header row.
submission = transformed_test.select('user_id', 'item_id', 'rawPrediction') \
                             .withColumnRenamed('rawPrediction', 'purchase') \
                             .orderBy(['user_id', 'item_id']) \
                             .coalesce(1)

# On Spark 2.x the "com.databricks.spark.csv" format name resolves to the built-in CSV source,
# which DataFrameWriter.csv uses directly.
submission.write.csv('lab10s.csv', header=True)

## LibFM preparation and training

In [163]:
train_fm_dataset.count()

4780428

In [169]:
train_fm_dataset.cache()

DataFrame[label: double, features: vector]

In [375]:
LIBFM_PATH = '/data/home/anton.pilipenko/libfm-1.42.src/bin/libFM'

In [407]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors


# Convert (label, ml features) rows to mllib LabeledPoints; Vectors.fromML turns the ml vector
# into its mllib counterpart.
data_fm_tr = train_fm_dataset.rdd.map(lambda r: LabeledPoint(r[0],Vectors.fromML(r[1])))
data_fm_val = valid_fm_dataset.rdd.map(lambda r: LabeledPoint(r[0],Vectors.fromML(r[1])))

# Written in LibSVM text format for libFM.
# NOTE(review): unlike the test export, these RDDs are not coalesced to one partition, so the
# output presumably consists of several part files — confirm how they are merged before libFM reads them.
MLUtils.saveAsLibSVMFile(data_fm_tr, "train_fm.data")
MLUtils.saveAsLibSVMFile(data_fm_val, "valid_fm.data")

In [256]:
# Test set in LibSVM format; coalesced to a single partition before saving.
data_fm_test = test_fm_dataset.rdd.map(lambda r: LabeledPoint(r[0],Vectors.fromML(r[1]))).coalesce(1)
MLUtils.saveAsLibSVMFile(data_fm_test, "test_fm.data")

In [412]:
data_fm_tr.take(15)

[LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[213,500,501,504,539,596],[1.0,1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 LabeledPoint(0.0, (597,[500,501,504,539,596],[1.0,1.0,1.0,1.0,0.998016856718])),
 Labeled

In [409]:
!$LIBFM_PATH -task c -train train_fm.data -test valid_fm.data -iter 20 -dim '1,1,8' -out output.libfm

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=4780428	num_values=24695766	num_features=598	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=252196	num_values=1302963	num_features=598	min_target=0	max_target=1
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.997829	Test=0.997831	Test(ll)=0.0784925
#Iter=  1	Train=0.997833	Test=0.997835	Test(ll)=0.0632365
#Iter=  2	Train=0.997833	Test=0.997835	Test(ll)=0.0534096
#Iter=  3	Train=0.997833	Test=0.997839	Test(ll)=0.0465388
#Iter=  4	Train=0.997833	Test=0.997839	Test(ll)

In [378]:
!ls

hs_err_pid3747.log  lab10_solution.ipynb  lab10_true.csv
Lab10.ipynb	    Lab10_solution.ipynb  lab10_views_programmes.csv
lab10_items.csv     lab10_test.csv	  output.libfm
lab10_sol.ipynb     lab10_train.csv	  spark-FM-parallelSGD


LibFM training.


anton.pilipenko@master:~/libfm-1.42.src/bin$ ./libFM -task r -train train_fm.data -test valid_fm.data -dim '1,1,5'
#Iter= 99	Train=0.404857	Test=0.410148


anton.pilipenko@master:~/libfm-1.42.src/bin$ ./libFM -task r -train train_fm.data -test valid_fm.data -dim '1,1,8'
#Iter= 99	Train=0.411104	Test=0.409075



In [386]:
val_preds = sc.textFile('output.libfm')

In [388]:
import numpy as np

In [403]:
l = np.loadtxt('labels.txt')

In [400]:
p = np.loadtxt('output.libfm')

In [405]:
l[:10]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [395]:
np.vstack([l,p]).T[:2]

array([[ 0.      ,  0.41474 ],
       [ 0.      ,  0.464436]])