# Required modules and environment parameters

In [1]:
# Select matplotlib's inline backend so figures are rendered in the notebook,
# directly below the cell that produces them.
%matplotlib inline

In [2]:
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load IPython's autoreload extension and switch it to mode 2: imported modules
# are reloaded before each cell runs, so edits to the project's `src` packages
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys

# Add the parent directory to the import search path so that the `src.*`
# packages imported by this notebook can be resolved.
# The membership check keeps this cell idempotent: re-running it (common in a
# notebook session) no longer piles up duplicate "../" entries in sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [5]:
from src.data_pipeline.DataLoader import DataLoader
from src.utility.sys_utils import get_spark

# import your model
from src.model.ALS_MF import ALS_MF
from src.model.KNN import KNN
from src.model.BaseLine import BaseLine

# import training pipeline
from src.data_pipeline.pipeline import cross_validation, test_evaluation

# import result checking tools
from src.utility.Summary import Summary

# Load the whole dataset from the text file

In [48]:
# Obtain the Spark handle from the project helper, requesting 4 cores
# (presumably a SparkSession — confirm in src.utility.sys_utils).
spark = get_spark(cores=4) # change cores up to 6 if needed
# Build the loader for the "ml-20m-5p" dataset. In the saved run it reports a
# sampled subset of 68,969 records and a (0, 0.2) test split of 19,365 records.
data_loader = DataLoader(spark, "ml-20m-5p")

Using sampled subset with 6.896900E+04 records
Using split of range (0, 0.2), test set contains 19365 of 68969 records in total.


In [None]:
# Fetch the raw data from the loader.
# NOTE(review): this cell has no execution count (In [None]), i.e. it was not
# run in the saved session, and `raw_data` is not used by any other cell that
# is visible in this notebook — confirm whether it is still needed.
raw_data = data_loader.get_raw_data()

# Construct Model

In [52]:
# Hyper-parameters handed to the ALS_MF constructor for the cross-validation run.
# NOTE(review): the saved cross-validation log in this notebook writes to a
# "...num_neg_3..." directory, so that output was produced with a different
# num_neg than the value set here — re-run before relying on it.
als_params = dict(rank=64, maxIter=15, regParam=0.05, num_neg=0)

In [53]:
# Instantiate ALS_MF with the hyper-parameters defined in `als_params`; this is
# the model instance passed to cross_validation further down.
als = ALS_MF(als_params)

In [7]:
# Build a KNN model with k=20, then display the value returned by get_name()
# as the cell output (the saved output is 'KNN').
knn = KNN(dict(k=20))
knn.get_name()

'KNN'

In [10]:
# Build the baseline model, selecting its "count_uni_rating" variant through
# the "model" entry of the parameter dict.
bl = BaseLine(dict(model="count_uni_rating"))

# Cross validation

In [54]:
# Cross-validate the ALS_MF instance with k_fold=5 and top_k=10.
# The return value is shown as the cell output: in the saved run it is a
# defaultdict mapping each metric name (ndcg@10, precision@10) to a list of five
# values, presumably one per fold.
cross_validation(data_loader, als, spark, k_fold=5, top_k=10)

Using split of range [0.0, 0.2], test set contains 14810 of 49604 records in total.
Creating directory and start writing ...
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_3-rank_64-regParam_0.05/Rating_fold_0.parquet
Using split of range [0.2, 0.4], test set contains 8402 of 49604 records in total.
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_3-rank_64-regParam_0.05/Ranking_fold_1.parquet
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_3-rank_64-regParam_0.05/Rating_fold_1.parquet
Using split of range [0.4, 0.6], test set contains 8407 of 49604 records in total.
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_3-rank_64-regParam_0.05/Ranking_fold_2.parquet
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_3-rank_64-regParam_0.05/Rating_fold_2.parquet
Using split of range [0.6, 0.8], test set contains 8402 of 49604 records in total.
Rewriting files in /home/ds2019/lo

defaultdict(list,
            {'ndcg@10': [0.01465069888163267,
              0.00761103020982536,
              0.009206919386208878,
              0.007079687370480849,
              0.013305651880724264],
             'precision@10': [0.0041134458687816485,
              0.0023120511609765345,
              0.0027059623454623993,
              0.0021220195587044913,
              0.003707284298569309]})

# Check evaluation results

In [55]:
# Create the Summary helper from the `db_path` stored in the loader's config.
summary = Summary(data_loader.get_config().db_path)
# Tabulate the cross-validation results recorded for "ml-20m-5p", restricted to
# ndcg@10. The saved output has one row per (model, hyper-parameter) combination
# with the columns mean, std and rnk.
summary.summarize_cv("ml-20m-5p", ["ndcg@10"])

Unnamed: 0,model,hyper,metric,mean,std,rnk
1,ALS_MF,"[('maxIter', 15), ('rank', 16), ('regParam', 0...",ndcg@10,0.014002,0.001768,1.0
0,ALS_MF,"[('maxIter', 15), ('num_neg', 3), ('rank', 64)...",ndcg@10,0.009652,0.000994,2.0
2,ALS_MF,"[('maxIter', 15), ('rank', 64), ('regParam', 0...",ndcg@10,0.009602,0.000866,3.0
3,KNN,"[('k', 10)]",ndcg@10,0.004271,0.002636,4.0


# Finally, train the model on all training data and evaluate it on the test data

In [57]:
# Configuration for the final ALS_MF model. rank=16 corresponds to the
# top-ranked ALS_MF row in the cross-validation summary saved in this notebook.
# NOTE(review): that row's (truncated) hyper-parameter list shows no num_neg
# entry, whereas num_neg=0 is set explicitly here — confirm the two are
# equivalent.
best_params = dict(rank=16, maxIter=15, regParam=0.05, num_neg=0)

# Model instance built from that configuration, used for the final evaluation.
als_final = ALS_MF(best_params)

In [58]:
# Evaluate `als_final` with top_k=10. In the saved run the call logs
# "Rewriting files in ..." and returns a ChainMap holding ndcg@10 and
# precision@10, which is displayed as the cell output.
# NOTE(review): the exact effect of force_rewrite=True and oracle_type=None is
# defined in src.data_pipeline.pipeline and is not visible here — confirm there.
test_evaluation(data_loader, als_final, spark, top_k=10, force_rewrite=True, oracle_type=None)

Creating directory and start writing ...
Rewriting files in /home/ds2019/log/ml-20m-5p/ALS_MF/maxIter_15-num_neg_0-rank_16-regParam_0.05/Rating_fold_-1.parquet


ChainMap({'ndcg@10': 0.011628949133507396, 'precision@10': 0.00347046207837785}, {})

In [59]:
# Show the test results recorded for ALS_MF on "ml-20m-5p". The saved output
# lists one row per metric with the columns model, hyper, metric, value and ts.
summary.get_model_test_perf("ml-20m-5p", "ALS_MF")

Unnamed: 0_level_0,model,hyper,metric,value,ts
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
362,ALS_MF,"[('maxIter', 15), ('num_neg', 0), ('rank', 16)...",ndcg@10,0.011629,2019-11-03 23:39:45.681431
363,ALS_MF,"[('maxIter', 15), ('num_neg', 0), ('rank', 16)...",precision@10,0.00347,2019-11-03 23:39:45.681709
