In [9]:
import pandas as pd


df_interactions = pd.read_csv(
    '../docs/coolbet/data/prod/interactions.csv',
    usecols=['id', 'user_id', 'match_id', 'created_at'],
    index_col=0,
    parse_dates=['created_at']
)
df_interactions = df_interactions[df_interactions['created_at'] > '2022-11-01']
df_interactions.sort_values(by=['created_at'])
df_interactions['rating'] = 1
df_interactions.drop(columns=['created_at'], inplace=True)
df_interactions.head()

Unnamed: 0_level_0,user_id,match_id,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
daa0ddc3-eaaf-44aa-9977-7266d2cbbd35,65033,2336348,1
c7823ad9-26aa-4d41-b671-dfd5151ecdbf,47630,2333998,1
ac15efdf-0d1d-472f-8d0b-74026c2885f0,63120,2018954,1
53d438c9-62f0-4984-a3ac-a7cc953e4026,52358,2329228,1
4496ddf6-d776-4a93-905c-09a8ea631f04,52358,2018954,1


In [None]:
df_train = df_interactions[:int(len(df_interactions)*0.8)]
df_test = df_interactions[int(len(df_interactions)*0.8):]

df_train.head()

Unnamed: 0_level_0,user_id,match_id,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
daa0ddc3-eaaf-44aa-9977-7266d2cbbd35,65033,2336348,1
c7823ad9-26aa-4d41-b671-dfd5151ecdbf,47630,2333998,1
ac15efdf-0d1d-472f-8d0b-74026c2885f0,63120,2018954,1
53d438c9-62f0-4984-a3ac-a7cc953e4026,52358,2329228,1
4496ddf6-d776-4a93-905c-09a8ea631f04,52358,2018954,1


: 

: 

In [None]:
df_test.head()

Unnamed: 0_level_0,user_id,match_id,stake_eur,created_at,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
77d2443e-9c9e-451f-9188-27e6a5ff7a30,h1zDX7Tt,309359,0.33,2022-10-10 23:50:39,1
7794bf4b-107e-430c-a54a-d52d318edbc1,h1zDX7Tt,309370,0.33,2022-10-10 23:50:39,1
547b80c1-5728-46eb-b993-f12d5845ca2c,h1zDX7Tt,309379,0.33,2022-10-10 23:50:39,1
7dc9c86f-079a-432b-a051-f549c308cb9e,h1zDX7Tt,309366,0.33,2022-10-10 23:50:39,1
d010f3cd-62b2-4d31-bbdf-7b8567868d17,h1zDX7Tt,309370,0.33,2022-10-10 23:50:39,1


: 

In [8]:
from surprise.dataset import Reader, Dataset, DatasetUserFolds

reader = Reader(rating_scale=(0, 1))

trainset = Dataset.load_from_df(
    df_train,
    reader=reader
)
trainset = trainset.build_full_trainset()

testset = Dataset.load_from_df(
    df_test,
    reader=reader
)
testset = testset.build_full_trainset()
testset = testset.build_testset()

len(df_interactions), trainset.n_ratings, len(testset)

ValueError: too many values to unpack (expected 3)

In [17]:
from surprise.prediction_algorithms.matrix_factorization import SVD

model = SVD(n_factors=50)
model.fit(trainset)

pred = model.test(testset)
pred[:5]

[Prediction(uid=47536, iid=2305521, r_ui=1.0, est=0.9991856715889266, details={'was_impossible': False}),
 Prediction(uid=47536, iid=2327789, r_ui=1.0, est=0.9997085183510497, details={'was_impossible': False}),
 Prediction(uid=47536, iid=2333365, r_ui=1.0, est=0.991278450650175, details={'was_impossible': False}),
 Prediction(uid=30740, iid=2305143, r_ui=1.0, est=1, details={'was_impossible': False}),
 Prediction(uid=30740, iid=2305520, r_ui=1.0, est=1, details={'was_impossible': False})]

## Hit Rate

In [18]:
def hit_rate(y, y_hat, L = 10):
    hit = 0
    total = 0

    for user_id, item_id, _ in y:
        user_pred = [x for x in y_hat if x.uid == user_id]
        user_pred = sorted(user_pred, key=lambda x: x.est, reverse=True)[:L]
        
        total += 1
        if item_id in [x.iid for x in user_pred]:
            hit += 1
    return hit  / total


hit_rate(testset, pred)

0.6521124865446717

## MRR / ARHR

In [19]:
def mean_repricoral_rank(y, y_hat, L = 10):
    result = 0

    user_ids = set([x[0] for x in y])
    for user_id in user_ids:
        user_pred = [x for x in y_hat if x.uid == user_id]
        user_pred = sorted(user_pred, key=lambda x: x.est, reverse=True)[:L]
        
        user_test_iids = {x[1] for x in y if x[0] == user_id}
        
        repricoral_rank = 0
        for rank, item in enumerate(user_pred):
            if item.iid in user_test_iids:
                repricoral_rank = 1 / (rank + 1)
        result += repricoral_rank

    return result / len(user_ids)

mean_repricoral_rank(testset, pred)

0.3351429547918661

## MAP

In [20]:
def mean_average_precision(y, y_hat, L = 10):
    map = 0
    
    user_ids = set([x[0] for x in y])
    for user_id in user_ids:
        user_pred = [x for x in y_hat if x.uid == user_id]
        user_pred = sorted(user_pred, key=lambda x: x.est, reverse=True)[:L]
        
        user_test_iids = {x[1] for x in y if x[0] == user_id}

        ap = 0
        n_relevant = 0
        for i, item in enumerate(user_pred):
            rel_k = 1 if item.iid in user_test_iids else 0
            n_relevant += rel_k
            precision_k = n_relevant / (i + 1)
            ap += precision_k * rel_k
        map += ap
    return map / len(user_ids)

mean_average_precision(testset, pred)

5.500636402206195

In [21]:
def eval_model(y, y_hat):
    print(' | '.join([
        f'Hit Rate: {hit_rate(y, y_hat):.4f}',
        f'MRR: {mean_repricoral_rank(y, y_hat):.4f}',
        f'MAP: {mean_average_precision(y, y_hat):.4f}'
    ]))

In [22]:
from surprise.prediction_algorithms.matrix_factorization import SVD

model = SVD()
model.fit(trainset)
pred = model.test(testset)
eval_model(testset, pred)

Hit Rate: 0.7411 | MRR: 0.3351 | MAP: 5.5006


In [23]:
from surprise.prediction_algorithms.matrix_factorization import SVDpp

# optimized for implicit ratings
model = SVDpp()
model.fit(trainset)
pred = model.test(testset)
eval_model(testset, pred)

Hit Rate: 0.7270 | MRR: 0.3351 | MAP: 5.5006


In [24]:
from surprise.prediction_algorithms.matrix_factorization import NMF

# SVD, but latent factors are forced to be non-negative
model = NMF()
model.fit(trainset)
pred = model.test(testset)
eval_model(testset, pred) # not suitable for implicit feedback

Hit Rate: 0.7514 | MRR: 0.3351 | MAP: 5.5006


In [25]:
from surprise.prediction_algorithms.slope_one import SlopeOne

model = SlopeOne()
model.fit(trainset)
pred = model.test(testset)
eval_model(testset, pred) # test on real data, but likely not scalable and more suitable for explicit feedback

Hit Rate: 0.8269 | MRR: 0.3351 | MAP: 5.5006


## Spark

In [26]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .appName('nrocinu')\
    .getOrCreate()
    
spark_df = spark.createDataFrame(df_interactions)
spark_df.printSchema()
spark_df.show()

22/11/01 19:30:24 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.5.173 instead (on interface wlp0s20f3)
22/11/01 19:30:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/01 19:30:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


root
 |-- user_id: long (nullable = true)
 |-- match_id: long (nullable = true)
 |-- rating: long (nullable = true)

+-------+--------+------+
|user_id|match_id|rating|
+-------+--------+------+
|  65033| 2336348|     1|
|  47630| 2333998|     1|
|  63120| 2018954|     1|
|  52358| 2329228|     1|
|  52358| 2018954|     1|
|  10590| 2329828|     1|
|  36422| 2321339|     1|
|  36422| 2321340|     1|
|  36422| 2321341|     1|
|  36422| 2321342|     1|
|  36422| 2321976|     1|
|  36422| 2322630|     1|
|  36422| 2322631|     1|
|  36422| 2323061|     1|
|  36422| 2324424|     1|
|  36422| 2324425|     1|
|  36422| 2324426|     1|
|  39281| 2329813|     1|
|  39281| 2329830|     1|
|  39281| 2018948|     1|
+-------+--------+------+
only showing top 20 rows



                                                                                

In [27]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

indexer = [StringIndexer(inputCol=column, outputCol=f'{column}_index') for column in list(set(spark_df.columns) - set(['rating']))]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(spark_df).transform(spark_df)
transformed.show()

                                                                                

+-------+--------+------+-------------+--------------+
|user_id|match_id|rating|user_id_index|match_id_index|
+-------+--------+------+-------------+--------------+
|  65033| 2336348|     1|      13775.0|         204.0|
|  47630| 2333998|     1|       3031.0|         143.0|
|  63120| 2018954|     1|      15501.0|          20.0|
|  52358| 2329228|     1|         66.0|          29.0|
|  52358| 2018954|     1|         66.0|          20.0|
|  10590| 2329828|     1|       4240.0|         106.0|
|  36422| 2321339|     1|         79.0|          65.0|
|  36422| 2321340|     1|         79.0|          41.0|
|  36422| 2321341|     1|         79.0|          90.0|
|  36422| 2321342|     1|         79.0|          74.0|
|  36422| 2321976|     1|         79.0|         172.0|
|  36422| 2322630|     1|         79.0|         439.0|
|  36422| 2322631|     1|         79.0|         556.0|
|  36422| 2323061|     1|         79.0|         595.0|
|  36422| 2324424|     1|         79.0|         596.0|
|  36422| 

In [28]:
train, test = transformed.randomSplit([0.8, 0.2])

model = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="user_id_index",
    itemCol="match_id_index",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True
)
model = model.fit(train)


In [29]:
from surprise import Prediction

pred = model.transform(test)
pred = pred.rdd.map(lambda row: Prediction(row['user_id'], row['match_id'], 1, row['prediction'], {})).collect()
testset = test.rdd.map(lambda row: (row['user_id'], row['match_id'], row['rating'])).collect()

                                                                                

In [30]:
eval_model(testset, pred)

Hit Rate: 0.7907 | MRR: 0.5168 | MAP: 3.1808
