In [2]:
from lib.spark import spark

# Load the raw interactions CSV; the first row is the header and column
# types are inferred by Spark.
df = spark.read.options(header=True, inferSchema=True).csv(
    '../../docs/coolbet/data/prod/interactions.csv'
)

                                                                                

In [3]:
# Preview the first five rows of the loaded interactions.
df.show(5)

+--------------------+-------+--------+--------------------+-------------+--------------------+
|                  id|user_id|match_id|              bet_id|bet_stake_eur|          created_at|
+--------------------+-------+--------+--------------------+-------------+--------------------+
|3b0b3985-5b33-4e3...|  26380| 2276004|0bf6249c-22b4-4d2...|         4.16|2022-10-24 03:00:...|
|6310c975-c1c9-415...|  62988| 2298985|08ef26b6-d96c-474...|         8.39|2022-10-24 03:00:...|
|e62b7665-a7d5-441...|  62988| 2298984|08ef26b6-d96c-474...|         8.39|2022-10-24 03:00:...|
|766c2d21-6146-4ba...|  62988| 2297370|08ef26b6-d96c-474...|         8.39|2022-10-24 03:00:...|
|0149d94f-6db0-496...|  62988| 2297369|08ef26b6-d96c-474...|         8.39|2022-10-24 03:00:...|
+--------------------+-------+--------+--------------------+-------------+--------------------+
only showing top 5 rows



In [4]:
# Number of interaction rows per bet_id, exposed as the column `n_selections`.
rows_per_bet = df.groupBy('bet_id').count()
n_selections_by_bet_id = rows_per_bet.withColumnRenamed('count', 'n_selections')
n_selections_by_bet_id.printSchema()

root
 |-- bet_id: string (nullable = true)
 |-- n_selections: long (nullable = false)



In [6]:
import pyspark.sql.functions as F

# Largest bet_stake_eur observed for each user_id.
max_stake = F.max('bet_stake_eur').alias('bet_stake_eur_max')
bet_stake_eur_max_by_user_id = df.groupBy('user_id').agg(max_stake)
bet_stake_eur_max_by_user_id.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- bet_stake_eur_max: double (nullable = true)



In [7]:
# Attach the per-bet row count and the per-user maximum stake to every
# interaction row (both joins use the default inner join).
# NOTE(review): `n_selections` is joined in here but is not referenced by the
# projection below - confirm whether it was meant to feed into the rating.
with_bet_counts = df.join(n_selections_by_bet_id, on='bet_id')
with_user_max = with_bet_counts.join(bet_stake_eur_max_by_user_id, on='user_id')

# rating = this row's stake as a fraction of the user's own largest stake.
# NOTE(review): this rebinds `df`, so the cell cannot be re-run without first
# re-running the cell that loads the raw interactions.
df = with_user_max.selectExpr(
    'user_id',
    'match_id',
    'bet_stake_eur / bet_stake_eur_max AS rating',
)
df.show(5)

                                                                                

+-------+--------+--------------------+
|user_id|match_id|              rating|
+-------+--------+--------------------+
|  68098| 2018940|0.018796992481203006|
|  36224| 2333712|  0.1087450312322544|
|  36224| 2330604|  0.1087450312322544|
|  69637| 2297115|  0.6666666666666666|
|   6658| 2189458| 0.24800671422576584|
+-------+--------+--------------------+
only showing top 5 rows



In [8]:
# Hold out roughly 20% of the rated interactions for evaluation.
# A fixed seed is passed so the split does not change on every kernel restart;
# without it the reported metric below cannot be reproduced.
# NOTE(review): `df` is the result of shuffle joins; if the split still varies
# between runs, cache/persist `df` before splitting - confirm.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [18]:
from pyspark.ml.recommendation import ALS

# Configure an implicit-feedback ALS recommender over (user_id, match_id, rating).
# The estimator gets its own name (`als`) so that `model` only ever refers to
# the fitted model used by the cells below, and a fixed seed makes the random
# factor initialisation repeatable across runs.
als = ALS(
    userCol="user_id",
    itemCol="match_id",
    ratingCol="rating",
    coldStartStrategy="drop",  # drop rows whose prediction would be NaN
    implicitPrefs=True,
    seed=42,
)
model = als.fit(train)

                                                                                

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out interactions and compute RMSE between the `rating`
# column and the model's `prediction` column.
# NOTE(review): the ALS model is fitted with implicitPrefs=True; RMSE against
# the raw rating may not be a meaningful metric for implicit feedback
# (the recorded value is far larger than the rating scale) - confirm.
pred = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(pred)
rmse

                                                                                

11.543335140339533

In [20]:
from pyspark.sql.functions import explode

# Take up to five distinct users and ask the model for their top-4 matches.
# No ordering is applied before limit(5), so which users are picked is
# whatever Spark returns first.
users = df.select(model.getUserCol()).distinct().limit(5)
user_recs = model.recommendForUserSubset(users, 4)

# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that only adds a stray "None" line to the output).
user_recs.show(n=5)

# Flatten the per-user array of recommendation structs into one row per
# (user_id, match_id) with the predicted rating.
result = (
    user_recs
    .withColumn('recommendations', explode(user_recs.recommendations))
    .select('user_id', 'recommendations.match_id', 'recommendations.rating')
)

result.show(n=5)

                                                                                

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|  18944|[{2297366, 1.5231...|
|  36224|[{2187870, 0.9611...|
|  69637|[{2303562, 1.0657...|
|   6658|[{2319745, 0.4947...|
|  68098|[{2297366, 0.8162...|
+-------+--------------------+

None


                                                                                

+-------+--------+---------+
|user_id|match_id|   rating|
+-------+--------+---------+
|  18944| 2297366|1.5231276|
|  18944| 2297368|1.5015057|
|  18944| 2297365|1.4159067|
|  18944| 2297369|1.3904393|
|  36224| 2187870|0.9611831|
+-------+--------+---------+
only showing top 5 rows



In [21]:
# Load the match catalogue and keep only the descriptive columns, renaming
# `id` to `match_id` so it lines up with the interactions/recommendations.
items_raw = spark.read.csv(
    '../../docs/coolbet/data/prod/items.csv',
    header=True,
    inferSchema=True,
)
df_matches = items_raw.selectExpr('id AS match_id', 'name', 'league', 'region', 'sport')
df_matches.show(5)

+--------+--------------------+--------------------+-------+----------+
|match_id|                name|              league| region|     sport|
+--------+--------------------+--------------------+-------+----------+
| 2330802|Hurrell, J - Walt...|Icons of Darts Li...| Europe|     Darts|
| 2333567|Dinamo-Erbasu Buc...|      Division A Men|Romania|Basketball|
| 2335031|         HSIL - KFUM|         Eliteserien| Norway|    Futsal|
| 2327153|Piteå HC - Bodens HF|   Hockeyettan Norra| Sweden|Ice Hockey|
| 2330716|Dorking Wanderers...|     National League|England|  Football|
+--------+--------------------+--------------------+-------+----------+
only showing top 5 rows



In [22]:
# Enrich each recommended match with its name, league, region and sport
# (inner join on match_id), then display the result.
recommended_matches = result.join(df_matches, on='match_id')
recommended_matches.show()

                                                                                

+--------+-------+----------+--------------------+--------------------+-------+--------+
|match_id|user_id|    rating|                name|              league| region|   sport|
+--------+-------+----------+--------------------+--------------------+-------+--------+
| 2297366|  18944| 1.5231276|RB Leipzig - Real...|UEFA Champions Le...| Europe|Football|
| 2297368|  18944| 1.5015057|Borussia Dortmund...|UEFA Champions Le...| Europe|Football|
| 2297365|  18944| 1.4159067|Dinamo Zagreb - A...|UEFA Champions Le...| Europe|Football|
| 2297369|  18944| 1.3904393|SL Benfica - Juve...|UEFA Champions Le...| Europe|Football|
| 2187870|  36224| 0.9611831|Arsenal - Notting...|      Premier League|England|Football|
| 2303534|  36224|0.89805305|Real Madrid - Girona|             La Liga|  Spain|Football|
| 2319745|  36224| 0.8768701|   Torino - AC Milan|             Serie A|  Italy|Football|
| 2187877|  36224|0.84548813|Manchester United...|      Premier League|England|Football|
| 2303562|  69637| 1.