In [1]:
import pandas as pd
import numpy as np

In [6]:
df_items = pd.read_csv(
    '../../docs/coolbet/data/prod/items.csv',
    index_col=0,
    dtype={
        'home_team_name': 'category',
        'away_team_name': 'category',
        'league': 'category',
        'region': 'category',
        'sport': 'category'
    },
    usecols=['id', 'home_team_name', 'away_team_name', 'league', 'region', 'sport']
)
df_items.head()

Unnamed: 0_level_0,home_team_name,away_team_name,league,region,sport
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2330802,"Hurrell, J","Walters, S",Icons of Darts Live League,Europe,Darts
2333567,Dinamo-Erbasu Bucuresti,CSO Voluntari,Division A Men,Romania,Basketball
2335031,HSIL,KFUM,Eliteserien,Norway,Futsal
2327153,Piteå HC,Bodens HF,Hockeyettan Norra,Sweden,Ice Hockey
2330716,Dorking Wanderers FC,Aldershot Town,National League,England,Football


In [7]:
df_users = pd.read_csv(
    '../../docs/coolbet/data/prod/users.csv',
    index_col=0,
    usecols=['id', 'country', 'language'],
    dtype={
        'country': 'category',
        'language': 'category',
    },
)
df_users.head()

Unnamed: 0_level_0,language,country
id,Unnamed: 1_level_1,Unnamed: 2_level_1
54829,cl,CL
19376,,CL
35827,fi,FI
57367,fi,FI
52467,,NO


In [8]:
df_interactions = pd.read_csv(
    '../../docs/coolbet/data/prod/interactions.csv',
    usecols=['user_id', 'match_id', 'created_at'],
    parse_dates=['created_at']
)
df_interactions.set_index(['user_id', 'match_id'], inplace=True)
df_interactions.drop_duplicates(inplace=True, keep='last')
df_interactions.reset_index(inplace=True)
df_interactions.head()

Unnamed: 0,user_id,match_id,created_at
0,26380,2276004,2022-10-24 00:00:00.625400+00:00
1,62988,2298988,2022-10-24 00:00:00.937569+00:00
2,68725,2225804,2022-10-24 00:00:01.719828+00:00
3,63815,2324399,2022-10-24 00:00:02.191923+00:00
4,19912,2326566,2022-10-24 00:00:02.778176+00:00


## Content-based recommendations

In [20]:
def generate_category_profiles(key: str):
    df_joined = df_interactions.merge(
        df_items[[key]],
        left_on='match_id',
        right_index=True,
    )
    df_joined = df_joined[['user_id']].join(pd.get_dummies(df_joined[key]))
    df_joined = df_joined.groupby(['user_id']).sum()
    return df_joined.div(df_joined.sum(axis=1), axis=0)

generate_category_profiles('sport').head()

Unnamed: 0_level_0,American Football,Badminton,Bandy,Baseball,Basketball,Beach Volleyball,Boxing,Chess,Cricket,Cross Country Skiing,...,Rugby League,Rugby Union,Ski Jumping,Snooker,Specials,Table Tennis,Tennis,Trotting,Volleyball,eSports
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.526316,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0
2,0.107143,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0


In [21]:
def peek_user_interactions(user_id: str):
    return df_interactions[df_interactions['user_id'] == user_id].merge(
        df_items,
        left_on='match_id',
        right_index=True
    )
    

peek_user_interactions(1).head(10)

Unnamed: 0,user_id,match_id,created_at,home_team_name,away_team_name,league,region,sport
1544,1,2324526,2022-10-24 00:17:54.016465+00:00,Golden State Warriors,Sacramento Kings,NBA,North America,Basketball
10995,1,2326278,2022-10-24 02:34:27.282888+00:00,"Alcaraz Garfia, C","Draper, J",ATP Basel,Switzerland,Tennis
15457,1,2295392,2022-10-24 04:33:01.356802+00:00,Celta de Vigo,Getafe,La Liga,Spain,Football
60285,1,2325418,2022-10-24 18:49:02.641855+00:00,Miami Heat,Toronto Raptors,NBA,North America,Basketball
76454,1,2325418,2022-10-24 21:03:41.265103+00:00,Miami Heat,Toronto Raptors,NBA,North America,Basketball
116066,1,2297368,2022-10-25 02:58:28.798169+00:00,Borussia Dortmund,Manchester City,UEFA Champions League,Europe,Football
121871,1,2297368,2022-10-25 04:44:51.399902+00:00,Borussia Dortmund,Manchester City,UEFA Champions League,Europe,Football
116441,1,2297366,2022-10-25 03:04:03.421179+00:00,RB Leipzig,Real Madrid,UEFA Champions League,Europe,Football
180525,1,2297366,2022-10-25 16:47:14.750540+00:00,RB Leipzig,Real Madrid,UEFA Champions League,Europe,Football
121963,1,2326533,2022-10-25 04:46:37.289907+00:00,Oklahoma City Thunder,Los Angeles Clippers,NBA,North America,Basketball


In [26]:
# generate recommendations for user_id 1

def generate_item_profiles(key: str):
    return pd.get_dummies(df_items[key])

def train():
    user_profiles = {}
    item_profiles = {}

    keys = ['sport', 'region', 'league']
    for key in keys:
        user_profiles[key] = generate_category_profiles(key)
        item_profiles[key] = generate_item_profiles(key)
        
    # TODO: fix OOM - try sparse matrix for team attributes 
    # home_team_profiles = generate_category_profiles('home_team_name')
    # away_team_profiles = generate_category_profiles('away_team_name')
    # user_profiles = home_team_profiles + away_team_profiles

    # item_profiles = pd.get_dummies(df_items['home_team_name']).concat(df_items['away_team_name'])

    return (keys, user_profiles, item_profiles)


In [27]:
keys, user_profiles, item_profiles = train()

In [31]:
def recommend_items_for_user(user_id: str):
    # TODO: use ensemble model when loss function is defined.
    # until then all categorical attributes have equal weights.

    result = df_items[[]]

    for key in keys:        
        user_profile = user_profiles[key].loc[user_id]
        scores = (user_profile * item_profiles[key]).sum(axis=1)
        result = result.join(scores.to_frame(key))

    result = result.sum(axis=1).sort_values(ascending=False)
    return result

recommendations = recommend_items_for_user(1)
recommendations.head()

id
2325424    1.578947
2329827    1.578947
2328154    1.578947
2329177    1.578947
2329150    1.578947
dtype: float64

In [32]:
recommendations.to_frame('score').join(df_items)

Unnamed: 0_level_0,score,home_team_name,away_team_name,league,region,sport
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2325424,1.578947,Houston Rockets,Utah Jazz,NBA,North America,Basketball
2329827,1.578947,Charlotte Hornets,Sacramento Kings,NBA,North America,Basketball
2328154,1.578947,New York Knicks,Charlotte Hornets,NBA,North America,Basketball
2329177,1.578947,Brooklyn Nets,Dallas Mavericks,NBA,North America,Basketball
2329150,1.578947,Brooklyn Nets,Indiana Pacers,NBA,North America,Basketball
...,...,...,...,...,...,...
2324347,0.000000,Southern Mississippi Golden Eagles,Louisiana-Lafayette Ragin' Cajuns,NCAA,USA,American Football
2332421,0.000000,Edmonton Oil Kings,Calgary Hitmen,Western Hockey League,Canada,Ice Hockey
2331245,0.000000,"Jacoby, D","Rountree, K",UFC,UFC,MMA
2329815,0.000000,For The Win eSports,Enterprise,[CSGO] CCT Central Europe,CSGO,eSports


In [33]:
# what the user has interacted with so far
peek_user_interactions(1)

Unnamed: 0,user_id,match_id,created_at,home_team_name,away_team_name,league,region,sport
1544,1,2324526,2022-10-24 00:17:54.016465+00:00,Golden State Warriors,Sacramento Kings,NBA,North America,Basketball
10995,1,2326278,2022-10-24 02:34:27.282888+00:00,"Alcaraz Garfia, C","Draper, J",ATP Basel,Switzerland,Tennis
15457,1,2295392,2022-10-24 04:33:01.356802+00:00,Celta de Vigo,Getafe,La Liga,Spain,Football
60285,1,2325418,2022-10-24 18:49:02.641855+00:00,Miami Heat,Toronto Raptors,NBA,North America,Basketball
76454,1,2325418,2022-10-24 21:03:41.265103+00:00,Miami Heat,Toronto Raptors,NBA,North America,Basketball
116066,1,2297368,2022-10-25 02:58:28.798169+00:00,Borussia Dortmund,Manchester City,UEFA Champions League,Europe,Football
121871,1,2297368,2022-10-25 04:44:51.399902+00:00,Borussia Dortmund,Manchester City,UEFA Champions League,Europe,Football
116441,1,2297366,2022-10-25 03:04:03.421179+00:00,RB Leipzig,Real Madrid,UEFA Champions League,Europe,Football
180525,1,2297366,2022-10-25 16:47:14.750540+00:00,RB Leipzig,Real Madrid,UEFA Champions League,Europe,Football
121963,1,2326533,2022-10-25 04:46:37.289907+00:00,Oklahoma City Thunder,Los Angeles Clippers,NBA,North America,Basketball
