General task : 

   - The goal of this task is to create personalized recommendations for football matches
   (events) for users over a one-week period. Personalized recommendations are crucial in
   numerous applications to ensure users are satisfied with the content provided. In this case,
   you will generate personalized recommendations of football matches for a one-week period
   based on the teams each user follows.

Data retrieval

In [41]:
import ast
import datetime
import heapq
import json
import os
import pickle
from collections import defaultdict

import pandas as pd
from clickhouse_driver import Client


In [42]:
# Connection settings are read from the environment so that no secret lives in
# the notebook or in version control. CLICKHOUSE_PASSWORD is required (a missing
# value raises KeyError immediately); the other settings fall back to the
# values this notebook used before.
# NOTE(review): the password that used to be committed here should be rotated.
bqClient = Client(
    user=os.environ.get('CLICKHOUSE_USER', 'mharalovic'),
    password=os.environ['CLICKHOUSE_PASSWORD'],
    host=os.environ.get('CLICKHOUSE_HOST', 'clickhouse.sofascore.ai'),
    port=os.environ.get('CLICKHOUSE_PORT', '9000'),
)

Retrieving all football teams that played a match between '20230101' and '20230630' from the table sports.event.

In [43]:
# SQL for the ids of all teams that played a Football event starting between
# 2023-01-01 and 2023-06-30. Home-team ids and away-team ids are selected by two
# separate SELECTs over sports.event and merged with UNION DISTINCT.
# NOTE(review): the team id comes from a LEFT JOIN on sports.team, so an event whose
# team has no row there would yield a default/NULL id — confirm whether that can occur.
query = """    SELECT t.id
               FROM sports.event AS e
               LEFT JOIN sports.sport AS s ON e.sport_id = toInt8(s.id)
               LEFT JOIN sports.team AS t ON e.hometeam_id = t.id
               WHERE s.name = 'Football'
               AND toYYYYMMDD(e.startdate) BETWEEN '20230101' AND '20230630'
               UNION DISTINCT
               SELECT t1.id
               FROM sports.event AS e
               LEFT JOIN sports.sport AS s ON e.sport_id = toInt8(s.id)
               LEFT JOIN sports.team AS t1 ON e.awayteam_id = t1.id
               WHERE s.name = 'Football'
               AND toYYYYMMDD(e.startdate) BETWEEN '20230101' AND '20230630';
       """

In [44]:
# Run the team-id query through the ClickHouse client and keep the returned rows.
team_ids = bqClient.execute(query)

In [45]:
# Wrap the raw query rows in a single-column frame and store it as CSV.
teams_df = pd.DataFrame(data=team_ids, columns=['team_id'])
teams_df.to_csv(
    'teams.csv',
    index=False,
)

Retrieving all users data 

In [46]:
# SQL for the user rows: account id, followed teams and mcc from bq.mobileuser.
# Only rows with a non-null account id and a non-empty `teams` value are kept,
# created and last updated on or before 2023-06-30, and restricted to the listed
# mcc values.
# NOTE(review): mcc is presumably the mobile country code — confirm what the listed codes cover.
query2 = """ SELECT user_account_id, teams, mcc
            FROM bq.mobileuser
            WHERE user_account_id IS NOT NULL
               AND teams IS NOT NULL
               AND length(teams) > 0
               AND toYYYYMMDD(created_at) <= '20230630'
               AND toYYYYMMDD(updated_at) <= '20230630'
               AND mcc IN (216, 218, 219, 220, 221, 222, 226, 232, 262, 276, 284, 293, 294,297)
            ORDER BY mcc DESC
        """

In [47]:
# Run the user query through the ClickHouse client and keep the returned rows.
user_data =  bqClient.execute(query2)

In [48]:
def _as_team_tuple(raw_teams):
    """Return the followed teams as a tuple, parsing string literals first.

    Tuples are hashable, which lets drop_duplicates() compare whole rows.
    """
    parsed_teams = ast.literal_eval(raw_teams) if isinstance(raw_teams, str) else raw_teams
    return tuple(parsed_teams)


# Build the user frame, normalise the `teams` column and drop exact duplicate rows.
user_data_df = (
    pd.DataFrame(user_data, columns=['user_account_id', 'teams', 'mcc'])
    .assign(teams=lambda frame: frame['teams'].apply(_as_team_tuple))
    .drop_duplicates()
)

Removing 47 users that had multiple MCC values (identical rows, simply having 2+ MCC values)

In [49]:
# Count the rows per account; any account that still has more than one row after
# de-duplication is removed entirely.
user_counts = user_data_df['user_account_id'].value_counts()
users_with_multiple_records = user_counts.loc[user_counts.gt(1)].index

has_multiple_records = user_data_df['user_account_id'].isin(users_with_multiple_records)
user_data_df = user_data_df.loc[~has_multiple_records]

In [50]:
# Save the de-duplicated user frame; a later cell writes to this same path again
# after the team filtering.
user_data_df.to_csv('./data/user_data.csv',index=False)

In [51]:
# Show the first row as a quick check of the column layout.
# NOTE(review): the rendered output contains a real user_account_id — consider clearing it before sharing.
user_data_df.head(1)

Unnamed: 0,user_account_id,teams,mcc
0,5bc624c559a182e430b5c8b4,"(5152, 14882, 6637)",297


Data preprocessing tasks 
- filter the teams that each user follows by retaining only those teams that are playing relevant events
- if a user does not follow any of those teams, discard the user

1st task -> getting relevant teams

In [52]:
# Keep, for every user, only the followed teams that appear in teams_df.
# The ids are collected into a set once so each membership test is a hash lookup
# instead of a linear scan over the whole team_id column per followed team.
relevant_team_ids = set(teams_df['team_id'].tolist())
user_data_df['teams'] = user_data_df['teams'].apply(
    lambda team_list: [team for team in team_list if team in relevant_team_ids]
)

In [53]:
# Discard users whose filtered team list is empty.
follows_relevant_team = user_data_df['teams'].apply(len).gt(0)
user_data_df = user_data_df.loc[follows_relevant_team]

- Grouping users by MCC and collecting the teams that the users in each group follow
- Creating a dict afterwards -> key is mcc, values are 'teams' and 'user_account_id', which hold all the information about the teams and user accounts connected to a specific group (mcc) -> for O(1) lookups

In [54]:
# One row per mcc (the groupby key is already the index):
#   user_account_id -> list of the accounts in that mcc
#   teams           -> list of the distinct teams followed by any of those accounts
# The distinct teams are gathered with a set comprehension; the previous
# sum(x, []) rebuilt the growing list for every user (quadratic in group size).
grouped_by_mcc = user_data_df.groupby('mcc').agg({
    'user_account_id': list,
    'teams': lambda team_lists: list({team for teams in team_lists for team in teams}),
})

In [55]:
# Sort both list columns. Each column is transformed on its own; the earlier
# version mutated the row object handed to DataFrame.apply(axis=1), which
# pandas documents as unsupported for user-defined functions.
grouped_by_mcc = grouped_by_mcc.assign(
    user_account_id=grouped_by_mcc['user_account_id'].apply(sorted),
    teams=grouped_by_mcc['teams'].apply(sorted),
)

In [56]:
# Convert to {mcc: {'user_account_id': [...], 'teams': [...]}} for dictionary lookups by mcc.
grouped_by_mcc_dict = grouped_by_mcc.to_dict(orient='index')

Saving dict and dataframe locally

In [57]:
# Persist the per-mcc lookup locally (pickle is already imported in the first cell).
with open('./data/grouped_by_mcc_dict.pkl', 'wb') as f:
    pickle.dump(grouped_by_mcc_dict, f)

In [58]:
# Overwrite the earlier snapshot with the filtered user frame (written once; the
# cell previously issued the identical write twice).
user_data_df.to_csv('./data/user_data.csv', index=False)


In [59]:
# {user_account_id: {'teams': [...], 'mcc': ...}} built from the filtered frame,
# then pickled locally.
users_by_account = user_data_df.set_index('user_account_id')
user_data_dict = users_by_account.to_dict(orient='index')

with open('./data/user_data_dict.pkl', 'wb') as pickle_file:
    pickle.dump(user_data_dict, pickle_file)

Retrieving events data

In [60]:
# SQL for the June 2023 Football events, one result row per week: week_start is
# toStartOfWeek(startdate, 1); event_ids and team_ids are arrays built with
# groupArray, team_ids holding (hometeam_id, awayteam_id) tuples.
# NOTE(review): mode 1 presumably makes weeks start on Monday, which would match the
# Monday dates used as keys later in this notebook — confirm.
query3 = """
        SELECT 
            toStartOfWeek(startdate, 1) AS week_start,
            groupArray(id) AS event_ids,
            groupArray((hometeam_id, awayteam_id)) AS team_ids
         FROM
            sports.event
         LEFT JOIN
            sports.sport s ON event.sport_id = toInt8(s.id)
         WHERE
            toYYYYMMDD(startdate) >= '20230601' AND toYYYYMMDD(startdate) <= '20230630' AND s.name = 'Football'
         GROUP BY 
            week_start
         ORDER BY 
            week_start
        """

In [61]:
# Run the weekly events query through the ClickHouse client and keep the returned rows.
events_data = bqClient.execute(query3)

In [62]:
# Name the three result columns and save the weekly events table as CSV.
event_columns = ['week_start', 'event_ids', 'team_ids']
events_data = pd.DataFrame(data=events_data, columns=event_columns)
events_data.to_csv(
    './data/events_data.csv',
    index=False,
)

In [63]:
# {week_start: {'event_ids': [...], 'team_ids': [...]}}, pickled locally.
events_by_week = events_data.set_index('week_start')
events_data_dict = events_by_week.to_dict(orient='index')

with open('./data/events_data_dict.pkl', 'wb') as pickle_file:
    pickle.dump(events_data_dict, pickle_file)

Recommendation system 
- recommendations should be generated for events based on the teams that each user
follows.
- we recommend an event to a user if the user follows any of the two
teams playing in that event.

In [65]:
# Empty containers that the recommendation cells fill in.
resolved_users = list()
non_resolved_users = list()
event_based_user_recommendation = dict()
user_based_user_recommendation = dict()

In [66]:
# Five consecutive week-start dates, 2023-05-29 through 2023-06-26, one week apart.
first_week_start = datetime.date(2023, 5, 29)
dates = [first_week_start + datetime.timedelta(weeks=offset) for offset in range(5)]

Mapping: for each date, map each team_id to the ids of the events it plays in

In [67]:
# One independent, initially empty {team: [event ids]} dict per week.
teams_to_event_mapping = dict((week_start, dict()) for week_start in dates)

In [68]:
# For every week, register each event id under both teams of that event.
for date in dates:
    week_events = events_data_dict[date]
    week_mapping = teams_to_event_mapping[date]
    for event_id, teams in zip(week_events['event_ids'], week_events['team_ids']):
        for team in teams:
            # setdefault creates the list the first time a team is seen
            week_mapping.setdefault(team, []).append(event_id)

Event based recommendation

In [69]:
# Recommend to each user every event in which one of their followed teams plays.
# A user counts as "resolved" only when they have at least one event in every week.
for user, user_data in user_data_dict.items():
    weekly_events = {}
    for date in dates:
        week_mapping = teams_to_event_mapping[date]
        matched_event_ids = set()
        for team in user_data['teams']:
            matched_event_ids.update(week_mapping.get(team, ()))
        if matched_event_ids:
            weekly_events[date] = list(matched_event_ids)
    event_based_user_recommendation[user] = weekly_events

    covers_every_week = all(week_start in weekly_events for week_start in dates)
    target_list = resolved_users if covers_every_week else non_resolved_users
    target_list.append(user)

In [70]:
# Bucket the resolved users by mcc so candidates can be fetched per group.
resolved_users_by_mcc = defaultdict(list)
for user in resolved_users:
    user_mcc = user_data_dict[user]['mcc']
    same_mcc_users = resolved_users_by_mcc[user_mcc]
    same_mcc_users.append(user)

User mappings based on similarity for each date

*Similarity* is defined as:
- for each user, looking at the users in their mcc that have team followings from the previous task, calculate the number of teams they both follow and assign a similarity score, then choose the top 10 most similar users and give their recommendations to the user for each period.

*Algorithm*
- for user *u*, look at the set of users *U* in the same mcc
- calculate a static similarity between the users based on the teams they follow (using Jaccard similarity, as these are sets of items (teams) without a specific continuous variable)
- for each date, filter users *U* based on the fact that they got recommendations for that date
- get the 10 most similar users in preferences and add all of their recommendations as user *u*'s recommendations to follow in the given week

- for each user, looking at the users in their mcc that have team followings from the previous task, calculate the number of teams they both follow and assign a similarity score, then choose the top 10 most similar users and give their recommendations to the user for each period.

*Algorithm*
- for user *u*, look at the set of users *U* in the same mcc
- calculate a static similarity between the users based on the teams they follow (using Jaccard similarity, as these are sets of items (teams) without a specific continuous variable)
- for each date, filter users *U* based on the fact that they got recommendations for that date
- get the 10 most similar users in preferences and add all of their recommendations as user *u*'s recommendations to follow in the given week

1st -> computing the similarities between users in the same mcc

In [71]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Adapted from
    https://www.geeksforgeeks.org/how-to-calculate-jaccard-similarity-in-python/
    (accessed 26.05.2024).

    Args:
        set1: First set of items.
        set2: Second set of items.

    Returns:
        A float in [0, 1]. Two empty sets have an empty union, for which the
        ratio is undefined; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0.0
    return len(set1.intersection(set2)) / union_size

In [72]:
# Replaces the earlier plain dict: {user: {date: [event ids]}} where missing users
# and dates are created on first access.
user_based_user_recommendation = defaultdict(lambda: defaultdict(list))

In [74]:
# Number of most-similar resolved users whose recommendations are borrowed.
TOP_SIMILAR_USERS = 10

# Build every resolved user's team set once. Doing it inside the loop below
# repeated the same conversion for every (unresolved user, resolved user) pair.
resolved_team_sets_by_mcc = {
    mcc: [(resolved_user, set(user_data_dict[resolved_user]['teams'])) for resolved_user in mcc_users]
    for mcc, mcc_users in resolved_users_by_mcc.items()
}

for user in non_resolved_users:
    user_teams = set(user_data_dict[user]['teams'])
    user_mcc = user_data_dict[user]['mcc']

    # Score every resolved user of the same mcc against this user.
    similar_users = [
        (resolved_user, jaccard_similarity(user_teams, resolved_user_teams))
        for resolved_user, resolved_user_teams in resolved_team_sets_by_mcc.get(user_mcc, [])
    ]

    # heapq.nlargest is documented as equivalent to
    # sorted(..., key=key, reverse=True)[:n] without sorting the whole list.
    # NOTE(review): candidates with similarity 0 can still be selected when fewer
    # than TOP_SIMILAR_USERS overlapping users exist — unchanged from before.
    top_similar_users = [
        resolved_user
        for resolved_user, _ in heapq.nlargest(TOP_SIMILAR_USERS, similar_users, key=lambda pair: pair[1])
    ]

    # The user's weekly recommendations are the union of the events recommended
    # to the selected similar users in that week.
    for date in dates:
        recommendations = set()
        for similar_user in top_similar_users:
            if date in event_based_user_recommendation[similar_user]:
                recommendations.update(event_based_user_recommendation[similar_user][date])
        if recommendations:
            user_based_user_recommendation[user][date] = list(recommendations)

Total execution time locally on a CPU : 5m 24.7 s

Recommendation system is done; save data locally

In [87]:
# Final container {user: {date string: [event ids]}}; missing users and dates
# are created on first access.
user_recommendation = defaultdict(lambda: defaultdict(list))

In [88]:
# Merge, per user and week, the event-based and the similarity-based event ids.
# Dates are stored as strings so the result can be written as JSON keys.
for user in user_data_dict:
    event_recs_by_week = event_based_user_recommendation.get(user, {})
    user_recs_by_week = user_based_user_recommendation.get(user, {})
    for date in dates:
        event_recs = event_recs_by_week.get(date, [])
        user_recs = user_recs_by_week.get(date, [])
        combined_recs = set(event_recs) | set(user_recs)
        if not combined_recs:
            continue
        user_recommendation[user][str(date)] = list(combined_recs)

In [89]:
# Turn the nested defaultdicts into plain dicts before serialising.
user_recommendation = {
    user_id: dict(weekly_recs)
    for user_id, weekly_recs in user_recommendation.items()
}

In [90]:
# Write the final recommendations as indented JSON.
with open('./recommendations/user_recommendation.json', 'w') as json_file:
    json.dump(
        user_recommendation,
        json_file,
        indent=4,
    )