In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import kendalltau, spearmanr
%matplotlib inline

## Task 1 - load & filter data

In [2]:
# Load the three pickled datasets. Despite the `_df` suffix, the code below uses
# them as mappings (tour_df via .items(), results_df via [tour_id]) — presumably
# plain dicts; verify against the pickles.
# Note: unpickling can execute arbitrary code, so only load files from a trusted source.
tour_df, player_df, results_df = map(
    pd.read_pickle,
    ('tournaments.pkl', 'players.pkl', 'results.pkl'),
)

In [3]:
def get_tours_for_year(year, tours_dict=tour_df, results_dict=results_df):
    """Return the ids of tournaments that started in `year` and have mask data.

    A tournament is kept when its 'dateStart' string begins with the given year,
    it has at least one team in `results_dict`, and the first listed team has a
    non-empty 'mask'. Only the first team is inspected.

    Args:
        year: year to select (int or str); compared with the first four
            characters of each tournament's 'dateStart'.
        tours_dict: mapping whose values are tournament records with
            'id' and 'dateStart' fields.
        results_dict: mapping from tournament id to the list of team results.

    Returns:
        List of tournament ids, in the iteration order of `tours_dict`.
    """
    year_prefix = str(year)
    selected = []
    for _, tour in tours_dict.items():
        if tour['dateStart'][:4] != year_prefix:
            continue
        tour_id = tour['id']
        # A tournament with no entry in the results is treated like one with an
        # empty result list instead of raising KeyError.
        teams = results_dict.get(tour_id)
        if teams is None or len(teams) == 0:
            continue
        # Covers both a missing 'mask' key and an empty / None mask.
        if teams[0].get('mask'):
            selected.append(tour_id)
    return selected

In [4]:
# Tournaments from 2019 form the training set, those from 2020 the test set.
train_tours, test_tours = (get_tours_for_year(year) for year in (2019, 2020))
print(len(train_tours), len(test_tours))

674 173


In [5]:
# Named tournament-id lists, iterated by the evaluation cells below.
datasets = dict(train=train_tours, test=test_tours)

## Tasks 2 & 3 - Simple baseline model and predictions

In [6]:
def get_array_from_mask(mask):
    """Convert a mask string into an int8 array of its digit characters.

    Characters that are not digits are dropped. Returns None when `mask`
    is empty or None.
    """
    if not mask:
        return None
    digits = [int(ch) for ch in mask if ch.isdigit()]
    return np.array(digits, dtype=np.int8)

In [7]:
# Accumulate per-player totals over the training tournaments.
# player_ratings maps player id -> [sum of mask digits, number of mask digits],
# where every member of a team is credited with the whole team mask.
player_ratings = {}
for tour_id in train_tours:
    for team in results_df[tour_id]:
        # Skip teams without a usable mask (missing key, None or empty string):
        # get_array_from_mask returns None for an empty mask.
        if not team.get('mask'):
            continue
        mask_arr = get_array_from_mask(team['mask'])
        played = len(mask_arr)
        if played == 0:
            # Mask had no digit characters; storing [0, 0] would later cause
            # a division by zero when the rating is computed.
            continue
        # mask_arr is int8. Builtin sum() adds int8 scalars, which stay int8
        # under NumPy 2 promotion rules and can overflow in the accumulator.
        # ndarray.sum() uses a wide accumulator; int() yields a Python int.
        answered = int(mask_arr.sum())
        for member in team['teamMembers']:
            totals = player_ratings.setdefault(member['player']['id'], [0, 0])
            totals[0] += answered
            totals[1] += played

In [8]:
# (player id, rating) pairs, where rating = first total / second total of
# player_ratings, ordered from the highest rating to the lowest.
float_ratings = [(p_id, totals[0] / totals[1]) for p_id, totals in player_ratings.items()]
float_ratings.sort(key=lambda pair: pair[1], reverse=True)

In [9]:
# Fallback rating for players that have no entry in player_ratings:
# the median of all computed player ratings.
DEFAULT_RATING = np.median([rating for _p_id, rating in float_ratings])
print(DEFAULT_RATING)

0.3356692464838532


In [10]:
def get_real_ranking(tour):
    """Extract team rosters and team positions from one tournament.

    Args:
        tour: iterable of team records, each with a 'position' field and a
            'teamMembers' list whose items carry the id under ['player']['id'].

    Returns:
        (members, ranks): `members` is a list with one list of player ids per
        team, `ranks` is the list of the teams' 'position' values, both in the
        order the teams appear in `tour`.
    """
    per_team = [
        (team['position'], [member['player']['id'] for member in team['teamMembers']])
        for team in tour
    ]
    ranks = [position for position, _ in per_team]
    members = [roster for _, roster in per_team]
    return members, ranks

In [14]:
def calc_team_rating(members, player_ratings=player_ratings, mode='average'):
    """Calculate a team rating from the ratings of its players.

    A player's rating is player_ratings[id][0] / player_ratings[id][1]; players
    without an entry (or with a zero denominator) get DEFAULT_RATING.

    Args:
        members: sized collection of player ids.
        player_ratings: mapping player id -> two-element sequence of totals.
        mode: 'average' - mean of the player ratings;
              'linear'  - r1 + 1/2 * r2 + ... + 1/n * rn, with the player
              ratings sorted from highest (r1) to lowest (rn).

    Returns:
        The team rating; 0 for an empty team.

    Raises:
        ValueError: if `mode` is neither 'average' nor 'linear' (for a
            non-empty team).
    """
    if len(members) == 0:
        return 0
    if mode not in ('average', 'linear'):
        raise ValueError(f"unknown mode {mode!r}; expected 'average' or 'linear'")

    ratings = []
    for m in members:
        try:
            ratings.append(player_ratings[m][0] / player_ratings[m][1])
        except (KeyError, ZeroDivisionError):
            # Unknown player, or a player with no counted mask digits.
            ratings.append(DEFAULT_RATING)

    total = 0
    if mode == 'average':
        for r in ratings:
            total += r
        return total / len(members)

    # mode == 'linear': the strongest player gets weight 1, the next 1/2, ...
    for place, r in enumerate(sorted(ratings, reverse=True), start=1):
        total += r / place
    return total

In [12]:
# Correlations with simple predictions and average team rating.
# For every tournament, compare the recorded team positions with the predicted
# team ratings, then average the rank correlations over each dataset.
for name, tours in datasets.items():
    spearmans = []
    kendalls = []
    for tour in tours:
        members, ranks = get_real_ranking(results_df[tour])
        predicted_ratings = [calc_team_rating(team, mode='average') for team in members]
        sc = spearmanr(ranks, predicted_ratings)[0]
        kc = kendalltau(ranks, predicted_ratings)[0]
        # Skip tournaments where a correlation is undefined (NaN). np.isnan is
        # required: an identity check against np.nan misses NaN values that are
        # not the np.nan object itself, and one NaN would turn the mean into NaN.
        if np.isnan(sc) or np.isnan(kc):
            continue
        spearmans.append(sc)
        kendalls.append(kc)
    print(f'Dataset {name}: Avg. Spearman: {np.mean(spearmans)}, Avg. Kendall: {np.mean(kendalls)}')



Dataset train: Avg. Spearman: -0.7937356425192034, Avg. Kendall: -0.6406845079670468
Dataset test: Avg. Spearman: -0.6984761076492578, Avg. Kendall: -0.5408720440451393


In [13]:
# Correlations with simple predictions and linear team rating.
# For every tournament, compare the recorded team positions with the predicted
# team ratings, then average the rank correlations over each dataset.
# NOTE(review): the saved output of this cell has the opposite sign to the saved
# output of the 'average' cell, and the cell defining calc_team_rating carries a
# later execution count — the saved output may be stale; re-run to confirm.
for name, tours in datasets.items():
    spearmans = []
    kendalls = []
    for tour in tours:
        members, ranks = get_real_ranking(results_df[tour])
        predicted_ratings = [calc_team_rating(team, mode='linear') for team in members]
        sc = spearmanr(ranks, predicted_ratings)[0]
        kc = kendalltau(ranks, predicted_ratings)[0]
        # Skip tournaments where a correlation is undefined (NaN). np.isnan is
        # required: an identity check against np.nan misses NaN values that are
        # not the np.nan object itself, and one NaN would turn the mean into NaN.
        if np.isnan(sc) or np.isnan(kc):
            continue
        spearmans.append(sc)
        kendalls.append(kc)
    print(f'Dataset {name}: Avg. Spearman: {np.mean(spearmans)}, Avg. Kendall: {np.mean(kendalls)}')

Dataset train: Avg. Spearman: 0.7873797452065571, Avg. Kendall: 0.6328318764764276
Dataset test: Avg. Spearman: 0.7028479154659221, Avg. Kendall: 0.5463412217059841
