In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

import os
import re
from tqdm.notebook import tqdm

https://stackoverflow.com/questions/574730/python-how-to-ignore-an-exception-and-proceed

In [None]:
# Input and output directories. File paths are built from these by plain string
# concatenation, so the trailing slash is required.
dir_raw = 'data_raw/'
dir_preprocessed = 'data_preprocessed/'

In [None]:
# Load the raw match table; the first CSV column becomes the index.
# os.path.join keeps the path valid even if dir_raw is given without a trailing slash.
df_raw = pd.read_csv(os.path.join(dir_raw, 'matches_male.csv'), index_col=0)

In [None]:
# Preview the first 20 raw rows
df_raw.head(20)

# check that every row is either a played match ('a bt b') or a bye

In [None]:
# Boolean masks over df_raw.
# A played match is recognised by the literal ' bt ' separator (the one the players
# column is later split on), so a name that merely contains the letters "bt" does not
# count. na=False makes a missing `players` value count as "not a match" instead of
# putting NaN into the mask.
bt_indices = df_raw.players.str.contains(' bt ', regex=False, na=False)
bye_indices = df_raw.result.eq('bye')

In [None]:
# Rows that are neither a played match nor a bye
df_raw.loc[~bt_indices & ~bye_indices]

In [None]:
# Rows flagged as both a played match and a bye
df_raw.loc[bt_indices & bye_indices]

# select subset where match is actually played, rather than bye

In [None]:
def remove_bye_misc(df):
    """
    Keep only the rows that describe a played match.

    A played match is recognised by the literal ' bt ' separator in the
    `players` column, which is the separator `split_players` splits on.
    Rows without it (byes, other entries, missing `players` values) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `players`.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows; `df` itself is not modified.
    """
    # regex=False: ' bt ' is a literal, not a pattern.
    # na=False: a missing value yields False rather than NaN, which .loc would reject.
    is_match = df.players.str.contains(' bt ', regex=False, na=False)
    # Filter first, then copy only the kept rows.
    return df.loc[is_match].copy()

In [None]:
# Keep only the rows for matches that were actually played
df = df_raw.pipe(remove_bye_misc)

# sort it so it is in chronological order, rather than reverse chronological

In [None]:
# Reverse the index order (descending index)
df = df.sort_index(ascending=False)

# extract player information

In [None]:
def split_players(df):
    """
    Split the `players` column on ' bt ' into winner and loser.

    Returns a new frame in which `players` holds the list of split parts,
    `winner` holds the first part and `loser` the second. The input frame
    is left untouched.
    """
    sides = df.players.str.split(pat = ' bt ')
    return df.assign(
        players=sides,
        winner=sides.map(lambda pair: pair[0]),
        loser=sides.map(lambda pair: pair[1]),
    )

In [None]:
# Add the winner / loser columns
df = df.pipe(split_players)

In [None]:
def _parse_player_entries(entries, pattern):
    """Parse each player string into parallel lists (seeds, names, countries).

    An entry that does not match `pattern` contributes np.nan to all three
    lists. An entry that matches but has no seed contributes None as its seed,
    because the seed group is optional.
    """
    seeds, names, countries = [], [], []
    for entry in tqdm(entries):
        match = pattern.search(entry)
        if match is None:
            seeds.append(np.nan)
            names.append(np.nan)
            countries.append(np.nan)
        else:
            seeds.append(match.group('seed'))
            names.append(match.group('name'))
            countries.append(match.group('country'))
    return seeds, names, countries


def extract_player_information(df):
    """
    Extract seed, name and country from the `winner` and `loser` columns.

    Each entry is expected to look like '[seed] First Last (CCC)', where the
    '[seed] ' prefix is optional and CCC is a three-letter upper-case code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns `winner` and `loser`.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with six added columns, in this order: winner_seed,
        winner_name, winner_country, loser_seed, loser_name, loser_country.
        Entries that cannot be parsed get NaN in all three of their columns.
    """
    df_new = df.copy()

    pat_seed = r'(\[(?P<seed>\S+)\] )?'
    # NOTE(review): the name must be at least two space-separated runs of \w
    # characters, and the search is unanchored, so a name containing e.g. a
    # hyphen or apostrophe may only be captured from a later word onward.
    # Confirm against the data.
    pat_name = r'(?P<name>(?:\w+\b )+\w+\b)'
    pat_country = r' \((?P<country>[A-Z]{3,3})\)'
    pattern = re.compile(pat_seed + pat_name + pat_country)

    # Winner and loser are parsed identically; only the column prefix differs.
    for role in ('winner', 'loser'):
        seeds, names, countries = _parse_player_entries(df_new[role].to_list(), pattern)
        df_new[f'{role}_seed'] = seeds
        df_new[f'{role}_name'] = names
        df_new[f'{role}_country'] = countries

    return df_new

In [None]:
# Add seed / name / country columns for winner and loser
df = df.pipe(extract_player_information)

# manual changes

In [None]:
# Manual correction: record 'Zahed Mohamed' under the name 'Zahed Salem'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.winner_name. That form is a chained in-place
# update, which pandas warns about and which leaves df unchanged under Copy-on-Write.
df['winner_name'] = df['winner_name'].replace('Zahed Mohamed', 'Zahed Salem')
df['loser_name'] = df['loser_name'].replace('Zahed Mohamed', 'Zahed Salem')

# drop unnecessary columns

In [None]:
# The raw player strings are no longer needed once seed / name / country are extracted
df = df.drop(['players', 'winner', 'loser'], axis='columns')

In [None]:
# Preview the first rows of the frame
df.head()

# process the 'result' column, to get the score of each match in games

In [None]:
def determine_game_score(scores):
    """
    Count the games won by each side.

    input is of the form '11-8, 7-11, 11-9, 12-10'
    (the first number of every game is the match winner's points)
    output is number of games the winner won and number of games loser won;
    a game with equal points is counted for neither side
    """
    games = [tuple(map(int, game.split('-'))) for game in scores.split(', ')]
    winner_nGames = sum(1 for game in games if game[0] > game[1])
    loser_nGames = sum(1 for game in games if game[1] > game[0])
    return winner_nGames, loser_nGames

In [None]:
# Sanity check of the score pattern (the same pattern is used by process_result_column)
pat = r'(?P<scores>[\d, -]+\d)(?: \((?P<time>\d+)m\))?'
test = '6-9, 9-1, 6-9, 10-8, 9-6'

match = re.match(pat, test)
scores, time = match.group('scores', 'time')
w, l = determine_game_score(scores)

# Prints one value per line: the score string, games won, games lost, duration
print(scores, w, l, time, sep='\n')

In [None]:
# Score list such as '11-8, 7-11, 11-9', optionally followed by ' (45m)'.
_SCORES_PAT = re.compile(r'(?P<scores>[\d, -]+\d)(?: \((?P<time>\d+)m\))?')
# Games-only summary such as '3/1' or '2/0', optionally followed by ' (15m)'.
_GAMES_ONLY_PAT = re.compile(r'(?P<won>[23])/(?P<lost>\d)(?: \((?P<time>\d+)m\))?')
# Values recorded for a result that cannot be parsed at all.
_UNPARSED = (np.nan, np.nan, np.nan, np.nan)


def _parse_result(result):
    """Return (label, n_games, best_of, time) for one raw result string.

    Raises AttributeError, TypeError, ValueError or IndexError when the
    string fits none of the known formats.
    """
    # Walkovers, retirements and unknown results carry no usable game count.
    for marker, label in (('w/o', 'wo'), ('ret', 'ret'), ('unknown', 'unknown')):
        if marker in result:
            return label, np.nan, np.nan, np.nan

    # "3/i" or "2/i", optionally with a duration: only the game tally is known.
    games_only = _GAMES_ONLY_PAT.match(result)
    if games_only is not None:
        won = int(games_only.group('won'))
        lost = int(games_only.group('lost'))
        time = games_only.group('time')
        # Winning 3 games means best of 5, winning 2 means best of 3.
        return f'{won}/{lost}', won + lost, 2 * won - 1, np.nan if time is None else time

    # Full per-game scores; .group() on a failed match raises AttributeError.
    match = _SCORES_PAT.match(result)
    w, l = determine_game_score(match.group('scores'))
    return f'{w}/{l}', w + l, w * 2 - 1, match.group('time')


def process_result_column(df):
    """
    Derive match-level score columns from the raw `result` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `result` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (the input is not modified) with four added columns:
        results_processed : 'wo', 'ret', 'unknown' or '<won>/<lost>' in games
        nGames            : total number of games played, NaN when not known
        best_of           : 2 * (games won by the winner) - 1, NaN when not known
        times             : the digits of a trailing '(<n>m)' as a string, when present

    A result that cannot be parsed is printed and gets NaN in all four
    columns, so the new columns always have one entry per row.
    """
    df_new = df.copy()

    rows = []
    for result in tqdm(df_new.result.to_list()):
        try:
            rows.append(_parse_result(result))
        except (AttributeError, TypeError, ValueError, IndexError):
            # Report the offending value but keep the row, so the column
            # lengths still match the frame.
            print(result)
            rows.append(_UNPARSED)

    df_new['results_processed'] = [row[0] for row in rows]
    df_new['nGames'] = [row[1] for row in rows]
    df_new['best_of'] = [row[2] for row in rows]
    df_new['times'] = [row[3] for row in rows]

    return df_new

In [None]:
# Add results_processed / nGames / best_of / times
df = df.pipe(process_result_column)

In [None]:
# Preview the first rows of the frame
df.head()

In [None]:
# Frequency of each value in nGames (value_counts leaves out NaN by default)
df.nGames.value_counts()

In [None]:
# Frequency of each processed result label
df.results_processed.value_counts()

# save frame

In [None]:
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(dir_preprocessed, exist_ok=True)
df.to_csv(os.path.join(dir_preprocessed, 'matches_male.csv'))