In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

import os
import re
from tqdm.notebook import tqdm

# Summary
In this notebook I carry out EDA, cleaning and processing of data.

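* `tournaments_male.csv`: loaded and inspected, but it is not needed for this project, so it is not processed further.
* `matches_male.csv`:
    * Dropped the columns `tournament_index` and `round`
    * Parsed the `result` column into the new columns `games_winner` and `games_loser`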

### Load data

In [None]:
# Data directories. Both end with a trailing slash so that a file name can be
# appended by plain string concatenation (e.g. dir_raw + "matches_male.csv").
dir_raw = "../data/raw/"
dir_processed = "../data/processed/"

In [None]:
# list the entries present in the raw data directory
os.listdir(dir_raw)

In [None]:
# read both raw CSV files, using the first column of each file as the index
matches = pd.read_csv(os.path.join(dir_raw, "matches_male.csv"), index_col=0)
tournaments = pd.read_csv(os.path.join(dir_raw, "tournaments_male.csv"), index_col=0)

In [None]:
# peek at the first rows of the matches data (head() shows 5 rows by default)
matches.head()

In [None]:
# peek at the first rows of the tournaments data (head() shows 5 rows by default)
tournaments.head()

The tournaments dataframe is not needed for the aim of this project, so I will not explore it further.

### `tournament_index` and `round`

In [None]:
# drop tournament index as we do not need tournament information for elo.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising a KeyError.
matches = matches.drop(columns=["tournament_index"], errors="ignore")

In [None]:
# frequency of each distinct value in the `round` column
matches["round"].value_counts()

In [None]:
# drop the `round` column. errors="ignore" keeps this cell idempotent:
# re-running it once the column is gone is a no-op instead of a KeyError.
matches = matches.drop(columns=["round"], errors="ignore")

The value counts show that the `round` column is a little dirty. However, like `tournament_index`, it is not needed for our purposes, so I decided to drop it.

### `result`: extracting information from the `result` column

The next few functions took several iterations and some experimentation to create. I did not record the process by which I incrementally improved these functions.

In [None]:
def determine_result_type(result: str) -> "str | None":
    """
    Classify a raw entry of the 'result' column.

    Parameters
    ----------
    result : str
        Raw result text, e.g. '11-3, 4-11, 11-9 (44m)', '3/1 (15m)'
        or a text containing 'bye', 'w/o', 'ret' or 'unknown'.
        Non-string values (such as the NaN that pandas uses for an
        empty cell) are accepted and classified as 'unknown'.

    Returns
    -------
    str or None
        One of 'bye', 'wo', 'ret', 'unknown', 'other', '3/', '2/' or
        'points'. If no rule matches, the result is printed so that
        it can be inspected and None is returned.
    """
    # a missing value cannot be searched for substrings
    if not isinstance(result, str):
        return "unknown"

    # substring markers, checked in this priority order
    substring_markers = (
        ("bye", "bye"),
        ("w/o", "wo"),
        ("ret", "ret"),
        ("unknown", "unknown"),
    )
    for marker, result_type in substring_markers:
        if marker in result:
            return result_type

    # one-off free-text results that carry no score
    if result in (
        "No shows",
        "Final not played due to unsafe court conditions",
    ):
        return "other"

    if "3/" in result:
        # then result is of the form "3/i" or "3/i (15m)" where i=0,1 or 2
        return "3/"

    if "2/" in result:
        # then result is of the form "2/i" or "2/i (15m)" where i=0,1
        return "2/"

    # pattern should match '11-3, 4-11, 11-9 (44m)'
    pat = r"(?P<points>[\d, -]+\d)(?: \((?P<time>\d+)m\))?"
    if re.match(pat, result):
        return "points"

    # none of the above conditions were met: show the result so that
    # the rules can be extended to cover it
    print(result)
    return None

In [None]:
# manual testing of determine_result_type.
# Every entry of the result column is classified once. If nothing
# is printed, the presumption is that the function is working.
for result in matches["result"].tolist():
    determine_result_type(result)

In [None]:
def determine_game_score_from_points(points: str) -> "tuple[int, int]":
    """
    Determine the score in games of a match.

    Parameters
    ----------
    points : str
        The points in each game of a match. Should be something
        like '11-8, 7-11, 11-9, 12-10'
        * points for each game separated by ',' (whitespace around
        the numbers is ignored, so ', ' works as well)
        * points of players in a single game separated by '-'
        * in each game, the first number is the points of the
        winner of the match

    Returns
    -------
    n_games_won_by_winner : int
        The number of games won by the winner of the match

    n_games_won_by_loser : int
        The number of games won by the loser of the match

    Raises
    ------
    ValueError
        If a game is not made of exactly two integers separated
        by '-'.
    """
    n_games_won_by_winner = 0
    n_games_won_by_loser = 0

    # Split on the bare comma rather than ', ': int() ignores surrounding
    # whitespace, so both '11-8, 7-11' and '11-8,7-11' are parsed.
    for game in points.split(","):
        # convention is that the first number in the pair is the
        # points of the winner of the match
        points_winner, points_loser = (int(p) for p in game.split("-"))

        if points_winner > points_loser:
            n_games_won_by_winner += 1
        elif points_loser > points_winner:
            n_games_won_by_loser += 1
        # a game with equal points is counted for neither player

    return n_games_won_by_winner, n_games_won_by_loser

In [None]:
# manual testing of determine_game_score_from_points.
# Expected output: (3, 1) followed by (2, 1)
for sample_points in ("11-8, 7-11, 11-9, 12-10", "1-11, 11-3, 11-9"):
    print(determine_game_score_from_points(sample_points))

In [None]:
def process_results_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process raw results column to get games won by winner and loser

    Parameters
    ----------
    df : pd.DataFrame
        dataframe containing a column 'result'

    Returns
    -------
    pd.DataFrame
        New dataframe that is copy of the original but with two
        additional columns, 'games_winner' and 'games_loser', for the
        number of games won by the winner and loser respectively.
        Rows whose result carries no game score (bye, walkover,
        retirement, unknown, other) get NaN in both columns. A row
        whose result type is not recognised is printed and also gets
        NaN, so the new columns always have one value per row.
    """
    df_copy = df.copy()

    # result types that are known to carry no game score
    types_without_score = ('bye', 'wo', 'ret', 'unknown', 'other')

    # we presume a 'points' result is of standard form
    # "11-5, 11-9, 12-10 ..."
    pat = r"(?P<points>[\d, -]+\d)(?: \((?P<time>\d+)m\))?"

    games_winner = []
    games_loser = []

    for result in tqdm(df_copy['result'].to_list()):
        result_type = determine_result_type(result)

        # default for every row without a usable game score
        n_won, n_lost = np.nan, np.nan

        if result_type in ('3/', '2/'):
            # then result is of the form "3/i ..." or "2/i ..."
            n_won, n_lost = int(result[0]), int(result[2])

        elif result_type == 'points':
            match = re.match(pat, result)
            n_won, n_lost = determine_game_score_from_points(match.group('points'))

        elif result_type not in types_without_score:
            # if functions created properly, this should never run.
            # The row still receives NaN below; skipping the append here
            # would make the lists shorter than the dataframe and the
            # column assignment would fail with a length mismatch.
            print(result)
            print(result_type)

        games_winner.append(n_won)
        games_loser.append(n_lost)

    df_copy['games_winner'] = games_winner
    df_copy['games_loser'] = games_loser

    return df_copy

In [None]:
# build a copy of `matches` with the games_winner / games_loser columns added
matches_new = process_results_column(matches)
# display the processed dataframe
matches_new