In [24]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

import os
import re
from tqdm.notebook import tqdm

# Summary
In this notebook I carry out EDA, cleaning and processing of data.

* `tournaments_male.csv`: decided this is not useful for the project, so nothing was done with it.
* `matches_male.csv`:
    * Dropped the columns `tournament_index` and `round`
    * Dropped the rows where `result` is `bye`

### Load data

In [25]:
# Directory locations, relative to this notebook. Both end with a path
# separator because file paths are built by plain string concatenation
# (e.g. dir_raw + 'matches_male.csv'); without the trailing '/', a
# concatenated file name would be glued onto the directory name.
dir_raw = '../data/raw/'
dir_processed = '../data/processed/'

In [26]:
# List the raw-data directory to confirm which input files are available.
os.listdir(dir_raw)

['.gitkeep', 'tournaments_male.csv', '.ipynb_checkpoints', 'matches_male.csv']

In [27]:
# Load both raw CSV files. index_col=0 makes the first CSV column the row
# index instead of a regular data column.
# NOTE: the paths are built by string concatenation, so dir_raw must end
# with a path separator.
matches = pd.read_csv(dir_raw+'matches_male.csv', index_col = 0)
tournaments = pd.read_csv(dir_raw+'tournaments_male.csv', index_col = 0)

In [28]:
# Preview the first five rows of the matches table.
matches.head(5)

Unnamed: 0,tournament_index,round,players,result
0,0,Quarter-finals,[1] Tayyab Aslam (PAK) bt Farhan Hashmi (PAK),"7-11, 11-9, 11-5, 11-5 (32m)"
1,0,Quarter-finals,[7] Israr Ahmed (PAK) bt [9/16] Waqas Mehboob ...,"11-3, 11-3, 11-8 (23m)"
2,0,Quarter-finals,[4] Amaad Fareed (PAK) bt [5] Farhan Zaman (PAK),"11-8, 11-7, 12-10 (25m)"
3,0,Quarter-finals,[9/16] Nasir Iqbal (PAK) bt [2] Asim Khan (PAK),"11-9, 11-2, 7-11, 11-5 (42m)"
4,0,2nd round,[1] Tayyab Aslam (PAK) bt [9/16] Noor Zaman (PAK),"11-1, 11-9, 11-9 (25m)"


In [29]:
# Preview the first five rows of the tournaments table.
tournaments.head(5)

Unnamed: 0,index,tournament_type,name,location,date,url,matches_downloaded
0,0,Challenger Tour 10,Pakistan International Championship (M),"Islamabad, Pakistan",11 Dec 2020,http://www.squashinfo.com/events/713-mens-paki...,True
1,1,Challenger Tour 10,Sihltal Classic (M),"Langnau am Albis, Switzerland",6 Dec 2020,http://www.squashinfo.com/events/8603-mens-sih...,True
2,2,Challenger Tour 10,BISL International (M),"Quetta, Pakistan",28 Nov 2020,http://www.squashinfo.com/events/8594-mens-bis...,True
3,3,Challenger Tour 10,Liechtenstein Open (M),"Vaduz, Liechtenstein",22 Nov 2020,http://www.squashinfo.com/events/8587-mens-lie...,True
4,4,Challenger Tour 5,Austrian Open (M),"Salzburg, Austria",8 Nov 2020,http://www.squashinfo.com/events/8586-mens-aus...,True


For the aims of this project, the tournaments dataframe is not necessary, so I will not explore it further.

### `tournament_index` and `round`

In [30]:
# Drop tournament_index, as we do not need tournament information for Elo.
# drop() returns a new frame, which is rebound to `matches`; re-running this
# cell without reloading the data raises a KeyError because the column is
# already gone.
matches = matches.drop(columns=['tournament_index'])

In [31]:
# Count how many matches carry each distinct round label.
matches['round'].value_counts()

1st round                  23348
1st qualifying round       13471
Quarter-finals              9244
Qualifying finals           8615
2nd round                   4779
Semi-finals                 4676
Final                       2757
2nd qualifying round         319
3rd round                    297
1st preliminary round        111
1st pool round                61
2nd pool round                60
3rd pool round                60
Final preliminary round       58
2nd preliminary round         54
3rd qualifying round          44
Third place play-off          20
3rd preliminary round         16
4th qualifying round          16
4th round                     16
Fifth place play-off           3
Seventh place play-off         3
4th preliminary round          2
Final: (26 Mar 2010)           1
5th preliminary round          1
Final: (10 Sep 2007)           1
Name: round, dtype: int64

In [32]:
# Drop the round column: its labels are inconsistent (see the counts above)
# and the notebook does not consider it useful.
matches = matches.drop(columns=['round'])

We can see that the `round` column is a little bit dirty (for example, `Final: (26 Mar 2010)` has a date embedded in the label). However, like `tournament_index`, it is not useful for us, so I decided to drop it.

### `result`, dropping bye

A manual skim through some values showed that the `result` column sometimes just equals "bye".

In [15]:
# Despite its name, bye_indices is a boolean mask (one True/False per row),
# True where the result is exactly the string 'bye'.
bye_indices = (matches.result == 'bye')
# Display the rows flagged as byes.
matches[bye_indices]

Unnamed: 0,players,result
12,[1] Tayyab Aslam (PAK),bye
15,[8] Zahir Shah (PAK),bye
16,[7] Israr Ahmed (PAK),bye
19,[3] Farhan Mehboob (PAK),bye
20,[4] Amaad Fareed (PAK),bye
...,...,...
67901,Mark Maclean (SCO),bye
67902,Nasser Zahran (EGY),bye
67903,Mohammed Awad (EGY),bye
67904,Jansher Khan (PAK),bye


Looking at these values, a "bye" occurs when a player is not paired with an opponent and so automatically advances to the next round. No match was played, so we can drop the rows with 'bye'.

In [17]:
# Keep only the rows that are not byes (~ inverts the boolean mask).
matches = matches[~bye_indices]

### `result`, extracting information from the result column

The next few functions took several iterations and experimentation to create. I did not record the process by which I incrementally improved the function.

In [33]:
def determine_game_score(points: str) -> (int, int):
    """
    Count the games taken by each player from a match's per-game points.

    Parameters
    ----------
    points : str
        Per-game points of one match, e.g. '11-8, 7-11, 11-9, 12-10'.
        * games are delimited by ', '
        * the two players' points within a game are delimited by '-'
        * the first number of every game belongs to the player who
        won the match

    Returns
    -------
    n_games_won_by_winner : int
        Number of games in which the match winner scored strictly
        more points than the match loser.

    n_games_won_by_loser : int
        Number of games in which the match loser scored strictly
        more points than the match winner. A game with equal points
        is counted for neither player.
    """
    # turn 'a-b, c-d' into [[a, b], [c, d]]
    games = []
    for game in points.split(', '):
        games.append([int(token) for token in game.split('-')])

    # the first entry of each pair is the match winner's points, so a
    # larger first entry means the match winner took that game.
    winner_games = sum(1 for first, second in games if first > second)
    loser_games = sum(1 for first, second in games if second > first)

    return winner_games, loser_games

In [34]:
# manual testing of determine_game_score
# Expected output: (3, 1) for the first call and (2, 1) for the second.
print(determine_game_score('11-8, 7-11, 11-9, 12-10'))
print(determine_game_score('1-11, 11-3, 11-9'))

(3, 1)
(2, 1)
