In [None]:
import numpy as np
import pandas as pd

# Process data

This notebook processes the raw ranking data: it groups the rows by player and computes per-player aggregate statistics (average, best, and worst rank; number of ranked appearances; earliest and latest year).

Input files (created by `01-scrape.ipynb`):

* `female_raw.csv`
* `male_raw.csv`

Output files:

* `female_processed.csv`
* `male_processed.csv`

In [None]:
def create_player_statistics(df):
    """
    Build a per-player table of aggregate ranking statistics.

    Rows of df are grouped by the "player" column and each group is reduced
    to one row of summary statistics. df itself is not modified.

    Args:
        df
            dataframe containing raw data with columns: rank, year, and player
    Returns:
        players_df
            dataframe indexed by player, sorted by ascending average_rank,
            with columns:
                average_rank    mean of rank
                years_in_top10  number of non-null rank entries
                best_rank       minimum (numerically lowest) rank
                worst_rank      maximum (numerically highest) rank
                earliest_year   minimum year
                latest_year     maximum year
    """
    # Named aggregation binds every output column to its source column and
    # reducer explicitly, so the labels cannot drift out of step with the
    # aggregation spec the way a positional `columns = [...]` assignment can.
    # String reducer names are used instead of np.mean/np.min/np.max because
    # passing NumPy callables to agg is deprecated in recent pandas releases.
    players = df.groupby("player").agg(
        average_rank=("rank", "mean"),
        years_in_top10=("rank", "count"),
        best_rank=("rank", "min"),
        worst_rank=("rank", "max"),
        earliest_year=("year", "min"),
        latest_year=("year", "max"),
    )

    # Return a new sorted frame rather than sorting in place.
    return players.sort_values(by="average_rank")

In [None]:
# Load each raw ranking file (the first CSV column is used as the row index)
# and reduce it to one row of aggregate statistics per player.
female_raw = pd.read_csv('female_raw.csv', index_col=0)
players_f = create_player_statistics(female_raw)

male_raw = pd.read_csv('male_raw.csv', index_col=0)
players_m = create_player_statistics(male_raw)

In [None]:
# Write the per-player statistics tables to the processed CSV files.
for players, out_path in (
    (players_m, 'male_processed.csv'),
    (players_f, 'female_processed.csv'),
):
    players.to_csv(out_path)