In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

# Scrape data 

In [None]:
# Wikipedia pages holding the official squash world rankings.
# Order matters to later cells: index 0 is the women's page, index 1 the men's.
urls = [
    "https://en.wikipedia.org/wiki/Official_Women%27s_Squash_World_Ranking",
    "https://en.wikipedia.org/wiki/Official_Men%27s_Squash_World_Ranking",
]

In [None]:
def is_not_numeric(s):
    """
    tell whether `s` fails to parse as a float

    returns bool
        True when float(s) raises ValueError
        False when the conversion succeeds
    """
    try:
        float(s)
        return False
    except ValueError:
        # only ValueError is treated as "not a number"; anything else propagates
        return True


def table_to_pandas(table, top_n=10):
    """
    convert one ranking table into a dataframe of player names

    go to the urls to see what the tables look like. pointers
    * the first cell of every row is skipped (hence `[1:]` in a couple of
      places) as that first column is just the ranks
    * cells whose text parses as a number are dropped via is_not_numeric,
      which is how the ranking points are ignored
    * top row of table has years

    table
        parsed table element exposing `find_all` (e.g. a BeautifulSoup tag)
    top_n
        number of ranked rows the table is expected to hold. defaults to 10,
        the value that used to be hard-coded; a table with a different number
        of data rows makes the DataFrame constructor fail

    returns pd.dataframe
        index is from 1 to top_n
        column names are the years
        entries are player names
    """
    rows = table.find_all("tr")
    headers = [col.text.replace("\n", "") for col in rows[0].find_all("th")[1:]]

    data = []
    for row in rows[1:]:
        # clean every cell once, then keep only the non-numeric ones
        cells = [col.text.replace("\n", "") for col in row.find_all("td")[1:]]
        data.append([text for text in cells if is_not_numeric(text)])

    return pd.DataFrame(data, columns=headers, index=range(1, top_n + 1))


def url_to_pandas(url, timeout=30):
    """
    given url, produce dataframe

    downloads the page, keeps only the html between the
    `id="Year_end_world_top_10_players` and `id="Year-end_number_1` markers,
    converts every table in that slice except the first with table_to_pandas,
    joins them side by side and stacks the result into long form

    url
        address of the ranking page to scrape
    timeout
        seconds requests may wait on the server, so that a stalled
        connection cannot hang the notebook

    returns pd.dataframe
        columns are rank, year, player

    raises
        requests.HTTPError if the server answers with an error status
        ValueError if either marker is missing from the page, or if the
        section between them holds no ranking tables
    """
    response = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    html = response.text

    start = html.find('id="Year_end_world_top_10_players')
    # look for the closing marker only after the opening one
    end = html.find('id="Year-end_number_1', start) if start != -1 else -1
    if start == -1 or end == -1:
        # str.find reports "not found" as -1, which would otherwise be used
        # as a (negative) slice bound and silently select the wrong html
        raise ValueError(
            "could not locate the year-end top 10 section in {}".format(url)
        )

    tables = BeautifulSoup(html[start:end], "html.parser").find_all("table")

    # tables[1:] because first table does not fit the pattern of the other tables
    # go to the urls to see
    ranking_tables = tables[1:]
    if not ranking_tables:
        raise ValueError("no ranking tables found in {}".format(url))
    df = pd.concat([table_to_pandas(t) for t in ranking_tables], axis=1)

    # df.stack() creates new frame with multiindex consisting of old
    # index and old columns. so df_stack multiindex would be [rank, year] and have
    # single feature column of player names
    df_stack = df.stack().reset_index()
    df_stack.columns = ["rank", "year", "player"]

    return df_stack

In [None]:
# Scrape both ranking pages into long-form frames (columns: rank, year, player).
# urls[1] is the men's page and urls[0] is the women's page.
df_m = url_to_pandas(urls[1])
df_f = url_to_pandas(urls[0])

In [None]:
# Persist the scraped frames to disk. to_csv is called with its defaults, so the
# row index is written as the first column of each file.
df_m.to_csv('male_raw.csv')
df_f.to_csv('female_raw.csv')

# Process data

In [None]:
def player_summaries(df):
    """
    aggregate the long-form ranking frame into one row per player

    df
        frame with columns rank, year and player

    returns pd.dataframe
        index is the player, sorted by ascending average_rank
        columns are average_rank, years_in_top10, best_rank, worst_rank,
        earliest_year and latest_year
    """
    # named aggregation with string function names: every output column is
    # labelled explicitly instead of being renamed by position, and it avoids
    # the FutureWarning pandas >= 2.1 emits when numpy callables such as
    # np.mean are handed to agg
    players = df.groupby("player").agg(
        average_rank=("rank", "mean"),
        years_in_top10=("rank", "count"),
        best_rank=("rank", "min"),
        worst_rank=("rank", "max"),
        earliest_year=("year", "min"),
        latest_year=("year", "max"),
    )

    # return a freshly sorted frame rather than sorting in place
    return players.sort_values(by=["average_rank"])

In [None]:
# Build the per-player summaries from the raw CSV files. index_col=0 makes
# read_csv use the first CSV column as the row index instead of as data.
players_f = player_summaries(pd.read_csv('female_raw.csv', index_col=0))
players_m = player_summaries(pd.read_csv('male_raw.csv', index_col=0))

In [None]:
# Save the per-player summaries, one file each for the men's and women's data.
players_m.to_csv('male.csv')
players_f.to_csv('female.csv')

# Visuals, clustering and dimensionality reduction

In [None]:
# Load the men's player summaries from male.csv, using its first column as the index.
players_m = pd.read_csv('male.csv', index_col=0)

In [None]:
# Show the men's summary frame as this cell's output.
players_m