In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

# Scrape data 

In [None]:
# Wikipedia ranking pages to scrape: index 0 is the women's page, index 1 the men's page.
urls = ['https://en.wikipedia.org/wiki/Official_Women%27s_Squash_World_Ranking',
    'https://en.wikipedia.org/wiki/Official_Men%27s_Squash_World_Ranking']

In [None]:
def is_not_numeric(s):
    """
    Tell whether `s` fails to parse as a float.

    returns bool
        True when float(s) raises ValueError, False when it succeeds
    """
    try:
        float(s)
        return False
    except ValueError:
        return True


def table_to_pandas(table):
    """
    Turn one year-end ranking table into a frame of player names.

    Layout the parsing relies on (open the urls to see the tables):
    * the first header cell and the first cell of every body row are
      skipped (`[1:]`) because that column only repeats the ranks 1 to 10
    * body cells whose text parses as a number are the ranking points
      and are filtered out with is_not_numeric
    * the top row of the table carries the years

    returns pd.DataFrame
        index runs from 1 to 10
        column names are the years from the header row
        entries are player names
    """
    def cell_text(cell):
        # only newlines are removed; any other whitespace is kept as scraped
        return cell.text.replace("\n", "")

    rows = table.find_all("tr")
    years = [cell_text(th) for th in rows[0].find_all("th")[1:]]

    names_per_rank = []
    for row in rows[1:]:
        texts = [cell_text(td) for td in row.find_all("td")[1:]]
        names_per_rank.append([text for text in texts if is_not_numeric(text)])

    return pd.DataFrame(names_per_rank, columns=years, index=range(1, 11))


def url_to_pandas(url):
    """
    Download a ranking page and return its year-end top-10 tables in long form.

    The page is cut down to the HTML between the two section markers below,
    every table in that slice except the first is parsed with
    table_to_pandas, and the results are joined side by side before being
    stacked into one row per (rank, year).

    returns pd.DataFrame
        columns are "rank", "year" and "player"

    raises
        requests.HTTPError if the server answers with an error status
        requests.Timeout if the server does not answer in time
        ValueError if the section markers cannot be located in the page
    """
    # without a timeout a stalled connection would hang the notebook forever
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text

    start = html.find('id="Year_end_world_top_10_players')
    end = html.find('id="Year-end_number_1')
    # str.find returns -1 for a missing marker, and slicing with -1 would
    # silently select the wrong part of the page, so fail loudly instead
    if start < 0 or end <= start:
        raise ValueError(
            "could not locate the year-end top 10 section in {}".format(url)
        )
    tables = BeautifulSoup(html[start:end], "html.parser").find_all("table")

    # tables[1:] because first table does not fit the pattern of the other tables
    # go to the urls to see
    df = pd.concat([table_to_pandas(t) for t in tables[1:]], axis=1)

    # df.stack() creates a new frame with a multiindex consisting of the old
    # index and the old columns, so the multiindex of df_stack is [rank, year]
    # and there is a single value column holding the player names
    df_stack = df.stack().reset_index()
    df_stack.columns = ["rank", "year", "player"]

    return df_stack

In [None]:
# urls[1] is the men's page and urls[0] the women's page.
df_m = url_to_pandas(urls[1])
df_f = url_to_pandas(urls[0])

In [None]:
# Save the scraped frames; the processing section reads these two files back.
df_m.to_csv('male_raw.csv')
df_f.to_csv('female_raw.csv')

# Process data

In [None]:
def player_summaries(df):
    """
    Collapse the long (rank, year, player) frame into one row per player.

    df needs the columns "player", "rank" and "year".

    returns pd.DataFrame
        index is the player name
        average_rank    mean of the player's ranks
        years_in_top10  number of rows (ranked years) for the player
        best_rank       smallest rank reached
        worst_rank      largest rank reached
        earliest_year   first year the player appears
        latest_year     last year the player appears
        rows are sorted by average_rank, best average first
    """
    # Named aggregation labels each output column where it is computed, and
    # string function names avoid the pandas deprecation of passing
    # np.mean / np.min / np.max to agg.
    players = df.groupby("player").agg(
        average_rank=("rank", "mean"),
        years_in_top10=("rank", "count"),
        best_rank=("rank", "min"),
        worst_rank=("rank", "max"),
        earliest_year=("year", "min"),
        latest_year=("year", "max"),
    )

    return players.sort_values(by=["average_rank"])

In [None]:
# index_col=0 restores the row index that to_csv wrote as the first CSV column.
players_f = player_summaries(pd.read_csv('female_raw.csv', index_col=0))
players_m = player_summaries(pd.read_csv('male_raw.csv', index_col=0))

In [None]:
# Save the per-player summaries; the player-name index is written as the first CSV column.
players_m.to_csv('male.csv')
players_f.to_csv('female.csv')

# Visuals, clustering and dimensionality reduction

In [5]:
# Load the men's per-player summary, using the first CSV column as the index.
m = pd.read_csv('male.csv', index_col=0)

In [6]:
# Show the men's summary table loaded above.
m

Unnamed: 0_level_0,average_rank,years_in_top10,best_rank,worst_rank,earliest_year,latest_year
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Peter Nicol,2.4,10,1,8,1996,2005
Ali Farag,3.25,4,1,7,2016,2019
Jansher Khan,3.333333,3,1,8,1996,1998
Jonathon Power,3.666667,9,1,9,1997,2005
Mohamed El Shorbagy,3.7,10,1,10,2010,2019
Ramy Ashour,3.909091,11,1,7,2006,2016
Grégory Gaultier,4.0,15,1,10,2003,2018
Ahmed Barada,4.25,4,2,7,1997,2000
Rodney Eyles,4.333333,3,2,7,1996,1998
Nick Matthew,4.642857,14,1,10,2004,2017


In [48]:
# Fit a PCA that keeps every component and express each player in component space.
# NOTE(review): the features are not standardised before fitting, so the columns
# with the largest variance drive the leading components - confirm this is intended.
pca = PCA()
m_pca = pca.fit_transform(m)
# Name the columns from the actual number of components instead of assuming six.
m_pca = pd.DataFrame(
    m_pca,
    columns=[f"pca{i + 1}" for i in range(m_pca.shape[1])],
    index=m.index,
)
m_pca

Unnamed: 0_level_0,pca1,pca2,pca3,pca4,pca5,pca6
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Peter Nicol,-7.476936,8.752985,0.552572,-1.311669,-1.057366,0.525524
Ali Farag,15.612668,0.402603,5.826557,0.238659,-0.27062,0.36293
Jansher Khan,-13.292174,1.979958,3.979559,1.398703,-0.758381,0.547431
Jonathon Power,-6.857177,7.364381,0.123749,0.079605,-0.768221,0.227218
Mohamed El Shorbagy,12.278616,6.681417,0.356974,0.44134,-1.104049,0.468166
Ramy Ashour,7.390437,7.97735,0.633991,-2.073599,0.351077,-0.506991
Grégory Gaultier,7.351065,12.398622,-3.440164,-1.347248,-0.399891,0.369817
Ahmed Barada,-11.087805,1.99516,2.951062,-0.041546,-0.029137,-0.015498
Rodney Eyles,-13.324423,1.167023,3.369889,0.336136,-0.019619,0.007447
Nick Matthew,7.184409,10.885256,-2.92399,-0.74582,-0.41075,-0.31352


In [49]:
# Project one hand-written feature row into component space.
# A labelled frame is passed rather than a bare array so the input carries the
# feature names the PCA was fitted with; scikit-learn warns when they are missing.
pca.transform(
    pd.DataFrame(
        [[2.4, 10, 1, 8, 1996, 2005]],
        columns=m.columns,
    )
)

array([[-7.47693593,  8.75298503,  0.55257224, -1.3116692 , -1.0573659 ,
         0.52552363]])

In [37]:
# Project another hand-written feature row into component space.
# A labelled frame is passed rather than a bare array so the input carries the
# feature names the PCA was fitted with; scikit-learn warns when they are missing.
pca.transform(pd.DataFrame([[2, 8, 1, 5, 2000, 2010]], columns=m.columns))

array([[-1.38538908,  7.25839028,  3.01209462, -3.21025431,  1.03348237,
         1.41643398]])

In [50]:
# Round trip: feed the component scores obtained for [2.4, 10, 1, 8, 1996, 2005]
# back through the PCA. With all six components kept, the original feature
# values are recovered up to floating-point error.
pca.inverse_transform(
    np.array(
        [[-7.47693593, 8.75298503, 0.55257224, -1.3116692, -1.0573659, 0.52552363]]
    )
)

array([[2.40000000e+00, 1.00000000e+01, 1.00000000e+00, 7.99999999e+00,
        1.99600000e+03, 2.00500000e+03]])

In [38]:
# Round trip: feed the component scores obtained for [2, 8, 1, 5, 2000, 2010]
# back through the PCA. With all six components kept, the original feature
# values are recovered up to floating-point error.
pca.inverse_transform(
    np.array(
        [[-1.38538908, 7.25839028, 3.01209462, -3.21025431, 1.03348237, 1.41643398]]
    )
)

array([[2.00000000e+00, 8.00000000e+00, 9.99999996e-01, 5.00000000e+00,
        2.00000000e+03, 2.01000000e+03]])

### Reconstructing the original data from a subset of components

In [64]:
# Keep only the first component: zero every score from pca2 onwards.
# The label slice avoids listing each component by hand, and 0.0 keeps the
# columns floating point regardless of how pandas handles the assignment.
m_pca1 = m_pca.copy()
m_pca1.loc[:, 'pca2':] = 0.0
m_pca1

Unnamed: 0_level_0,pca1,pca2,pca3,pca4,pca5,pca6
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Peter Nicol,-7.476936,0.0,0.0,0.0,0.0,0.0
Ali Farag,15.612668,0.0,0.0,0.0,0.0,0.0
Jansher Khan,-13.292174,0.0,0.0,0.0,0.0,0.0
Jonathon Power,-6.857177,0.0,0.0,0.0,0.0,0.0
Mohamed El Shorbagy,12.278616,0.0,0.0,0.0,0.0,0.0
Ramy Ashour,7.390437,0.0,0.0,0.0,0.0,0.0
Grégory Gaultier,7.351065,0.0,0.0,0.0,0.0,0.0
Ahmed Barada,-11.087805,0.0,0.0,0.0,0.0,0.0
Rodney Eyles,-13.324423,0.0,0.0,0.0,0.0,0.0
Nick Matthew,7.184409,0.0,0.0,0.0,0.0,0.0


In [65]:
# Side by side: the original features (left) and their reconstruction from the
# first component only (right, columns 0-5 in the same feature order).
pd.concat([m, pd.DataFrame(pca.inverse_transform(m_pca1), index=m.index)], axis=1)

Unnamed: 0_level_0,average_rank,years_in_top10,best_rank,worst_rank,earliest_year,latest_year,0,1,2,3,4,5
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Peter Nicol,2.4,10,1,8,1996,2005,6.363585,4.065379,4.555544,8.395453,1999.328161,2002.713058
Ali Farag,3.25,4,1,7,2016,2019,6.597757,6.043323,4.353743,9.172439,2014.542482,2019.947985
Jansher Khan,3.333333,3,1,8,1996,1998,6.304607,3.567224,4.606368,8.199765,1995.496354,1998.372352
Jonathon Power,3.666667,9,1,9,1997,2005,6.36987,4.11847,4.550127,8.416309,1999.736535,2003.175669
Mohamed El Shorbagy,3.7,10,1,10,2010,2019,6.563944,5.757716,4.382882,9.060246,2012.345591,2017.459327
Ramy Ashour,3.909091,11,1,7,2006,2016,6.514368,5.338976,4.425604,8.895754,2009.124647,2013.810611
Grégory Gaultier,4.0,15,1,10,2003,2018,6.513969,5.335603,4.425948,8.894429,2009.098704,2013.781222
Ahmed Barada,4.25,4,2,7,1997,2000,6.326964,3.756058,4.587102,8.273944,1996.948868,2000.017773
Rodney Eyles,4.333333,3,2,7,1996,1998,6.30428,3.564461,4.60665,8.19868,1995.475105,1998.34828
Nick Matthew,4.642857,14,1,10,2004,2017,6.512279,5.321326,4.427405,8.888821,2008.98889,2013.656824


In [67]:
# Keep only the first two components: zero every score from pca3 onwards.
# The label slice avoids listing each component by hand, and 0.0 keeps the
# columns floating point regardless of how pandas handles the assignment.
m_pca2 = m_pca.copy()
m_pca2.loc[:, 'pca3':] = 0.0
m_pca2

Unnamed: 0_level_0,pca1,pca2,pca3,pca4,pca5,pca6
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Peter Nicol,-7.476936,8.752985,0.0,0.0,0.0,0.0
Ali Farag,15.612668,0.402603,0.0,0.0,0.0,0.0
Jansher Khan,-13.292174,1.979958,0.0,0.0,0.0,0.0
Jonathon Power,-6.857177,7.364381,0.0,0.0,0.0,0.0
Mohamed El Shorbagy,12.278616,6.681417,0.0,0.0,0.0,0.0
Ramy Ashour,7.390437,7.97735,0.0,0.0,0.0,0.0
Grégory Gaultier,7.351065,12.398622,0.0,0.0,0.0,0.0
Ahmed Barada,-11.087805,1.99516,0.0,0.0,0.0,0.0
Rodney Eyles,-13.324423,1.167023,0.0,0.0,0.0,0.0
Nick Matthew,7.184409,10.885256,0.0,0.0,0.0,0.0


In [68]:
# Side by side: the original features (left) and their reconstruction from the
# first two components (right, columns 0-5 in the same feature order).
pd.concat([m, pd.DataFrame(pca.inverse_transform(m_pca2), index=m.index)], axis=1)

Unnamed: 0_level_0,average_rank,years_in_top10,best_rank,worst_rank,earliest_year,latest_year,0,1,2,3,4,5
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Peter Nicol,2.4,10,1,8,1996,2005,3.884388,9.786447,0.140165,8.616488,1995.814079,2005.130603
Ali Farag,3.25,4,1,7,2016,2019,6.483724,6.30647,4.150653,9.182606,2014.380847,2020.059183
Jansher Khan,3.333333,3,1,8,1996,1998,5.743804,4.861351,3.607593,8.249764,1994.701456,1998.919209
Jonathon Power,3.666667,9,1,9,1997,2005,4.283982,8.931928,0.83522,8.602278,1996.77994,2005.209686
Mohamed El Shorbagy,3.7,10,1,10,2010,2019,4.671498,10.124779,1.012491,9.228968,2009.663187,2019.304711
Ramy Ashour,3.909091,11,1,7,2006,2016,4.254862,10.553078,0.401489,9.097202,2005.921962,2016.013927
Grégory Gaultier,4.0,15,1,10,2003,2018,3.002181,13.439507,-1.828444,9.207526,2004.121,2017.205679
Ahmed Barada,4.25,4,2,7,1997,2000,5.761854,5.060122,3.580659,8.324327,1996.147866,2000.56883
Rodney Eyles,4.333333,3,2,7,1996,1998,5.973733,4.327243,4.017954,8.22815,1995.006577,1998.670608
Nick Matthew,4.642857,14,1,10,2004,2017,3.429137,12.436075,-1.063582,9.163701,2004.618761,2016.663295


In [69]:
# Variance captured by each principal component, largest first.
pca.explained_variance_

array([118.81586177,  29.39876913,   4.86314221,   1.31442008,
         0.22146742,   0.18436197])

In [70]:
# Share of the total variance per component; the first two together account for about 96%.
pca.explained_variance_ratio_

array([0.76755413, 0.18991696, 0.03141605, 0.00849119, 0.00143069,
       0.00119098])