In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

In [2]:
# import custom stats preparing tools
import sys

sys.path.insert(1,'../utils')

from stats_preparing import *

## Choosing a metric

Here we study how the nearest-neighbour distances vary from one distance metric to another. We keep the metric
whose distances show the greatest variation (largest standard deviation), in the hope that it separates the players best.

In [3]:
# --- 95-96 season -----------------------------------------------------------
# Load the CSV with pandas and run it through the preprocessing helpers
# imported from stats_preparing (see ../utils/stats_preparing.py for what
# each helper does exactly).
df = pd.read_csv('../data/adv_stats_95-96.csv')

df = remove_rk(remove_slash_name(df))
df = remove_age(remove_team(df))
df = remove_game(remove_min(df))

df = remove_nan(df)

# extract_name_position returns a pair; only the first element (the frame
# whose .values are fed to the scaler below) is needed here.
df_old, _ = extract_name_position(df)

# --- 18-19 season: identical preprocessing ----------------------------------
df = pd.read_csv('../data/adv_stats_18-19.csv')

df = remove_rk(remove_slash_name(df))
df = remove_age(remove_team(df))
df = remove_game(remove_min(df))

df = remove_nan(df)

df_current, _ = extract_name_position(df)

# The scaler is fitted on the current season only; the old season is then
# transformed with the same min/max so both live in one feature space.
scaler = MinMaxScaler()

curr_values = scaler.fit_transform(df_current.values)
old_values = scaler.transform(df_old.values)

# For every candidate metric: find, for each old player, the distance to the
# single closest current player, and record the standard deviation of those
# distances.
# NOTE(review): some of these names (e.g. rogerstanimoto, russellrao,
# sokalsneath) are boolean dissimilarities in SciPy and may not be meaningful
# on min-max scaled floats; 'kulsinski' may be unavailable on recent
# SciPy / scikit-learn versions — confirm against the installed versions.
# NOTE(review): raw standard deviations are compared across metrics that do
# not share a scale, so a metric with a wider range can win for that reason
# alone — consider a scale-free measure such as std / mean.
performance_metric = {}
for metric in ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'hamming', 'kulsinski', 'minkowski', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath',
               'sqeuclidean']:
    # n_neighbors must be passed by keyword: it is keyword-only in
    # scikit-learn >= 1.0 and the positional form raises TypeError there.
    neigh = NearestNeighbors(n_neighbors=1, metric=metric)

    neigh.fit(curr_values)

    dst, ids = neigh.kneighbors(old_values)
    performance_metric[metric] = np.std(dst)


# Name of the metric with the largest standard deviation; reused by the
# functions defined in later cells.
max_idx = max(performance_metric, key=performance_metric.get)
max_idx

'canberra'

In [6]:
def get_alike_player(name):
    """Return the five 18-19 players closest to a given 95-96 player.

    Both seasons are loaded from ``../data/adv_stats_<season>.csv`` and passed
    through the same preprocessing chain (helpers from ``stats_preparing``),
    restricted to the positions PG, SG, SF, PF and C. A ``MinMaxScaler`` is
    fitted on the 18-19 features and reused for the 95-96 player, then a
    5-nearest-neighbour search is run with the module-level metric
    ``max_idx`` (which must already be defined by an earlier cell).

    Parameters
    ----------
    name : str
        Value matched exactly against the ``Player`` column of the 95-96 data.

    Returns
    -------
    The rows of the 18-19 names frame (second element returned by
    ``extract_name_position``) for the five nearest neighbours. If several
    95-96 rows match ``name``, only the neighbours of the first matching row
    are returned.

    Raises
    ------
    ValueError
        If no 95-96 row matches ``name`` after preprocessing.
    """
    def _load_season(season):
        # Shared preprocessing for one season's CSV.
        stats = pd.read_csv('../data/adv_stats_{}.csv'.format(season))
        stats = remove_rk(remove_slash_name(stats))
        stats = remove_age(remove_team(stats))
        stats = remove_game(remove_min(stats))
        stats = stats.loc[stats['Pos'].isin(['PG', 'SG', 'SF', 'PF', 'C'])]
        return remove_nan(stats)

    old_season = _load_season('95-96')
    target_rows = old_season.loc[old_season['Player'] == name]
    if len(target_rows) == 0:
        # Fail early with a readable message instead of letting the scaler /
        # neighbour search choke on an empty array.
        raise ValueError(
            "player {!r} not found in the 95-96 data".format(name))
    target_stats, _ = extract_name_position(target_rows)

    current_stats, current_names = extract_name_position(_load_season('18-19'))

    # n_neighbors is keyword-only in scikit-learn >= 1.0.
    neigh = NearestNeighbors(n_neighbors=5, metric=max_idx)

    # Fit the scaler on the current season; apply the same scaling to the
    # old player so both are compared in one feature space.
    scaler = MinMaxScaler()
    neigh.fit(scaler.fit_transform(current_stats.values))

    _, ids = neigh.kneighbors(scaler.transform(target_stats.values))
    # ids has one row per query row; keep the neighbours of the first one.
    return current_names.iloc[ids[0]]

In [7]:
# 95-96 players for which we look up the closest 18-19 players.
player_names = [
    'Michael Jordan',
    'Charles Barkley',
    'Shawn Bradley',
    'Muggsy Bogues',
    'Patrick Ewing',
    'Larry Johnson',
]
for p in player_names:
    # Header first, then the neighbours table, then a blank separator line.
    print('Equivalent current player to {}'.format(p))
    print(get_alike_player(p))
    print()

Equivalent current player to Michael Jordan
                    Player Pos
240            Paul George  SF
397          Kawhi Leonard  SF
267           James Harden  PG
19   Giannis Antetokounmpo  PF
104           Jimmy Butler  SF

Equivalent current player to Charles Barkley
                 Player Pos
161       Anthony Davis   C
647  Karl-Anthony Towns   C
353        Nikola Jokic   C
661      Nikola Vucevic   C
554       Julius Randle  PF

Equivalent current player to Shawn Bradley
               Player Pos
448      JaVale McGee   C
36      Mohamed Bamba   C
502       Joakim Noah   C
51        Jordan Bell   C
678  Hassan Whiteside   C

Equivalent current player to Muggsy Bogues
                    Player Pos
109          Isaiah Canaan  PG
171    Matthew Dellavedova  PG
627         Caleb Swanigan  PF
17           Ryan Anderson  PF
496  Sviatoslav Mykhailiuk  SF

Equivalent current player to Patrick Ewing
                Player Pos
678   Hassan Whiteside   C
448       JaVale McGee   C
6

In [12]:
def get_alike_team(team, old='95-96', new='18-19'):
    """Map every player of one team in season ``old`` to the closest player of season ``new``.

    Both seasons are read from ``../data/adv_stats_<season>.csv`` and passed
    through the preprocessing helpers from ``stats_preparing``, restricted to
    the positions PG, SG, SF, PF and C. The ``old`` season is additionally
    filtered on ``Tm == team`` before the team column is removed. A
    ``MinMaxScaler`` is fitted on the ``new`` season and reused for the old
    team, then a 1-nearest-neighbour search is run with the module-level
    metric ``max_idx`` (which must already be defined by an earlier cell).

    Parameters
    ----------
    team : str
        Value matched exactly against the ``Tm`` column of the ``old`` season.
    old : str, default '95-96'
        Season label of the team to translate (used in the CSV file name).
    new : str, default '18-19'
        Season label in which equivalents are searched.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'<old> players'`` and ``'<new> players'``, one row per
        player of the old team, each paired with its nearest neighbour.

    Raises
    ------
    ValueError
        If no row of the ``old`` season is left for ``team`` after filtering.
    """
    positions = ['PG', 'SG', 'SF', 'PF', 'C']

    # --- old season, restricted to the requested team ----------------------
    df = pd.read_csv('../data/adv_stats_{}.csv'.format(old))
    df = remove_rk(remove_slash_name(df))
    # The team filter has to run before remove_team is applied.
    df = df.loc[df['Tm'] == team]
    df = remove_age(remove_team(df))
    df = remove_game(remove_min(df))
    df = df.loc[df['Pos'].isin(positions)]
    df = remove_nan(df)
    if len(df) == 0:
        # Fail early with a readable message instead of letting the scaler /
        # neighbour search choke on an empty array.
        raise ValueError(
            "no players found for team {!r} in season {}".format(team, old))

    df_old_team, df_old_names = extract_name_position(df)

    # --- new season, all teams ---------------------------------------------
    df = pd.read_csv('../data/adv_stats_{}.csv'.format(new))
    df = remove_rk(remove_slash_name(df))
    df = remove_age(remove_team(df))
    df = remove_game(remove_min(df))
    df = df.loc[df['Pos'].isin(positions)]
    df = remove_nan(df)

    df_current_player, df_current_names = extract_name_position(df)

    # n_neighbors is keyword-only in scikit-learn >= 1.0.
    neigh = NearestNeighbors(n_neighbors=1, metric=max_idx)

    # Fit the scaler on the new season; apply the same scaling to the old team.
    scaler = MinMaxScaler()
    neigh.fit(scaler.fit_transform(df_current_player.values))

    _, ids = neigh.kneighbors(scaler.transform(df_old_team.values))

    # ids has shape (n_old_players, 1); flatten it to index the names frame.
    # Column 0 of each names frame is used as the player name.
    result_df = pd.DataFrame({
        '{} players'.format(old): (df_old_names.values)[:, 0],
        '{} players'.format(new): df_current_names.iloc[np.reshape(ids, -1)].values[:, 0],
    })

    return result_df

In [13]:
# Seasons to compare: each 'old' Chicago player is paired with the closest 'new' player.
old = '95-96'
new = '18-19'
result_df = get_alike_team('CHI', old=old, new=new)
print('old player and current equivalent :')
# Bare last expression so the notebook renders the frame as a table.
result_df

old player and current equivalent :


Unnamed: 0,95-96 players,18-19 players
0,Randy Brown,Shaquille Harrison
1,Jud Buechler,Delon Wright
2,Jason Caffey,Greg Monroe
3,James Edwards,Marquese Chriss
4,Jack Haley,Angel Delgado
5,Ron Harper,Delon Wright
6,Michael Jordan,Paul George
7,Steve Kerr,D.J. Augustin
8,Toni Kukoc,Pascal Siakam
9,Luc Longley,Ivica Zubac
