In [3815]:
import pandas as pd
import pandas_profiling

import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


from sklearn.neural_network import MLPClassifier

"""
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
from sklearn.cluster import KMeans, SpectralClustering

import pydot
from IPython.display import Image

import time
from sklearn.manifold import TSNE
import seaborn as sb
sb.set(rc={'figure.figsize':(11.7,8.27)})
palette = sb.color_palette("bright", 10)
"""

# In order to see full screen (horizontal scrolling) dataframes
from IPython.display import display
pd.options.display.max_columns = None

# === === === === === === === === === ===
# Ziel / Problembeschreibung

Für den gewählten Datensatz soll versucht werden mittels Regressionsmethoden die Gehälter der NBA-Spieler vorherzusagen. Welche Features sind ausschlaggebend für ein hohes bzw. niedriges Gehalt?

Es soll ein Regressionsproblem gelöst werden, wodurch folgende Bereiche behandelt werden sollen:

- Linear Regression (Regression)
- KNN (Regression)
- Decision Tree (Regression)
- Neuronal Network (Regression)


- PCA (visualisieren)
- T-SNE (visualisieren)


- KMeans (CLustern)
- Spectral Clustering (Clustern)

TODO: Wie?

Dimensionality Reduction kannst du zum Visualisieren benutzen um visuell zu sehen, ob es Gruppierungen bei den Spielern gibt (Welche Features sind dabei wichtig? zb College oder Position).

Mit Clustering kannst du dann die Spieler in k Gruppen einteilen, abhängig von den sportlichen Werten (Wenn man ähnlich gute sportliche Werte hat, hat man dann auch ein ähnliches gutes Gehalt?

# === === === === === === === === === ===
# Daten

Die Datensätze __salaries__ und __players__ wurden aus folgender Quelle gewählt: https://data.world/datadavis/nba-salaries.

Der Datensatz __seasons_stats__ wurde aus folgender Quelle gewählt: https://www.kaggle.com/drgilermo/nba-players-stats

In [3648]:
salaries_df = pd.read_csv('data/salaries.csv')
players_df = pd.read_csv('data/players.csv')
seasons_stats_df = pd.read_csv('data/seasons_stats.csv')

# === === === === === === === === === ===
# Datenaufbereitung

## === === === === === === === === === ===
## Dataframe: Salaries

Diese Daten von https://data.world/datadavis/nba-salaries bilden den Ausgangspunkt für die beschriebene Fragestellung. Der Datensatz beinhaltet im Wesentlichen das ausbezahlte __Gehalt__ an einen Spieler für eine bestimmte __Saison__.

In [3649]:
salaries_df.sample()

Unnamed: 0,league,player_id,salary,season,season_end,season_start,team
3167,NBA,dayeda01,120000,1985-86,1986,1985,Washington Bullets


### Pandas Profiling Report

Der nachfolgende _Pandas Profiling Report_ verschafft einen vollständigen Überblick (Statistiken, Visualisierungen) über den importierten Datensatz.

In [3650]:
# salaries_profile_report = salaries_df.profile_report()
# salaries_profile_report.to_file(output_file="salaries_profile_report.html")

### Analyse der einzelnen Spalten

In [3651]:
salaries_df.columns

Index(['league', 'player_id', 'salary', 'season', 'season_end', 'season_start',
       'team'],
      dtype='object')

#### League

Laut dem _Pandas Profiling Report_ handelt es sich bei der Variable __league__ um eine Konstante. Alle Spieler des Datensatzes sind in der _NBA_ tätig. Das Feature schafft somit keinen Mehrwert und kann entfernt werden.

In [3652]:
salaries_df['league'].describe()

count     14163
unique        1
top         NBA
freq      14163
Name: league, dtype: object

In [3653]:
salaries_df.drop(columns=['league'], inplace=True)

#### Player ID

Die Variable __player_id__ kann zunächst im Datensatz gelassen werden. Über diese ID können später die zusätzlichen Daten des Spielers vom Datensatz __Players__ eingebunden werden.

#### Salary

Die wichtigste Variable __salary__ wird natürlich im Datensatz behalten.

#### Season (Start, End)

Die Variablen __season__, __season_start__ und __season_end__ müssen nicht alle im Datensatz belassen werden. Die Variable __season__ kann aus den beiden anderen konstruiert werden und ist somit redundant. Sie kann somit entfernt werden.

In [3654]:
salaries_df.drop(columns=['season'], inplace=True)

### Ergebnis

In [3655]:
salaries_df.head()

Unnamed: 0,player_id,salary,season_end,season_start,team
0,abdelal01,395000,1991,1990,Portland Trail Blazers
1,abdelal01,494000,1992,1991,Portland Trail Blazers
2,abdelal01,500000,1993,1992,Boston Celtics
3,abdelal01,805000,1994,1993,Boston Celtics
4,abdelal01,650000,1995,1994,Sacramento Kings


In [3656]:
salaries_df.tail()

Unnamed: 0,player_id,salary,season_end,season_start,team
14158,zipsepa01,750000,2017,2016,Chicago Bulls
14159,zipsepa01,1312611,2018,2017,Chicago Bulls
14160,zizican01,1645200,2018,2017,Cleveland Cavaliers
14161,zubaciv01,1034956,2017,2016,Los Angeles Lakers
14162,zubaciv01,1312611,2018,2017,Los Angeles Lakers


In [3657]:
salaries_df.sample(7)

Unnamed: 0,player_id,salary,season_end,season_start,team
9794,parksch02,971000,1996,1995,Dallas Mavericks
4697,gomesry01,4000000,2011,2010,Los Angeles Clippers
848,barryjo01,650000,1994,1993,Milwaukee Bucks
10548,redicjj01,7250000,2011,2010,Orlando Magic
1406,boshch01,3348000,2006,2005,Toronto Raptors
5221,harklma01,1731960,2013,2012,Orlando Magic
2706,covinro01,1015696,2017,2016,Philadelphia 76ers


In [3658]:
salaries_df.describe()

Unnamed: 0,salary,season_end,season_start
count,14163.0,14163.0,14163.0
mean,3164870.0,2003.66942,2002.66942
std,4185046.0,9.16469,9.16469
min,2706.0,1985.0,1984.0
25%,630000.0,1996.0,1995.0
50%,1500000.0,2004.0,2003.0
75%,3884239.0,2012.0,2011.0
max,34682550.0,2018.0,2017.0


In [3659]:
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14163 entries, 0 to 14162
Data columns (total 5 columns):
player_id       14163 non-null object
salary          14163 non-null int64
season_end      14163 non-null int64
season_start    14163 non-null int64
team            14159 non-null object
dtypes: int64(3), object(2)
memory usage: 553.4+ KB


## === === === === === === === === === ===
## Dataframe: Players

Die Daten von https://data.world/datadavis/nba-salaries komplementieren den zuvor beschriebenen __Salaries__ Datensatz. Sie beinhalten im Wesentlichen __Informationen zu den jeweiligen Spielern__.

In [3660]:
players_df.sample()

Unnamed: 0,_id,birthDate,birthPlace,career_AST,career_FG%,career_FG3%,career_FT%,career_G,career_PER,career_PTS,career_TRB,career_WS,career_eFG%,college,draft_pick,draft_round,draft_team,draft_year,height,highSchool,name,position,shoots,weight
3091,oboyljo01,"March 7, 1928",,1.0,30.8,,71.4,5,3.4,4.2,2.0,0.0,,Colorado State University,,,,,6-2,,John O'Boyle,Shooting Guard,Right,185lb


### Pandas Profiling Report

Der nachfolgende _Pandas Profiling Report_ verschafft einen vollständigen Überblick (Statistiken, Visualisierungen) über den importierten Datensatz.

In [3661]:
# players_profile_report = players_df.profile_report()
# players_profile_report.to_file(output_file="players_profile_report.html")

### Analyse der einzelnen Spalten

In [3662]:
players_df.columns

Index(['_id', 'birthDate', 'birthPlace', 'career_AST', 'career_FG%',
       'career_FG3%', 'career_FT%', 'career_G', 'career_PER', 'career_PTS',
       'career_TRB', 'career_WS', 'career_eFG%', 'college', 'draft_pick',
       'draft_round', 'draft_team', 'draft_year', 'height', 'highSchool',
       'name', 'position', 'shoots', 'weight'],
      dtype='object')

#### ID

Die Variable __id__ kann zunächst im Datensatz belassen werden. Mit dieser Variable kann nämlich später das Matching mit dem __Salaries__ Datensatz vollzogen werden.

#### Birth Date

Laut dem _Pandas Profiling Report_ weißt die Variable __birthDate__ folgende Probleme auf:

- Hohe Kardinalität
- Uniform

Außerdem hat die Variable __28__ fehlende Datenpunkte.

Prinzipiell könnten die Features _birth_month_, _birth_day_ und _birth_year_ aus der genannten Variable entnommen werden. Dies scheint auf den ersten Blick jedoch keinen wirklichen Mehrwert zu bieten. Somit wird dieses Feature verworfen.

In [3663]:
players_df.drop(columns=['birthDate'], inplace=True)

#### Birth Place

Laut dem _Pandas Profiling Report_ weißt die Variable __birthPlace__ folgende Probleme auf:

- Fehlende Datenpunkte || 10.7%
- Hohe Kardinalität || 39%


##### Fehlende Datenpunkte

Die fehlenden Daten werden mit dem Top-Wert aufgefüllt.

In [3664]:
players_df['birthPlace'].describe()

count                  4185
unique                 1632
top       Chicago, Illinois
freq                    137
Name: birthPlace, dtype: object

In [3665]:
players_df['birthPlace'] = players_df['birthPlace'].fillna("Chicago, Illinois")

##### Hohe Kardinalität

Um dieses Problem zu lösen soll nur das Land in Betracht gezogen werden. Eine drastischere Maßnahme wäre die reine Verwendung des Features __born_in_usa__. Diese Variable wurde nur Aufschluss darüber geben, ob der Spieler in der USA geboren wurde oder nicht.

In [3666]:
players_df['birthPlace'].describe()

count                  4685
unique                 1632
top       Chicago, Illinois
freq                    637
Name: birthPlace, dtype: object

Wie man oberhalb erkennen kann weißt das Feature 1632 einmalige Werte auf.

In [3667]:
def generalizeBirthPlace(birthPlace):
    splitted =  birthPlace.split(",")
    
    return splitted[1] if len(splitted) >= 2 else ",".join(splitted)

In [3668]:
def statesToUSA(birthPlace):
    states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
              "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
              "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
              "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah","Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
    
    def process(string):
        return string.lower().strip()
        
    states = map(process, states)
    
    return "USA" if process(birthPlace) in states else birthPlace

In [3669]:
players_df['birthPlace'] = players_df['birthPlace'].apply(generalizeBirthPlace)

In [3670]:
players_df['birthPlace'] = players_df['birthPlace'].apply(statesToUSA)

In [3671]:
players_df.rename(columns={"birthPlace": "birth_country"}, inplace=True)

In [3672]:
players_df['birth_country'].sample(7)

4216                      USA
3615                      USA
2337                      USA
3800                      USA
4584                      USA
1616                      USA
3450     District of Columbia
Name: birth_country, dtype: object

In [3673]:
players_df['birth_country'].describe()

count     4685
unique      90
top        USA
freq      4160
Name: birth_country, dtype: object

Durch die reine Verwendung des Landes lassen sich 1632 einmalige Werte auf 90 reduzieren.

In [3674]:
players_df['born_in_usa'] = np.where(players_df['birth_country'] == 'USA', True, False)

In [3675]:
players_df['born_in_usa'].describe()

count     4685
unique       2
top       True
freq      4160
Name: born_in_usa, dtype: object

#### Career Stats

Alle Variablen die Statistiken zu der gesamten Karriere des Spielers beinhalten können verworfen werden. Diese Features würden keinen Mehrwert für das aktuelle Gehalt in dieser Saison bieten. Stattdessen sollen im weiteren Verlauf die jeweiligen Saisonstatistiken des Spielers durch einen weiteren Datensatz erweitert werden.

In [3676]:
players_df.drop(columns=['career_AST', 'career_FG%', 'career_FG3%', 'career_FT%', 'career_G', 'career_PER', 'career_PTS', 'career_TRB', 'career_WS', 'career_eFG%'], inplace=True)

#### College

Laut dem _Pandas Profiling Report_ weißt die Variable __college__ folgende Probleme auf:

- Fehlende Datenpunkte || 6.7%
- Hohe Kardinalität || 16.7%

##### Fehlende Datenpunkte

Das Problem der fehlenden Datenpunkte wird durch folgende Annahme behoben: Fehlende Datenpunkte repräsentieren Spieler, die kein College besucht haben. _LeBron James_ hat beispielsweise kein College eingetragen und auch tatsächlich kein College besucht. Deshalb wird der Wert _NO_COLLEGE_ bei diesen Spielern hinzugefügt.

In [3677]:
players_df['college'] = players_df['college'].fillna('NO_COLLEGE')

In [3678]:
players_df['college'].describe()

count           4685
unique           732
top       NO_COLLEGE
freq             313
Name: college, dtype: object

##### Hohe Kardinalität

Das Problem der hohen Kardinalität scheint derzeit nicht lösbar zu sein. Eine drastische Maßnahme wäre ausschließlich das Feature __attended_college__ aufzunehmen.

In [3679]:
players_df['attended_college'] = np.where(players_df['college'] == 'NO_COLLEGE', False, True)

In [3680]:
players_df['attended_college'].describe()

count     4685
unique       2
top       True
freq      4372
Name: attended_college, dtype: object

#### Draft

Laut dem _Pandas Profiling Report_ weißen die Features __draft_pick__, __draft_round__, __draft_team__, __draft_year__ folgende Probleme auf:

- Hohe Kardinalität
- Fehlende Datenpunkte

Diese Probleme sollen durch die Extrahierung eines neuen Features __drafted_player__ behoben werden. Die ursprünglichen Features werden verworfen. Das neue Feature beruht auf der Annahme, dass es sich bei diesen Spielern um _undrafted players_ handelt. Da es diese Kategorie im Datensatz nicht gibt und diese Art von Spielern jedoch sehrwohl existieren wird in dieser Analyse von diesem Umstand ausgegangen.

In [3681]:
players_df['drafted_player'] = np.where(
    players_df['draft_pick'].isna() |
    players_df['draft_round'].isna() |
    players_df['draft_team'].isna() |
    players_df['draft_year'].isna(), True, False
)

In [3682]:
players_df['drafted_player'].describe()

count      4685
unique        2
top       False
freq       3306
Name: drafted_player, dtype: object

Die originalen Features werden nicht benötigt und können vom Datensatz entfernt werden.

In [3683]:
players_df.drop(columns=['draft_pick', 'draft_round', 'draft_team', 'draft_year'], inplace=True)

#### Height

Bei der Variable __height__ handelt es sich derzeit um kategorische Daten. Die Größe soll daher in eine numerische Repräsentation umgewandelt werden. Im originalen Datensatz handelt es sich um eine Foot-Inch Darstellung. Daher wird die Größe auf Inches umgerechnet.

In [3684]:
def heightToInches(height):
    foot_and_inches = height.split('-')
    foot_in_inches = int(foot_and_inches[0]) * 12
    inches = int(foot_and_inches[1])
    
    return foot_in_inches + inches

In [3685]:
players_df['height'].describe()

count     4685
unique      28
top        6-7
freq       486
Name: height, dtype: object

In [3686]:
players_df['height'] = players_df['height'].apply(heightToInches)

In [3687]:
players_df.rename(columns={"height": "height_in_inches"}, inplace=True)

In [3688]:
players_df['height_in_inches'].sample(7)

2977    79
4070    80
2798    75
1031    79
582     74
351     76
1283    77
Name: height_in_inches, dtype: int64

#### High School

Die Variable __highSchool__ weißt Ähnlichkeiten mit dem Feature __college__ auf: 

- Hohe Kardinalität || 67.3%
- Fehlende Datenpunkte || 14.1%

Da dieses Feature eine wirkliche hohe Kardinalität besitzt soll ausschließlich das Feature __attended_high_school__ extrahiert werden.

In [3689]:
players_df['highSchool'].describe()

count                                              4025
unique                                             2708
top       Oak Hill Academy in Mouth of Wilson, Virginia
freq                                                 29
Name: highSchool, dtype: object

In [3690]:
players_df['highSchool'] = players_df['highSchool'].isna()

In [3691]:
players_df.rename(columns={"highSchool": "attended_high_school"}, inplace=True)

In [3692]:
players_df['attended_high_school'].sample(7)

2905    False
32      False
3503    False
3060     True
2874    False
3030    False
2784    False
Name: attended_high_school, dtype: bool

#### Name

Laut dem _Pandas Profiling Report_ weißt die Variable __name__ folgende Probleme auf:

- Hohe Kardinalität || 99%
- Uniform 

Möglicherweise könnte man __name_length__ aus dem genannten Feature extrahieren, jedoch besteht die Vermutung, dass der Name keinen großen Einfluss auf das Gehalt der Spieler hat. Deshalb soll das Feature aus dem Datensatz entfernt werden. Vorerst wird es jedoch beibehalten, da es später für ein Matching mit einem anderen Datensatz verwendet wird.

#### Position

Laut dem _Pandas Profiling Report_ weißt die Variable _position_ keine hohe Kardinalität auf. Dennoch wird versucht die möglichen Ausprägungen etwas einzuschränken. Spieler denen mehrer Positionen zugeordnet sind bekommen einen dementsprechenden Eintrag. Eine drastischere Vorgehensweise wäre die reine Verwendung des Features __multiple_positions__.

In [3693]:
def processMultiplePositions(position):
    return "MULTIPLE_POSITIONS" if "and" in position.split() else position

In [3694]:
players_df['position'].describe()

count               4685
unique                43
top       Shooting Guard
freq                 679
Name: position, dtype: object

In [3695]:
players_df['position'] = players_df['position'].apply(processMultiplePositions)

In [3696]:
players_df['position'].describe()

count                   4685
unique                    12
top       MULTIPLE_POSITIONS
freq                    1208
Name: position, dtype: object

In [3697]:
players_df['position'].sample(7)

2473         Small Forward
3552                Center
430         Shooting Guard
3685           Point Guard
1935    MULTIPLE_POSITIONS
2179         Power Forward
197          Power Forward
Name: position, dtype: object

In [3698]:
players_df['multiple_positions'] = np.where(players_df['position'] == "MULTIPLE_POSITIONS", True, False)

In [3699]:
players_df['multiple_positions'].describe()

count      4685
unique        2
top       False
freq       3477
Name: multiple_positions, dtype: object

#### Shoots

Die Variable __shoots__ kann vorerst ohne weitere Bearbeitung übernommen werden.

#### Weight

Laut dem _Pandas Profiling Report_ weißt die Variable __weight__ folgendes Problem auf:

- Hohe Kardinalität || 3.1%

Derzeit handelt es sich beim Gewicht um einen kategorischen Wert. Das Gewicht soll daher in eine numerische Repräsentation umgewandelt werden. Somit wird die Einheit wird entfernt und der Datentyp angepasst.

##### Fehlende Datenpunkte

Laut dem _Pandas Profiling Report_ hat diese Variable __5__ fehlende Datenpunkte. Diese werden durch den Top-Wert aufgefüllt.

In [3700]:
players_df['weight'].describe()

count      4680
unique      143
top       210lb
freq        334
Name: weight, dtype: object

In [3701]:
players_df['weight'] = players_df['weight'].fillna('210lb')

##### Wertumwandlung

In [3702]:
players_df['weight'].describe()

count      4685
unique      143
top       210lb
freq        339
Name: weight, dtype: object

In [3703]:
def removeLB(weight):
    return weight[:-2]

In [3704]:
players_df['weight'] = players_df['weight'].apply(removeLB)

In [3705]:
players_df['weight'] = players_df['weight'].astype(int)

In [3706]:
players_df['weight'].sample(7)

1445    200
3040    200
4070    220
2555    200
3686    255
1158    200
3689    225
Name: weight, dtype: int64

### Ergebnis

In [3707]:
players_df.head()

Unnamed: 0,_id,birth_country,college,height_in_inches,attended_high_school,name,position,shoots,weight,born_in_usa,attended_college,drafted_player,multiple_positions
0,abdelal01,Egypt,Duke University,82,False,Alaa Abdelnaby,Power Forward,Right,240,False,True,False,False
1,abdulza01,USA,Iowa State University,81,False,Zaid Abdul-Aziz,MULTIPLE_POSITIONS,Right,235,True,True,False,True
2,abdulka01,USA,"University of California, Los Angeles",86,False,Kareem Abdul-Jabbar,Center,Right,225,True,True,False,False
3,abdulma02,USA,Louisiana State University,73,False,Mahmoud Abdul-Rauf,Point Guard,Right,162,True,True,False,False
4,abdulta01,France,"University of Michigan, San Jose State University",78,False,Tariq Abdul-Wahad,Shooting Guard,Right,223,False,True,False,False


In [3708]:
players_df.tail()

Unnamed: 0,_id,birth_country,college,height_in_inches,attended_high_school,name,position,shoots,weight,born_in_usa,attended_college,drafted_player,multiple_positions
4680,zizican01,Croatia,NO_COLLEGE,83,True,Ante Zizic,Center,Right,254,False,False,False,False
4681,zoetji01,Canada,Kent State University,85,False,Jim Zoet,Center,Right,240,False,True,True,False
4682,zopfbi01,USA,Duquesne University,73,False,Bill Zopf,Point Guard,Right,170,True,True,False,False
4683,zubaciv01,Bosnia and Herzegovina,NO_COLLEGE,85,True,Ivica Zubac,Center,Right,240,False,False,False,False
4684,zunicma01,USA,George Washington University,75,True,Matt Zunic,Guard/Forward,Right,195,True,True,True,False


In [3709]:
players_df.sample(7)

Unnamed: 0,_id,birth_country,college,height_in_inches,attended_high_school,name,position,shoots,weight,born_in_usa,attended_college,drafted_player,multiple_positions
1303,floydsl01,USA,Georgetown University,75,False,Sleepy Floyd,MULTIPLE_POSITIONS,Right,170,True,True,False,True
2241,kennedj01,USA,St. John's University,78,False,D.J. Kennedy,Small Forward,Left,215,True,True,True,False
129,armstbj01,USA,University of Iowa,74,False,B.J. Armstrong,Point Guard,Right,175,True,True,False,False
4244,vanzade01,USA,Azusa Pacific University,81,False,Dennis Van Zant,Power Forward,Right,210,True,True,False,False
2520,lopezfe01,Dominican Republic,St. John's University,77,False,Felipe Lopez,MULTIPLE_POSITIONS,Right,199,False,True,False,True
3990,stoudsa01,USA,University of Arizona,73,False,Salim Stoudamire,MULTIPLE_POSITIONS,Left,179,True,True,False,True
1016,derrima01,District of Columbia,Georgetown University,79,False,Marcus Derrickson,Power Forward,Right,249,False,True,True,False


In [3710]:
players_df.describe()

Unnamed: 0,height_in_inches,weight
count,4685.0,4685.0
mean,78.032657,209.056564
std,3.632334,26.125355
min,63.0,114.0
25%,75.0,190.0
50%,78.0,210.0
75%,81.0,225.0
max,91.0,360.0


In [3711]:
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4685 entries, 0 to 4684
Data columns (total 13 columns):
_id                     4685 non-null object
birth_country           4685 non-null object
college                 4685 non-null object
height_in_inches        4685 non-null int64
attended_high_school    4685 non-null bool
name                    4685 non-null object
position                4685 non-null object
shoots                  4684 non-null object
weight                  4685 non-null int64
born_in_usa             4685 non-null bool
attended_college        4685 non-null bool
drafted_player          4685 non-null bool
multiple_positions      4685 non-null bool
dtypes: bool(5), int64(2), object(6)
memory usage: 315.8+ KB


## === === === === === === === === === ===
## Dataframe: Seasons Stats

Die Daten von https://www.kaggle.com/drgilermo/nba-players-stats sollen als Erweiterung des __Salaries__ Datensatz fungieren. Wie bereits erwähnt, sind die _Career Stats_ des __Players__ Datensatzes eher unbrauchbar. Es macht vermutlich mehr Sinn, dass jeweilige Gehalt in einer Saison durch die entsprechenden _seasons_stats_ zu erweitern. Dies soll durch diesen Datensatz ermöglicht werden. Er beinhaltet im Wesentlichen __Statistiken von einem Spieler in einer Saison__.

In [3712]:
seasons_stats_df.sample()

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
10259,10259,1991.0,Travis Mays,PG,22.0,SAC,64.0,55.0,2145.0,12.3,0.526,0.272,0.457,2.8,6.7,4.7,18.3,1.9,0.3,15.5,21.0,,0.8,1.3,2.1,0.047,,-0.1,-1.4,-1.5,0.2,294.0,724.0,0.406,72.0,197.0,0.365,222.0,527.0,0.421,0.456,255.0,331.0,0.77,54.0,124.0,178.0,253.0,81.0,11.0,159.0,169.0,915.0


### Analyse der einzelnen Spalten

In [3713]:
seasons_stats_df.columns

Index(['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP',
       'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
       'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2',
       'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
       '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

#### Unnamed

Die erste Variable __unnamed__ scheint einen Index darzustellen. Sie kann ohne weitere Bedenken entfernt werden.

In [3714]:
seasons_stats_df.drop(seasons_stats_df.columns[0], axis=1, inplace=True)

#### Year und Player

Die Variablen __year__ und __player__ sollen für das Matching mit den anderen Datensätzen verwendet werden. Die Variable __year__ kann mit __season_end__ des __Salaries__ Datensatzes verbunden werden.

#### Position

Dieser Datensatz beinhaltet ebenfalls eine Variable __Pos__ (Position). Diese kennzeichnet jedoch die Position in dieser Saison. Daher wird dieses feature und nicht das Feature des anderen Datensatzes verwendet. Die erstellten Features des anderen Datensatzes können somit zunächst verworfen werden.

In [3715]:
seasons_stats_df['Pos'].describe()

count     24624
unique       23
top          PF
freq       4966
Name: Pos, dtype: object

In [3716]:
players_df.drop(columns=['position', 'multiple_positions'], inplace=True)

#### Age

Das Alter des jeweiligen Spielers scheint auf jeden Fall Sinn zu machen. Daher wird dieses Feature behalten.

In [3717]:
seasons_stats_df['Age'].describe()

count    24616.000000
mean        26.664405
std          3.841892
min         18.000000
25%         24.000000
50%         26.000000
75%         29.000000
max         44.000000
Name: Age, dtype: float64

#### Saisonstatistiken

Alle Saisonstatistiken des Spielers sollen natürlich behalten werden. Der Datensatz soll genau diese Erweiterung an Daten liefern.

### Ergebnis

In [3718]:
seasons_stats_df.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,,0.467,,,,,,,,,,-0.1,3.6,3.5,,,,,,,144.0,516.0,0.279,,,,144.0,516.0,0.279,0.279,170.0,241.0,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,,0.387,,,,,,,,,,1.6,0.6,2.2,,,,,,,102.0,274.0,0.372,,,,102.0,274.0,0.372,0.372,75.0,106.0,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,,0.259,,,,,,,,,,0.9,2.8,3.6,,,,,,,174.0,499.0,0.349,,,,174.0,499.0,0.349,0.349,90.0,129.0,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0


In [3719]:
seasons_stats_df.tail()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
24686,2017.0,Cody Zeller,PF,24.0,CHO,62.0,58.0,1725.0,16.7,0.604,0.002,0.442,8.6,17.3,12.9,9.1,1.8,3.0,10.9,15.5,,3.4,2.2,5.6,0.157,,-0.2,2.3,2.1,1.8,253.0,443.0,0.571,0.0,1.0,0.0,253.0,442.0,0.572,0.571,133.0,196.0,0.679,135.0,270.0,405.0,99.0,62.0,58.0,65.0,189.0,639.0
24687,2017.0,Tyler Zeller,C,27.0,BOS,51.0,5.0,525.0,13.0,0.508,0.006,0.247,9.2,17.0,13.2,12.2,0.7,3.3,10.2,16.5,,0.5,0.6,1.0,0.094,,-3.2,0.8,-2.5,-0.1,78.0,158.0,0.494,0.0,1.0,0.0,78.0,157.0,0.497,0.494,22.0,39.0,0.564,43.0,81.0,124.0,42.0,7.0,21.0,20.0,61.0,178.0
24688,2017.0,Stephen Zimmerman,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.0,0.161,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,,-0.1,0.1,0.0,-0.005,,-7.8,0.4,-7.3,-0.1,10.0,31.0,0.323,0.0,0.0,,10.0,31.0,0.323,0.323,3.0,5.0,0.6,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
24689,2017.0,Paul Zipser,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,,-0.3,0.8,0.5,0.03,,-3.6,-0.1,-3.7,-0.4,88.0,221.0,0.398,33.0,99.0,0.333,55.0,122.0,0.451,0.473,31.0,40.0,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0
24690,2017.0,Ivica Zubac,C,19.0,LAL,38.0,11.0,609.0,17.0,0.547,0.013,0.206,7.1,21.9,14.3,8.1,1.1,4.4,10.4,20.3,,0.6,0.5,1.1,0.086,,-2.7,0.3,-2.5,-0.1,126.0,238.0,0.529,0.0,3.0,0.0,126.0,235.0,0.536,0.529,32.0,49.0,0.653,41.0,118.0,159.0,30.0,14.0,33.0,30.0,66.0,284.0


In [3720]:
seasons_stats_df.sample(7)

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
18862,2008.0,Antonio Daniels,PG,32.0,WAS,71.0,63.0,2161.0,13.7,0.549,0.166,0.49,1.2,10.0,5.5,24.0,1.7,0.1,14.3,13.5,,3.5,1.0,4.5,0.099,,0.0,-1.2,-1.2,0.4,205.0,447.0,0.459,17.0,74.0,0.23,188.0,373.0,0.504,0.478,170.0,219.0,0.776,23.0,180.0,203.0,340.0,69.0,2.0,91.0,78.0,597.0
17377,2005.0,Nenad Krstic,PF,21.0,NJN,75.0,57.0,1965.0,13.4,0.547,0.004,0.447,9.7,14.9,12.3,7.1,0.9,2.6,14.1,19.0,,1.9,2.5,4.4,0.108,,-2.1,0.5,-1.6,0.2,281.0,570.0,0.493,0.0,2.0,0.0,281.0,568.0,0.495,0.493,185.0,255.0,0.725,161.0,240.0,401.0,77.0,32.0,63.0,112.0,280.0,747.0
21004,2011.0,Garret Siler,C,24.0,PHO,21.0,0.0,101.0,13.5,0.553,0.0,0.71,18.3,13.7,16.0,4.7,0.5,2.9,19.7,22.6,,0.1,0.1,0.1,0.054,,-2.4,-1.8,-4.2,-0.1,17.0,31.0,0.548,0.0,0.0,,17.0,31.0,0.548,0.548,11.0,22.0,0.5,16.0,12.0,28.0,3.0,1.0,4.0,10.0,20.0,45.0
6691,1982.0,Paul Mokeski,C,25.0,TOT,67.0,4.0,868.0,8.7,0.489,0.016,0.326,7.2,19.9,13.3,5.1,1.8,2.6,19.9,12.7,,-0.2,1.0,0.8,0.045,,-4.2,1.4,-2.8,-0.2,84.0,193.0,0.435,0.0,3.0,0.0,84.0,190.0,0.442,0.435,48.0,63.0,0.762,59.0,149.0,208.0,35.0,33.0,40.0,55.0,171.0,216.0
12355,1996.0,Mitchell Butler,SG,25.0,WSB,61.0,3.0,858.0,7.4,0.446,0.262,0.362,4.1,12.0,8.1,11.1,2.4,1.0,20.1,17.3,,-1.2,1.0,-0.2,-0.01,,-4.2,0.5,-3.7,-0.4,88.0,229.0,0.384,13.0,60.0,0.217,75.0,169.0,0.444,0.413,48.0,83.0,0.578,29.0,89.0,118.0,67.0,41.0,12.0,67.0,104.0,237.0
423,1951.0,Dick Mehen,PF,28.0,TOT,66.0,,,,0.404,,0.231,,,,,,,,,,0.5,1.5,1.9,,,,,,,192.0,532.0,0.361,,,,192.0,532.0,0.361,0.361,90.0,123.0,0.732,,,223.0,118.0,,,,149.0,474.0
17094,2005.0,Gilbert Arenas,PG,23.0,WAS,80.0,80.0,3274.0,21.3,0.565,0.369,0.42,2.8,10.5,6.5,22.9,2.2,0.5,11.8,27.3,,9.2,2.3,11.5,0.169,,5.5,-1.5,4.0,4.9,656.0,1523.0,0.431,205.0,562.0,0.365,451.0,961.0,0.469,0.498,521.0,640.0,0.814,83.0,295.0,378.0,411.0,139.0,23.0,242.0,245.0,2038.0


In [3721]:
seasons_stats_df.describe()

Unnamed: 0,Year,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,24624.0,24616.0,24624.0,18233.0,24138.0,24101.0,24538.0,18839.0,24525.0,20792.0,20792.0,21571.0,22555.0,20792.0,20792.0,19582.0,19640.0,0.0,24585.0,24585.0,24585.0,24101.0,0.0,20797.0,20797.0,20797.0,20797.0,24624.0,24624.0,24525.0,18927.0,18927.0,15416.0,24624.0,24624.0,24496.0,24525.0,24624.0,24624.0,23766.0,20797.0,20797.0,24312.0,24624.0,20797.0,20797.0,19645.0,24624.0,24624.0
mean,1992.594989,26.664405,50.83711,23.593375,1209.720317,12.479071,0.493001,0.158604,0.325455,6.181565,13.708657,9.94921,13.009962,1.648269,1.410624,15.085099,18.906492,,1.257307,1.227395,2.485796,0.065002,,-1.778386,-0.54857,-2.32672,0.559879,195.32582,430.645752,0.430817,22.215037,63.60448,0.248796,178.250447,381.756782,0.445343,0.450658,102.389336,136.775219,0.719279,62.18921,147.199404,224.637381,114.852623,39.897052,24.47026,73.939832,116.339222,510.11635
std,17.429594,3.841892,26.496161,28.632387,941.146575,6.039014,0.094469,0.187495,0.218971,4.872685,6.636402,5.040283,9.191843,1.017024,1.773348,6.91917,5.448157,,2.136256,1.269613,3.058638,0.102471,,3.792947,2.253106,4.691619,1.336892,188.114361,397.624715,0.095921,38.543366,102.442769,0.176683,179.478923,371.260335,0.099803,0.0992,113.373565,146.078918,0.141824,67.324881,145.921912,228.190203,135.863913,38.713053,36.935084,67.713803,84.791873,492.922981
min,1950.0,18.0,1.0,0.0,0.0,-90.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,-5.1,-1.0,-2.8,-2.519,,-73.8,-30.4,-86.7,-2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1981.0,24.0,27.0,0.0,340.0,9.8,0.458,0.005,0.208,2.6,8.8,5.9,6.5,1.1,0.3,11.4,15.4,,-0.1,0.2,0.2,0.031,,-3.4,-1.7,-4.2,-0.2,41.0,99.0,0.393,0.0,1.0,0.1,35.0,82.0,0.407,0.414,18.0,27.0,0.657,12.0,33.0,51.0,19.0,9.0,3.0,18.0,39.0,106.0
50%,1996.0,26.0,58.0,8.0,1053.0,12.7,0.506,0.064,0.296,5.4,12.7,9.2,10.5,1.5,0.9,14.2,18.6,,0.4,0.8,1.4,0.075,,-1.5,-0.5,-1.8,0.0,141.0,321.0,0.439,2.0,11.0,0.292,122.0,270.0,0.456,0.463,63.0,88.0,0.743,38.0,106.0,159.0,68.0,29.0,11.0,55.0,109.0,364.0
75%,2007.0,29.0,75.0,45.0,1971.0,15.6,0.544,0.288,0.4,9.0,18.1,13.5,17.6,2.1,1.9,17.7,22.2,,1.9,1.8,3.8,0.115,,0.3,0.7,0.3,0.9,299.0,661.0,0.48,27.0,84.0,0.363,268.0,579.25,0.496,0.501,149.0,201.0,0.808,91.0,212.0,322.0,160.0,60.0,29.0,112.0,182.0,778.0
max,2017.0,44.0,88.0,83.0,3882.0,129.1,1.136,1.0,6.0,100.0,100.0,100.0,100.0,24.2,77.8,100.0,100.0,,18.3,16.0,25.4,2.123,,47.8,46.8,36.2,12.4,1597.0,3159.0,1.0,402.0,886.0,1.0,1597.0,3159.0,1.0,1.5,840.0,1363.0,1.0,587.0,1111.0,2149.0,1164.0,301.0,456.0,464.0,386.0,4029.0


In [3722]:
seasons_stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24691 entries, 0 to 24690
Data columns (total 52 columns):
Year      24624 non-null float64
Player    24624 non-null object
Pos       24624 non-null object
Age       24616 non-null float64
Tm        24624 non-null object
G         24624 non-null float64
GS        18233 non-null float64
MP        24138 non-null float64
PER       24101 non-null float64
TS%       24538 non-null float64
3PAr      18839 non-null float64
FTr       24525 non-null float64
ORB%      20792 non-null float64
DRB%      20792 non-null float64
TRB%      21571 non-null float64
AST%      22555 non-null float64
STL%      20792 non-null float64
BLK%      20792 non-null float64
TOV%      19582 non-null float64
USG%      19640 non-null float64
blanl     0 non-null float64
OWS       24585 non-null float64
DWS       24585 non-null float64
WS        24585 non-null float64
WS/48     24101 non-null float64
blank2    0 non-null float64
OBPM      20797 non-null float64
DBPM      2

## === === === === === === === === === ===
## Matching Dataframes

### Seasons Stats weiter aufräumen

Es gibt anscheinend Spieler, die innerhalb einer Saison bei mehreren Vereinen gespielt haben. In diesem Fall wird nur der letzte Aufenthalt des Spielers (der letzte Eintrag) berücksichtigt.

In [3723]:
seasons_stats_df.loc[seasons_stats_df['Player'] == "Ed Bartels"]

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,,0.395,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,22.0,86.0,0.256,,,,22.0,86.0,0.256,0.256,19.0,34.0,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,,0.378,,,,,,,,,,-0.5,-0.1,-0.6,,,,,,,21.0,82.0,0.256,,,,21.0,82.0,0.256,0.256,17.0,31.0,0.548,,,,20.0,,,,27.0,59.0
5,1950.0,Ed Bartels,F,24.0,NYK,2.0,,,,0.376,,0.75,,,,,,,,,,0.0,0.0,0.0,,,,,,,1.0,4.0,0.25,,,,1.0,4.0,0.25,0.25,2.0,3.0,0.667,,,,0.0,,,,2.0,4.0
317,1951.0,Ed Bartels,F,25.0,WSC,17.0,,,,0.307,,0.474,,,,,,,,,,-0.8,0.2,-0.5,,,,,,,24.0,97.0,0.247,,,,24.0,97.0,0.247,0.247,24.0,46.0,0.522,,,84.0,12.0,,,,54.0,72.0


In [3724]:
duplicated_indices = seasons_stats_df[seasons_stats_df.duplicated(['Player', 'Year'], keep='last')].index

In [3725]:
seasons_stats_df.drop(duplicated_indices, inplace=True)

### Erweiterung durch Spielerdaten

In [3726]:
df = pd.merge(salaries_df, players_df, left_on='player_id', right_on='_id')

In [3727]:
df.drop(columns=['_id', 'player_id'], inplace=True)

In [3728]:
df.sample(7)

Unnamed: 0,salary,season_end,season_start,team,birth_country,college,height_in_inches,attended_high_school,name,shoots,weight,born_in_usa,attended_college,drafted_player
1834,885120,2013,2012,Minnesota Timberwolves,USA,University of Arizona,79,False,Chase Budinger,Right,209,True,True,False
11131,1070000,2004,2003,Los Angeles Lakers,USA,"California State University, Long Beach",79,False,Bryon Russell,Right,225,True,True,False
13240,17531250,2005,2004,Sacramento Kings,USA,University of Michigan,81,False,Chris Webber,Right,245,True,True,False
14154,603000,1996,1995,Charlotte Hornets,Czech Republic,"University of California, Los Angeles",84,False,George Zidek,Right,250,False,True,False
5497,4500000,2007,2006,Washington Wizards,USA,University of North Carolina,84,False,Brendan Haywood,Right,268,True,True,False
10811,1000000,2007,2006,New Jersey Nets,District of Columbia,University of Michigan,78,False,Bernard Robinson,Left,210,False,True,False
5268,6000000,2009,2008,Utah Jazz,USA,Georgia Institute of Technology,79,False,Matt Harpring,Right,231,True,True,False


### Erweiterung durch Saisonstatistiken

In [3729]:
df = pd.merge(df, seasons_stats_df, left_on=['name', 'season_end'], right_on=['Player', 'Year'])

In [3730]:
df.drop(columns=['name', 'Player', 'Year'], inplace=True)

### Ergebnis

In [3731]:
df.head()

Unnamed: 0,salary,season_end,season_start,team,birth_country,college,height_in_inches,attended_high_school,shoots,weight,born_in_usa,attended_college,drafted_player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,395000,1991,1990,Portland Trail Blazers,Egypt,Duke University,82,False,Right,240,False,True,False,PF,22.0,POR,43.0,0.0,290.0,13.1,0.499,0.0,0.379,10.4,23.4,17.0,5.8,0.7,2.5,14.0,22.1,,0.0,0.5,0.5,0.079,,-4.2,-0.7,-5.0,-0.2,55.0,116.0,0.474,0.0,0.0,,55.0,116.0,0.474,0.474,25.0,44.0,0.568,27.0,62.0,89.0,12.0,4.0,12.0,22.0,39.0,135.0
1,494000,1992,1991,Portland Trail Blazers,Egypt,Duke University,82,False,Right,240,False,True,False,PF,23.0,POR,71.0,1.0,934.0,13.5,0.533,0.0,0.28,9.5,20.9,15.2,4.7,1.3,1.1,14.0,20.6,,0.6,1.5,2.1,0.11,,-3.0,-0.9,-3.9,-0.5,178.0,361.0,0.493,0.0,0.0,,178.0,361.0,0.493,0.493,76.0,101.0,0.752,81.0,179.0,260.0,30.0,25.0,16.0,66.0,132.0,432.0
2,500000,1993,1992,Boston Celtics,Egypt,Duke University,82,False,Right,240,False,True,False,PF,24.0,BOS,63.0,52.0,1152.0,13.4,0.557,0.0,0.24,11.3,18.1,14.8,2.2,0.8,1.2,15.4,20.5,,0.7,1.2,1.9,0.079,,-2.1,-2.1,-4.1,-0.6,219.0,417.0,0.525,0.0,0.0,,219.0,417.0,0.525,0.525,76.0,100.0,0.76,114.0,186.0,300.0,17.0,19.0,22.0,84.0,165.0,514.0
3,805000,1994,1993,Boston Celtics,Egypt,Duke University,82,False,Right,240,False,True,False,PF,25.0,BOS,13.0,0.0,159.0,9.2,0.485,0.0,0.455,8.5,24.2,16.3,2.7,0.6,1.2,20.5,22.6,,-0.2,0.1,-0.1,-0.032,,-7.1,-3.1,-10.2,-0.3,24.0,55.0,0.436,0.0,0.0,,24.0,55.0,0.436,0.436,16.0,25.0,0.64,12.0,34.0,46.0,3.0,2.0,3.0,17.0,20.0,64.0
4,650000,1995,1994,Sacramento Kings,Egypt,Duke University,82,False,Right,240,False,True,False,PF,26.0,PHI,3.0,0.0,30.0,-12.5,0.091,0.0,0.0,11.6,19.6,15.5,0.0,0.0,0.0,31.3,23.8,,-0.3,0.0,-0.3,-0.466,,-21.1,-6.9,-28.0,-0.2,1.0,11.0,0.091,0.0,0.0,,1.0,11.0,0.091,0.091,0.0,0.0,,3.0,5.0,8.0,0.0,0.0,0.0,5.0,2.0,2.0


In [3732]:
df.tail()

Unnamed: 0,salary,season_end,season_start,team,birth_country,college,height_in_inches,attended_high_school,shoots,weight,born_in_usa,attended_college,drafted_player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
12228,694000,1997,1996,Charlotte Hornets,Czech Republic,"University of California, Los Angeles",84,False,Right,250,False,True,False,C,23.0,DEN,16.0,0.0,88.0,20.3,0.591,0.0,0.758,13.1,16.6,14.9,10.3,0.6,0.0,8.3,24.4,,0.3,0.0,0.4,0.2,,1.6,-3.9,-2.3,0.0,16.0,33.0,0.485,0.0,0.0,,16.0,33.0,0.485,0.485,20.0,25.0,0.8,10.0,13.0,23.0,5.0,1.0,0.0,4.0,17.0,52.0
12229,784200,1998,1997,Denver Nuggets,Czech Republic,"University of California, Los Angeles",84,False,Right,250,False,True,False,C,24.0,SEA,6.0,0.0,22.0,2.3,0.349,0.143,0.286,0.0,21.3,10.9,7.1,0.0,0.0,6.0,35.3,,-0.1,0.0,-0.1,-0.12,,-11.6,-7.3,-18.9,-0.1,3.0,14.0,0.214,1.0,2.0,0.5,2.0,12.0,0.167,0.25,4.0,4.0,1.0,0.0,4.0,4.0,1.0,0.0,0.0,1.0,5.0,11.0
12230,950000,2017,2016,Orlando Magic,USA,"University of Nevada, Las Vegas",84,False,Left,240,True,True,False,C,20.0,ORL,19.0,0.0,108.0,7.3,0.346,0.0,0.161,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,,-0.1,0.1,0.0,-0.005,,-7.8,0.4,-7.3,-0.1,10.0,31.0,0.323,0.0,0.0,,10.0,31.0,0.323,0.323,3.0,5.0,0.6,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
12231,750000,2017,2016,Chicago Bulls,Germany,NO_COLLEGE,80,True,Right,215,False,False,False,SF,22.0,CHI,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,,-0.3,0.8,0.5,0.03,,-3.6,-0.1,-3.7,-0.4,88.0,221.0,0.398,33.0,99.0,0.333,55.0,122.0,0.451,0.473,31.0,40.0,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0
12232,1034956,2017,2016,Los Angeles Lakers,Bosnia and Herzegovina,NO_COLLEGE,85,True,Right,240,False,False,False,C,19.0,LAL,38.0,11.0,609.0,17.0,0.547,0.013,0.206,7.1,21.9,14.3,8.1,1.1,4.4,10.4,20.3,,0.6,0.5,1.1,0.086,,-2.7,0.3,-2.5,-0.1,126.0,238.0,0.529,0.0,3.0,0.0,126.0,235.0,0.536,0.529,32.0,49.0,0.653,41.0,118.0,159.0,30.0,14.0,33.0,30.0,66.0,284.0


In [3733]:
df.sample(7)

Unnamed: 0,salary,season_end,season_start,team,birth_country,college,height_in_inches,attended_high_school,shoots,weight,born_in_usa,attended_college,drafted_player,Pos,Age,Tm,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
5204,15937500,2004,2003,New York Knicks,USA,University of Tennessee,78,False,Right,200,True,True,False,SG,32.0,NYK,50.0,50.0,1799.0,14.8,0.539,0.259,0.22,1.3,6.4,3.9,10.3,1.1,0.1,10.6,24.7,,2.5,0.6,3.1,0.083,,1.3,-2.7,-1.4,0.3,340.0,781.0,0.435,87.0,202.0,0.431,253.0,579.0,0.437,0.491,157.0,172.0,0.913,20.0,101.0,121.0,99.0,38.0,2.0,102.0,105.0,924.0
6871,2038000,1996,1995,Detroit Pistons,USA,Temple University,77,False,Right,185,True,True,False,PG,26.0,DET,23.0,0.0,287.0,9.5,0.515,0.224,0.164,4.4,4.9,4.6,9.2,2.9,0.0,11.1,13.4,,0.2,0.4,0.5,0.092,,-1.5,0.3,-1.3,0.1,29.0,67.0,0.433,7.0,15.0,0.467,22.0,52.0,0.423,0.485,9.0,11.0,0.818,10.0,12.0,22.0,16.0,15.0,0.0,9.0,34.0,74.0
4549,5000000,2013,2012,Chicago Bulls,USA,University of Connecticut,78,False,Right,185,True,True,False,SG,34.0,CHI,50.0,45.0,1088.0,10.6,0.481,0.109,0.162,1.7,7.2,4.4,19.9,1.2,0.3,14.2,25.0,,-0.6,0.9,0.4,0.016,,-3.1,-2.4,-5.5,-1.0,204.0,475.0,0.429,16.0,52.0,0.308,188.0,423.0,0.444,0.446,66.0,77.0,0.857,16.0,67.0,83.0,119.0,24.0,5.0,84.0,93.0,490.0
5167,1200000,1994,1993,Houston Rockets,USA,University of Alabama,81,False,Right,220,True,True,False,SF,23.0,HOU,81.0,81.0,2370.0,14.2,0.521,0.194,0.224,6.4,13.9,10.3,14.5,2.5,2.0,15.1,17.0,,1.6,4.3,5.9,0.12,,-0.6,3.1,2.5,2.7,322.0,702.0,0.459,44.0,136.0,0.324,278.0,566.0,0.491,0.49,115.0,157.0,0.732,128.0,312.0,440.0,231.0,119.0,75.0,137.0,186.0,803.0
5072,412718,2007,2006,Miami Heat,USA,University of Miami,74,False,Left,184,True,True,True,SG,23.0,MIA,12.0,0.0,136.0,7.4,0.391,0.383,0.2,3.6,10.2,6.9,9.9,1.2,1.2,8.4,24.8,,-0.3,0.1,-0.2,-0.055,,-4.9,-2.9,-7.9,-0.2,19.0,60.0,0.317,5.0,23.0,0.217,14.0,37.0,0.378,0.358,8.0,12.0,0.667,4.0,12.0,16.0,8.0,3.0,2.0,6.0,8.0,51.0
1684,810000,2007,2006,Phoenix Suns,Ireland,Auburn University,83,False,Left,250,False,True,True,C,33.0,PHO,23.0,0.0,164.0,9.3,0.424,0.338,0.2,9.0,23.7,16.7,4.3,0.9,0.9,9.0,21.3,,-0.1,0.2,0.1,0.015,,-4.6,-2.3,-6.9,-0.2,23.0,65.0,0.354,6.0,22.0,0.273,17.0,43.0,0.395,0.4,8.0,13.0,0.615,12.0,35.0,47.0,5.0,3.0,2.0,7.0,19.0,60.0
10105,5200000,2008,2007,Cleveland Cavaliers,USA,University of Maryland,82,False,Right,225,True,True,False,PF,32.0,CLE,27.0,1.0,580.0,14.9,0.547,0.012,0.406,10.5,16.4,13.4,5.6,0.8,2.2,9.9,17.4,,0.9,0.6,1.5,0.128,,-1.7,-0.5,-2.1,0.0,87.0,170.0,0.512,0.0,2.0,0.0,87.0,168.0,0.518,0.512,45.0,69.0,0.652,55.0,81.0,136.0,19.0,9.0,16.0,22.0,73.0,219.0


In [3734]:
df.describe()

Unnamed: 0,salary,season_end,season_start,height_in_inches,weight,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,blanl,OWS,DWS,WS,WS/48,blank2,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12230.0,12194.0,12190.0,12190.0,12230.0,12230.0,12230.0,12230.0,12230.0,12230.0,12201.0,12230.0,0.0,12233.0,12233.0,12233.0,12230.0,0.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12190.0,12233.0,12233.0,10299.0,12233.0,12233.0,12170.0,12190.0,12233.0,12233.0,11874.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0
mean,3031394.0,2003.357884,2002.357884,79.03041,217.824001,26.8352,52.998856,25.674569,1250.241968,12.748005,0.509993,0.181456,0.313901,6.075307,13.993156,10.035119,13.009992,1.633827,1.513083,14.484821,18.76139,,1.335151,1.264236,2.599722,0.073276,,-1.519186,-0.421025,-1.93999,0.599681,193.312679,422.900924,0.440695,27.813946,78.608273,0.263368,165.498733,344.292651,0.46129,0.471028,96.447642,127.966157,0.724607,62.749203,155.114363,217.863566,115.144609,40.730565,25.356004,74.43399,114.18213,510.886945
std,3922321.0,8.841971,8.841971,3.679179,27.400892,4.091276,25.389561,29.195356,909.889058,5.61119,0.086252,0.194209,0.22189,4.535725,6.298117,4.85206,9.446561,0.935333,1.647488,6.149822,5.212992,,1.992772,1.195955,2.882877,0.088705,,3.483871,2.085438,4.245327,1.314281,172.3409,367.296754,0.089236,42.852526,113.235908,0.169878,155.173812,313.580979,0.092562,0.090847,102.786317,130.38683,0.140343,64.354985,139.585459,197.581579,136.371164,36.603116,34.689186,63.50923,77.476732,461.19819
min,2853.0,1985.0,1984.0,63.0,133.0,18.0,1.0,0.0,0.0,-90.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,-3.3,-0.6,-2.1,-2.519,,-73.8,-19.5,-86.7,-2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,612435.0,1996.0,1995.0,76.0,195.0,24.0,31.0,1.0,423.0,10.1,0.479,0.007,0.196,2.5,9.3,6.2,6.3,1.1,0.4,11.1,15.3,,0.0,0.3,0.3,0.04,,-3.1,-1.6,-3.7,-0.2,49.0,116.0,0.405,0.0,2.0,0.161,39.0,88.0,0.428,0.44,20.0,29.0,0.664,15.0,45.0,63.0,21.0,11.0,4.0,22.0,46.0,129.0
50%,1450000.0,2004.0,2003.0,80.0,220.0,26.0,60.0,11.0,1137.0,12.9,0.52,0.111,0.283,5.1,13.1,9.2,10.2,1.5,1.0,13.8,18.5,,0.6,0.9,1.7,0.082,,-1.3,-0.4,-1.5,0.1,148.0,330.0,0.444,5.0,21.0,0.31,120.0,257.0,0.468,0.479,63.0,87.0,0.75,41.0,122.0,168.0,68.0,32.0,13.0,59.0,110.0,390.0
75%,3765000.0,2011.0,2010.0,82.0,237.0,30.0,76.0,51.0,1991.0,15.7,0.554,0.323,0.38775,9.0,18.1,13.5,17.6,2.0,2.1,16.9,22.0,,2.1,1.9,4.0,0.119,,0.5,0.8,0.5,1.0,296.0,646.0,0.484,41.0,119.0,0.37,249.0,515.0,0.503,0.513,138.0,187.0,0.814,91.0,224.0,313.0,156.0,61.0,32.0,111.0,173.0,782.0
max,30963450.0,2017.0,2016.0,91.0,360.0,42.0,82.0,82.0,3533.0,88.3,1.136,1.0,6.0,100.0,100.0,100.0,78.5,17.3,26.3,100.0,88.3,,14.8,9.1,20.3,1.084,,31.9,17.1,26.6,12.4,978.0,2173.0,1.0,402.0,886.0,1.0,802.0,1685.0,1.0,1.5,756.0,916.0,1.0,443.0,894.0,1258.0,991.0,301.0,456.0,464.0,371.0,2832.0


In [3735]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12233 entries, 0 to 12232
Data columns (total 63 columns):
salary                  12233 non-null int64
season_end              12233 non-null int64
season_start            12233 non-null int64
team                    12233 non-null object
birth_country           12233 non-null object
college                 12233 non-null object
height_in_inches        12233 non-null int64
attended_high_school    12233 non-null bool
shoots                  12233 non-null object
weight                  12233 non-null int64
born_in_usa             12233 non-null bool
attended_college        12233 non-null bool
drafted_player          12233 non-null bool
Pos                     12233 non-null object
Age                     12233 non-null float64
Tm                      12233 non-null object
G                       12233 non-null float64
GS                      12233 non-null float64
MP                      12233 non-null float64
PER                     12

### Weiteres Processing

#### Nicht identifizierbare Spalten

Durch den zuvor erlangten Überblick der Splaten lassen sich zwei Spalten ohne Datenpunkte erkennen. Diese sollen entfernt werden.

In [3736]:
df.drop(columns=['blanl', 'blank2'], inplace=True)

#### Teamname

Die Spalte, wo die Namen der Teams länger sind sollen entfernt werden. Die andere Spalte mit den kürzeren Namen wird umbenannt. Dadurch sollen die resultierenden Spaltennamen beim _One Hot Encoding_ etwas schöner werden.

In [3737]:
df.drop(columns=['team'], inplace=True)
df.rename(columns={"Tm": "team"}, inplace=True)

#### Spalten umbennen

Gewisse Spaltennamen müssen noch unbenannt werden. Nun sollten alle Features, abgesehen von den Saisonstatistiken k

In [3738]:
df.rename(columns={"Pos": "position", "Age": "age"}, inplace=True)

#### Ergebnis

In [3739]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12233 entries, 0 to 12232
Data columns (total 60 columns):
salary                  12233 non-null int64
season_end              12233 non-null int64
season_start            12233 non-null int64
birth_country           12233 non-null object
college                 12233 non-null object
height_in_inches        12233 non-null int64
attended_high_school    12233 non-null bool
shoots                  12233 non-null object
weight                  12233 non-null int64
born_in_usa             12233 non-null bool
attended_college        12233 non-null bool
drafted_player          12233 non-null bool
position                12233 non-null object
age                     12233 non-null float64
team                    12233 non-null object
G                       12233 non-null float64
GS                      12233 non-null float64
MP                      12233 non-null float64
PER                     12230 non-null float64
TS%                     1

### Fehlende Datenpunkte behandeln

In [3740]:
df.isnull().sum().sort_values(ascending=False)

3P%                     1934
FT%                      359
2P%                       63
FTr                       43
FG%                       43
eFG%                      43
3PAr                      43
TS%                       39
TOV%                      32
ORB%                       3
TRB%                       3
DRB%                       3
WS/48                      3
BLK%                       3
PER                        3
STL%                       3
USG%                       3
AST%                       3
age                        0
shoots                     0
season_end                 0
season_start               0
birth_country              0
college                    0
height_in_inches           0
attended_high_school       0
weight                     0
drafted_player             0
born_in_usa                0
attended_college           0
MP                         0
GS                         0
G                          0
team                       0
position      

Es existieren nur fehlende Daten in den Saisonstatistiken. Diese werden mit dem Durchschnittswert befüllt.

In [3741]:
features_with_missing_data = [
    "3P%",
    "FT%",                   
    "2P%",                     
    "FTr",                 
    "FG%",                     
    "eFG%",                   
    "3PAr",                   
    "TS%",                     
    "TOV%",                     
    "ORB%",                    
    "TRB%",                     
    "DRB%",                     
    "WS/48",                  
    "BLK%",                    
    "PER",                       
    "STL%",                      
    "USG%",                       
    "AST%",                     
]

for feature in features_with_missing_data:
    df[feature] = df[feature].fillna(df[feature].mean())      

In [3742]:
df.isnull().sum().sort_values(ascending=False)

PTS                     0
PF                      0
BLK%                    0
STL%                    0
AST%                    0
TRB%                    0
DRB%                    0
ORB%                    0
FTr                     0
3PAr                    0
TS%                     0
PER                     0
MP                      0
GS                      0
G                       0
team                    0
age                     0
position                0
drafted_player          0
attended_college        0
born_in_usa             0
weight                  0
shoots                  0
attended_high_school    0
height_in_inches        0
college                 0
birth_country           0
season_start            0
season_end              0
TOV%                    0
USG%                    0
OWS                     0
2PA                     0
TOV                     0
BLK                     0
STL                     0
AST                     0
TRB                     0
DRB         

### One Hot Encoding (& Feature-Auswahl)

In [3743]:
df.select_dtypes('bool').columns.values

array(['attended_high_school', 'born_in_usa', 'attended_college',
       'drafted_player'], dtype=object)

In [3744]:
df.select_dtypes('object').columns.values

array(['birth_country', 'college', 'shoots', 'position', 'team'],
      dtype=object)

#### Attended High School

In [3745]:
df = pd.get_dummies(df, columns=['attended_high_school'])

#### Drafted Player

In [3746]:
df = pd.get_dummies(df, columns=['drafted_player'])

#### Shoots

In [3747]:
df = pd.get_dummies(df, columns=['shoots'])

#### Position

Ohne Verwendung von diesem Feature würde der _Linear Regression Score_ etwas veringert sein.

In [3748]:
# df.drop(columns=['position'], inplace=True)

In [3749]:
df = pd.get_dummies(df, columns=['position'])

#### Team

Ohne Verwendung von diesem Feature würde der _Linear Regression Score_ etwas veringert sein.

In [3750]:
# df.drop(columns=['team'], inplace=True)

In [3751]:
df = pd.get_dummies(df, columns=['team'])

#### college vs. attended_college

##### Verwendung des College-Feature

Die Verwendung von diesem Feature resultiert in einem negativen _Linear Regression Score_. Das Feature kann somit ignoriert werden.

In [3752]:
df.drop(columns=['college'], inplace=True)

In [3753]:
# df = pd.get_dummies(df, columns=['college'])
# df.drop(columns=['attended_college'], inplace=True)

##### Verwendung des Attended-College-Feature

Die Verwendung von diesem Feature resultiert in einer 1-2% Steigerung des _Linear Regression Score_. Das Feature kann somit verwendet werden.

In [3754]:
# df.drop(columns=['attended_college'], inplace=True)

In [3755]:
df = pd.get_dummies(df, columns=['attended_college'])
# df.drop(columns=['college'], inplace=True)

#### birth_country vs. born_in_usa

##### Verwendung des Birth-Country Feature

Die Verwendung von diesem Feature resultiert in einem negativen _Linear Regression Score_. Das Feature kann somit ignoriert werden.

In [3756]:
df.drop(columns=['birth_country'], inplace=True)

In [3757]:
# df = pd.get_dummies(df, columns=['birth_country'])
# df.drop(columns=['born_in_usa'], inplace=True)

##### Verwendung des Born-In-Usa-Feature

Die Verwendung von diesem Feature führt zu einer minimalen Verringerung (~0.02%) des _Linear Regression Score_. Es liefert somit jedoch keinen Mehrwert und kann somit ignoriert werden.

In [3758]:
df.drop(columns=['born_in_usa'], inplace=True)

In [3759]:
# df = pd.get_dummies(df, columns=['born_in_usa'])
# df.drop(columns=['birth_country'], inplace=True)

### Ergebnis

In [3760]:
df.head()

Unnamed: 0,salary,season_end,season_start,height_in_inches,weight,age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,attended_high_school_False,attended_high_school_True,drafted_player_False,drafted_player_True,shoots_Left,shoots_Left Right,shoots_Right,position_C,position_PF,position_PG,position_SF,position_SG,team_ATL,team_BOS,team_BRK,team_CHA,team_CHH,team_CHI,team_CHO,team_CLE,team_DAL,team_DEN,team_DET,team_GSW,team_HOU,team_IND,team_KCK,team_LAC,team_LAL,team_MEM,team_MIA,team_MIL,team_MIN,team_NJN,team_NOH,team_NOK,team_NOP,team_NYK,team_OKC,team_ORL,team_PHI,team_PHO,team_POR,team_SAC,team_SAS,team_SEA,team_TOR,team_UTA,team_VAN,team_WAS,team_WSB,attended_college_False,attended_college_True
0,395000,1991,1990,82,240,22.0,43.0,0.0,290.0,13.1,0.499,0.0,0.379,10.4,23.4,17.0,5.8,0.7,2.5,14.0,22.1,0.0,0.5,0.5,0.079,-4.2,-0.7,-5.0,-0.2,55.0,116.0,0.474,0.0,0.0,0.263368,55.0,116.0,0.474,0.474,25.0,44.0,0.568,27.0,62.0,89.0,12.0,4.0,12.0,22.0,39.0,135.0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,494000,1992,1991,82,240,23.0,71.0,1.0,934.0,13.5,0.533,0.0,0.28,9.5,20.9,15.2,4.7,1.3,1.1,14.0,20.6,0.6,1.5,2.1,0.11,-3.0,-0.9,-3.9,-0.5,178.0,361.0,0.493,0.0,0.0,0.263368,178.0,361.0,0.493,0.493,76.0,101.0,0.752,81.0,179.0,260.0,30.0,25.0,16.0,66.0,132.0,432.0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,500000,1993,1992,82,240,24.0,63.0,52.0,1152.0,13.4,0.557,0.0,0.24,11.3,18.1,14.8,2.2,0.8,1.2,15.4,20.5,0.7,1.2,1.9,0.079,-2.1,-2.1,-4.1,-0.6,219.0,417.0,0.525,0.0,0.0,0.263368,219.0,417.0,0.525,0.525,76.0,100.0,0.76,114.0,186.0,300.0,17.0,19.0,22.0,84.0,165.0,514.0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,805000,1994,1993,82,240,25.0,13.0,0.0,159.0,9.2,0.485,0.0,0.455,8.5,24.2,16.3,2.7,0.6,1.2,20.5,22.6,-0.2,0.1,-0.1,-0.032,-7.1,-3.1,-10.2,-0.3,24.0,55.0,0.436,0.0,0.0,0.263368,24.0,55.0,0.436,0.436,16.0,25.0,0.64,12.0,34.0,46.0,3.0,2.0,3.0,17.0,20.0,64.0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,650000,1995,1994,82,240,26.0,3.0,0.0,30.0,-12.5,0.091,0.0,0.0,11.6,19.6,15.5,0.0,0.0,0.0,31.3,23.8,-0.3,0.0,-0.3,-0.466,-21.1,-6.9,-28.0,-0.2,1.0,11.0,0.091,0.0,0.0,0.263368,1.0,11.0,0.091,0.091,0.0,0.0,0.724607,3.0,5.0,8.0,0.0,0.0,0.0,5.0,2.0,2.0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [3761]:
df.tail()

Unnamed: 0,salary,season_end,season_start,height_in_inches,weight,age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,attended_high_school_False,attended_high_school_True,drafted_player_False,drafted_player_True,shoots_Left,shoots_Left Right,shoots_Right,position_C,position_PF,position_PG,position_SF,position_SG,team_ATL,team_BOS,team_BRK,team_CHA,team_CHH,team_CHI,team_CHO,team_CLE,team_DAL,team_DEN,team_DET,team_GSW,team_HOU,team_IND,team_KCK,team_LAC,team_LAL,team_MEM,team_MIA,team_MIL,team_MIN,team_NJN,team_NOH,team_NOK,team_NOP,team_NYK,team_OKC,team_ORL,team_PHI,team_PHO,team_POR,team_SAC,team_SAS,team_SEA,team_TOR,team_UTA,team_VAN,team_WAS,team_WSB,attended_college_False,attended_college_True
12228,694000,1997,1996,84,250,23.0,16.0,0.0,88.0,20.3,0.591,0.0,0.758,13.1,16.6,14.9,10.3,0.6,0.0,8.3,24.4,0.3,0.0,0.4,0.2,1.6,-3.9,-2.3,0.0,16.0,33.0,0.485,0.0,0.0,0.263368,16.0,33.0,0.485,0.485,20.0,25.0,0.8,10.0,13.0,23.0,5.0,1.0,0.0,4.0,17.0,52.0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12229,784200,1998,1997,84,250,24.0,6.0,0.0,22.0,2.3,0.349,0.143,0.286,0.0,21.3,10.9,7.1,0.0,0.0,6.0,35.3,-0.1,0.0,-0.1,-0.12,-11.6,-7.3,-18.9,-0.1,3.0,14.0,0.214,1.0,2.0,0.5,2.0,12.0,0.167,0.25,4.0,4.0,1.0,0.0,4.0,4.0,1.0,0.0,0.0,1.0,5.0,11.0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
12230,950000,2017,2016,84,240,20.0,19.0,0.0,108.0,7.3,0.346,0.0,0.161,10.8,24.9,17.6,5.3,0.9,3.7,8.3,14.8,-0.1,0.1,0.0,-0.005,-7.8,0.4,-7.3,-0.1,10.0,31.0,0.323,0.0,0.0,0.263368,10.0,31.0,0.323,0.323,3.0,5.0,0.6,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
12231,750000,2017,2016,80,215,22.0,44.0,18.0,843.0,6.9,0.503,0.448,0.181,1.9,14.2,8.0,6.1,0.9,1.5,14.4,14.4,-0.3,0.8,0.5,0.03,-3.6,-0.1,-3.7,-0.4,88.0,221.0,0.398,33.0,99.0,0.333,55.0,122.0,0.451,0.473,31.0,40.0,0.775,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
12232,1034956,2017,2016,85,240,19.0,38.0,11.0,609.0,17.0,0.547,0.013,0.206,7.1,21.9,14.3,8.1,1.1,4.4,10.4,20.3,0.6,0.5,1.1,0.086,-2.7,0.3,-2.5,-0.1,126.0,238.0,0.529,0.0,3.0,0.0,126.0,235.0,0.536,0.529,32.0,49.0,0.653,41.0,118.0,159.0,30.0,14.0,33.0,30.0,66.0,284.0,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [3762]:
df.sample(7)

Unnamed: 0,salary,season_end,season_start,height_in_inches,weight,age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,attended_high_school_False,attended_high_school_True,drafted_player_False,drafted_player_True,shoots_Left,shoots_Left Right,shoots_Right,position_C,position_PF,position_PG,position_SF,position_SG,team_ATL,team_BOS,team_BRK,team_CHA,team_CHH,team_CHI,team_CHO,team_CLE,team_DAL,team_DEN,team_DET,team_GSW,team_HOU,team_IND,team_KCK,team_LAC,team_LAL,team_MEM,team_MIA,team_MIL,team_MIN,team_NJN,team_NOH,team_NOK,team_NOP,team_NYK,team_OKC,team_ORL,team_PHI,team_PHO,team_POR,team_SAC,team_SAS,team_SEA,team_TOR,team_UTA,team_VAN,team_WAS,team_WSB,attended_college_False,attended_college_True
6246,893400,2003,2002,81,220,21.0,80.0,11.0,2213.0,21.1,0.598,0.178,0.576,8.4,14.4,11.5,10.6,2.9,6.1,14.5,19.6,5.3,3.8,9.2,0.199,2.6,4.0,6.6,4.8,315.0,642.0,0.491,37.0,114.0,0.325,278.0,528.0,0.527,0.519,296.0,370.0,0.8,147.0,273.0,420.0,138.0,118.0,175.0,136.0,185.0,963.0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1213,2350820,2014,2013,80,228,26.0,72.0,45.0,1553.0,15.0,0.563,0.005,0.189,10.9,17.5,14.1,6.3,1.4,2.4,11.7,14.4,2.3,2.0,4.3,0.132,-1.1,1.3,0.2,0.8,222.0,403.0,0.551,0.0,2.0,0.0,222.0,401.0,0.554,0.551,47.0,76.0,0.618,149.0,230.0,379.0,64.0,41.0,45.0,58.0,131.0,491.0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1445,1118520,2010,2009,72,161,25.0,82.0,82.0,2919.0,16.0,0.549,0.394,0.224,2.0,6.6,4.2,25.6,1.2,0.3,13.7,25.7,3.9,1.6,5.5,0.091,2.7,-2.5,0.2,1.6,575.0,1331.0,0.432,209.0,525.0,0.398,366.0,806.0,0.454,0.511,245.0,298.0,0.822,54.0,161.0,215.0,434.0,69.0,14.0,232.0,199.0,1604.0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4290,2429000,1994,1993,80,195,28.0,77.0,73.0,2112.0,12.3,0.48,0.009,0.169,5.4,13.0,9.0,7.4,1.6,1.5,6.3,17.1,1.7,2.2,3.9,0.089,-1.6,0.5,-1.1,0.5,356.0,774.0,0.46,2.0,7.0,0.286,354.0,767.0,0.462,0.461,84.0,131.0,0.641,109.0,242.0,351.0,107.0,70.0,49.0,56.0,179.0,798.0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
11119,614000,1997,1996,81,240,23.0,35.0,6.0,298.0,10.3,0.475,0.0,0.417,13.4,24.2,18.7,3.8,1.5,3.9,25.4,17.7,-0.4,0.5,0.1,0.019,-6.8,0.8,-6.0,-0.3,31.0,72.0,0.431,0.0,0.0,0.263368,31.0,72.0,0.431,0.431,19.0,30.0,0.633,35.0,60.0,95.0,7.0,8.0,15.0,29.0,43.0,81.0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1300,2000000,1991,1990,85,235,29.0,62.0,51.0,1916.0,14.1,0.486,0.03,0.32,9.3,17.3,13.1,11.4,1.1,2.8,14.6,20.3,0.4,2.2,2.6,0.066,-1.8,1.2,-0.6,0.7,314.0,723.0,0.434,4.0,22.0,0.182,310.0,701.0,0.442,0.437,169.0,231.0,0.732,176.0,304.0,480.0,147.0,43.0,90.0,141.0,175.0,801.0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3543,135000,1992,1991,83,235,29.0,34.0,0.0,175.0,11.9,0.48,0.0,0.247,12.3,25.1,18.4,5.0,0.6,1.0,18.2,23.9,-0.1,0.2,0.1,0.019,-5.8,-3.5,-9.3,-0.3,33.0,77.0,0.429,0.0,0.0,0.263368,33.0,77.0,0.429,0.429,16.0,19.0,0.842,21.0,40.0,61.0,6.0,2.0,3.0,19.0,22.0,82.0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [3763]:
df.describe()

Unnamed: 0,salary,season_end,season_start,height_in_inches,weight,age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,attended_high_school_False,attended_high_school_True,drafted_player_False,drafted_player_True,shoots_Left,shoots_Left Right,shoots_Right,position_C,position_PF,position_PG,position_SF,position_SG,team_ATL,team_BOS,team_BRK,team_CHA,team_CHH,team_CHI,team_CHO,team_CLE,team_DAL,team_DEN,team_DET,team_GSW,team_HOU,team_IND,team_KCK,team_LAC,team_LAL,team_MEM,team_MIA,team_MIL,team_MIN,team_NJN,team_NOH,team_NOK,team_NOP,team_NYK,team_OKC,team_ORL,team_PHI,team_PHO,team_POR,team_SAC,team_SAS,team_SEA,team_TOR,team_UTA,team_VAN,team_WAS,team_WSB,attended_college_False,attended_college_True
count,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0,12233.0
mean,3031394.0,2003.357884,2002.357884,79.03041,217.824001,26.8352,52.998856,25.674569,1250.241968,12.748005,0.509993,0.181456,0.313901,6.075307,13.993156,10.035119,13.009992,1.633827,1.513083,14.484821,18.76139,1.335151,1.264236,2.599722,0.073276,-1.519186,-0.421025,-1.93999,0.599681,193.312679,422.900924,0.440695,27.813946,78.608273,0.263368,165.498733,344.292651,0.46129,0.471028,96.447642,127.966157,0.724607,62.749203,155.114363,217.863566,115.144609,40.730565,25.356004,74.43399,114.18213,510.886945,0.934767,0.065233,0.866754,0.133246,0.083871,0.00049,0.915638,0.204283,0.216382,0.196599,0.189978,0.192757,0.035233,0.033434,0.006948,0.012752,0.014224,0.033271,0.003842,0.037767,0.036786,0.033843,0.033107,0.035314,0.031064,0.034906,0.000981,0.037767,0.03229,0.021009,0.033761,0.035233,0.032044,0.03041,0.011117,0.002616,0.006458,0.034333,0.011526,0.031145,0.034006,0.03556,0.033843,0.033516,0.035968,0.023052,0.026976,0.031717,0.006621,0.024115,0.011444,0.11191,0.88809
std,3922321.0,8.841971,8.841971,3.679179,27.400892,4.091276,25.389561,29.195356,909.889058,5.610502,0.086114,0.193867,0.2215,4.535169,6.297344,4.851464,9.445402,0.935218,1.647286,6.141772,5.212353,1.992772,1.195955,2.882877,0.088694,3.483871,2.085438,4.245327,1.314281,172.3409,367.296754,0.089079,42.852526,113.235908,0.155871,155.173812,313.580979,0.092323,0.090687,102.786317,130.38683,0.138268,64.354985,139.585459,197.581579,136.371164,36.603116,34.689186,63.50923,77.476732,461.19819,0.246947,0.246947,0.339854,0.339854,0.277206,0.022142,0.277941,0.403194,0.411794,0.397443,0.392299,0.39448,0.184375,0.179775,0.08307,0.112209,0.118417,0.17935,0.061868,0.190639,0.188243,0.180832,0.178924,0.184581,0.173496,0.183548,0.031306,0.190639,0.176776,0.143419,0.180621,0.184375,0.176125,0.171718,0.104856,0.051081,0.080105,0.182091,0.106744,0.173717,0.181253,0.185197,0.180832,0.179987,0.186219,0.150076,0.16202,0.175254,0.081106,0.153413,0.106369,0.315269,0.315269
min,2853.0,1985.0,1984.0,63.0,133.0,18.0,1.0,0.0,0.0,-90.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.3,-0.6,-2.1,-2.519,-73.8,-19.5,-86.7,-2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,612435.0,1996.0,1995.0,76.0,195.0,24.0,31.0,1.0,423.0,10.1,0.48,0.007,0.197,2.5,9.3,6.2,6.3,1.1,0.4,11.1,15.3,0.0,0.3,0.3,0.04,-3.1,-1.6,-3.7,-0.2,49.0,116.0,0.405,0.0,2.0,0.2,39.0,88.0,0.428,0.44,20.0,29.0,0.667,15.0,45.0,63.0,21.0,11.0,4.0,22.0,46.0,129.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1450000.0,2004.0,2003.0,80.0,220.0,26.0,60.0,11.0,1137.0,12.9,0.52,0.112,0.284,5.1,13.1,9.2,10.2,1.5,1.0,13.8,18.5,0.6,0.9,1.7,0.082,-1.3,-0.4,-1.5,0.1,148.0,330.0,0.443,5.0,21.0,0.273,120.0,257.0,0.468,0.479,63.0,87.0,0.746,41.0,122.0,168.0,68.0,32.0,13.0,59.0,110.0,390.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,3765000.0,2011.0,2010.0,82.0,237.0,30.0,76.0,51.0,1991.0,15.7,0.553,0.322,0.387,9.0,18.1,13.5,17.6,2.0,2.1,16.9,22.0,2.1,1.9,4.0,0.119,0.5,0.8,0.5,1.0,296.0,646.0,0.484,41.0,119.0,0.359,249.0,515.0,0.503,0.513,138.0,187.0,0.812,91.0,224.0,313.0,156.0,61.0,32.0,111.0,173.0,782.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,30963450.0,2017.0,2016.0,91.0,360.0,42.0,82.0,82.0,3533.0,88.3,1.136,1.0,6.0,100.0,100.0,100.0,78.5,17.3,26.3,100.0,88.3,14.8,9.1,20.3,1.084,31.9,17.1,26.6,12.4,978.0,2173.0,1.0,402.0,886.0,1.0,802.0,1685.0,1.0,1.5,756.0,916.0,1.0,443.0,894.0,1258.0,991.0,301.0,456.0,464.0,371.0,2832.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3764]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12233 entries, 0 to 12232
Columns: 104 entries, salary to attended_college_True
dtypes: float64(46), int64(5), uint8(53)
memory usage: 5.5 MB


In [3765]:
df.columns

Index(['salary', 'season_end', 'season_start', 'height_in_inches', 'weight',
       'age', 'G', 'GS', 'MP', 'PER',
       ...
       'team_SAC', 'team_SAS', 'team_SEA', 'team_TOR', 'team_UTA', 'team_VAN',
       'team_WAS', 'team_WSB', 'attended_college_False',
       'attended_college_True'],
      dtype='object', length=104)

In [3766]:
correlations_df = pd.DataFrame(abs(df.corr().salary).sort_values(ascending = False))
correlations_df.head(10)

Unnamed: 0,salary
salary,1.0
WS,0.426963
VORP,0.415971
PTS,0.414596
DRB,0.408879
FGA,0.400793
FG,0.400359
OWS,0.395474
FT,0.38962
FTA,0.385824


In [3767]:
top_10_correlations = [
    "WS",
    "VORP",
    "PTS",
    "DRB",
    "FGA", 
    "FG",
    "OWS", 
    "FT",
    "FTA"
]

In [3768]:
correlations_df.head(20)

Unnamed: 0,salary
salary,1.0
WS,0.426963
VORP,0.415971
PTS,0.414596
DRB,0.408879
FGA,0.400793
FG,0.400359
OWS,0.395474
FT,0.38962
FTA,0.385824


# === === === === === === === === === ===
# Train / Test Split

In [3769]:
split_df = df

## Trainingsdaten

In [3770]:
X = split_df.drop(columns = ["salary"])

### Top Ten Correlation Features

Die zuvor berechneten top 10 Correlation Features zu nehmen würde in einem schlechteren _Linear Regression Score_ resultieren.

In [3771]:
# X = X[top_10_correlations]

### Top 30 Linear Regression Score Features

Die später berechneten top 30 Linear Regression Features zu nehmen würde in einem schlechteren _Linear Regression Score_ resultieren.

In [3772]:
"""
top_30 = [
    'TRB', 'PTS', 'DRB', 'drafted_player_False', 'drafted_player_True',
    'FG', 'FGA', '2PA', 'attended_college_False',
    'attended_college_True', 'ORB', 'FT', '3PA',
    'attended_high_school_False', 'attended_high_school_True', '3P',
    '2P', 'position_PF', 'shoots_Right', 'shoots_Left', 'position_C',
    'position_PG', 'position_SF', 'position_SG', 'team_LAC',
    'team_CLE', 'team_PHO', 'team_IND', 'team_GSW', 'team_NYK'
]
X = X[top_30]
"""

"\ntop_30 = [\n    'TRB', 'PTS', 'DRB', 'drafted_player_False', 'drafted_player_True',\n    'FG', 'FGA', '2PA', 'attended_college_False',\n    'attended_college_True', 'ORB', 'FT', '3PA',\n    'attended_high_school_False', 'attended_high_school_True', '3P',\n    '2P', 'position_PF', 'shoots_Right', 'shoots_Left', 'position_C',\n    'position_PG', 'position_SF', 'position_SG', 'team_LAC',\n    'team_CLE', 'team_PHO', 'team_IND', 'team_GSW', 'team_NYK'\n]\nX = X[top_30]\n"

### Ausschließlich Saisonstatistiken 

Die ausschließliche Verwendung der Saisonstatistiken würde in einem schlechteren _Linear Regression Score_ resultieren.

In [3773]:
statistics = [
    "3P%",
    "FT%",                   
    "2P%",                      
    "FTr",                       
    "FG%",                     
    "eFG%",                  
    "3PAr",                   
    "TS%",                    
    "TOV%",                  
    "ORB%",                      
    "TRB%",                      
    "DRB%",                      
    "WS/48",                     
    "BLK%",                       
    "PER",                        
    "STL%",                     
    "USG%",                      
    "AST%",
    "MP",                         
    "GS",                         
    "G",                                                           
    "PTS",                        
    "PF",                         
    "OWS",                        
    "TOV",                        
    "BLK",                        
    "STL",                        
    "AST",                        
    "TRB",                        
    "DRB",                        
    "ORB",                        
    "FTA",                        
    "FT",                         
    "2PA",                        
    "2P",                         
    "3PA",                        
    "3P",                         
    "FGA",                        
    "FG",                         
    "VORP",                       
    "BPM",                        
    "DBPM",                       
    "OBPM",                       
    "WS",                         
    "DWS"
]

In [3774]:
# X = X[statistics]

### Verwerfen von Features

Das Verwerfen der folgenden Features führt zu leichten Verbesserungen des _Linear Regression Score_.

In [3775]:
X.drop(columns = ["season_start"], inplace=True)

In [3776]:
X.drop(columns = ["3P%"], inplace=True)

## Testdaten

In [3777]:
y = split_df["salary"]

In [3778]:
# X = ((X-X.min())/(X.max()-X.min()))

In [3779]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3780]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# === === === === === === === === === ===
# Linear Regression

In [3781]:
# With StandardScaler
X_train_linear = X_train_std
X_test_linear = X_test_std

# Without StandardScaler
# X_train_linear = X_train
# X_test_linear = X_test

y_train_linear = y_train
y_test_linear = y_test

In [3782]:
model = LinearRegression().fit(X_train_linear, y_train_linear)
y_pred = model.predict(X_test_linear)

In [3783]:
# print the coefficients
print(model.intercept_)
print(model.coef_)

3049754.12766144
[ 1.38585438e+06  1.60205532e+05  2.85769512e+05  1.03535252e+06
 -2.29149879e+05  5.51319043e+05 -9.32846701e+05  1.00224174e+05
  4.46686035e+04 -5.81012187e+05 -2.00744312e+04 -7.20844196e+05
 -3.93138278e+05  7.93564831e+05  1.68353727e+05 -9.51419778e+04
 -8.47836047e+04  1.58460828e+05  4.85721963e+05  2.27386398e+05
  5.37086960e+05  6.32677807e+04 -6.88064073e+05  7.64761538e+05
  2.85196789e+03  5.64549805e+05 -2.51275782e+05 -3.33687253e+17
  1.51325563e+17  2.75622649e+05 -1.09666020e+17 -4.66677761e+16
 -1.65543422e+17 -1.29485664e+17 -6.79760000e+04 -4.61764000e+05
 -1.54368626e+17  9.18311500e+05 -3.29140000e+04  2.52400532e+17
  5.48563767e+17 -7.75702900e+17  4.84533875e+05 -4.70312000e+05
  2.42412000e+05 -4.63136000e+05 -5.62048000e+05  6.91508919e+17
 -5.42138762e+17 -5.42138762e+17  5.20742658e+17  5.20742658e+17
  3.81251313e+17  3.38805492e+16  3.82500694e+17 -3.45880874e+17
 -3.54131682e+17 -3.41534369e+17 -3.39589513e+17 -3.39098631e+17
  2.6287

In [3784]:
pd.DataFrame(abs(df.corr().salary).sort_values(ascending = False))

Unnamed: 0,salary
salary,1.000000
WS,0.426963
VORP,0.415971
PTS,0.414596
DRB,0.408879
...,...
team_MIN,0.001989
team_NOK,0.001679
team_DET,0.000843
team_CLE,0.000121


In [3785]:
zipped = list(zip(X_train.columns, model.coef_))
data = pd.DataFrame(zipped, columns=['feature', 'coef'])
data = data.reindex(data.coef.abs().sort_values(ascending = False).index)

In [3786]:
data.head(40)

Unnamed: 0,feature,coef
41,TRB,-7.757029e+17
47,PTS,6.915089e+17
40,DRB,5.485638e+17
49,attended_high_school_True,-5.421388e+17
48,attended_high_school_False,-5.421388e+17
50,drafted_player_False,5.207427e+17
51,drafted_player_True,5.207427e+17
54,shoots_Right,3.825007e+17
52,shoots_Left,3.812513e+17
56,position_PF,-3.541317e+17


In [3787]:
data.tail(40)

Unnamed: 0,feature,coef
66,team_CHO,8667943000000000.0
83,team_NOK,7552912000000000.0
74,team_KCK,5344308000000000.0
0,season_end,1385854.0
3,age,1035353.0
6,MP,-932846.7
37,FTA,918311.5
13,TRB%,793564.8
23,OBPM,764761.5
11,ORB%,-720844.2


In [3788]:
print('coefficient of determination: {}'.format(model.score(X_test_linear, y_test_linear)))

coefficient of determination: 0.5143331429780994


In [3789]:
linear_score = round(explained_variance_score(np.array(y_test_linear), y_pred) * 100, 2)
print('linear regression score: {}%'.format(linear_score))

linear regression score: 51.46%


In [3790]:
print(mean_squared_error(np.array(y_test_linear), y_pred))

7159946649606.725


# === === === === === === === === === ===
# KNN

In [3791]:
X_train_knn = X_train_std
X_test_knn = X_test_std

y_train_knn = y_train
y_test_knn = y_test

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [3792]:
first = [50, 75, 100, 125, 150, 175, 200]
second = [100, 112, 125, 127, 150]
third = [100, 106, 112, 119, 125]
fourth = [111, 112, 113]
fifth = [112, 113, 114, 115]
sixth = [113]

param_grid = {
    'n_neighbors': sixth,
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}

In [3793]:
classifier_grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    verbose = 1,
    cv = 3,
    n_jobs = -1
)

In [3794]:
classifier_grid = classifier_grid.fit(X_train_knn, y_train_knn)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:   24.8s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   25.3s finished


In [3795]:
classifier_grid.best_params_

{'metric': 'manhattan', 'n_neighbors': 113, 'weights': 'distance'}

In [3796]:
classifier_grid.best_score_

0.024646168862859932

In [3797]:
round(classifier_grid.best_score_ * 100, 2)

2.46

In [3798]:
classifier_grid

GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [113],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [3799]:
pd.DataFrame(classifier_grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.36101,0.037275,8.4794,0.842835,euclidean,113,uniform,"{'metric': 'euclidean', 'n_neighbors': 113, 'w...",0.010615,0.015739,0.013177,0.013177,0.002092,4
1,0.355424,0.042956,15.839344,0.559612,euclidean,113,distance,"{'metric': 'euclidean', 'n_neighbors': 113, 'w...",0.02123,0.02489,0.021962,0.022694,0.001581,2
2,0.307032,0.053458,7.99405,0.80629,manhattan,113,uniform,"{'metric': 'manhattan', 'n_neighbors': 113, 'w...",0.013177,0.017204,0.018668,0.016349,0.002321,3
3,0.323128,0.110557,12.658779,0.627495,manhattan,113,distance,"{'metric': 'manhattan', 'n_neighbors': 113, 'w...",0.02306,0.025256,0.025622,0.024646,0.001131,1


https://towardsdatascience.com/the-basics-knn-for-classification-and-regression-c1e8a6c955

--> Letzter Absatz

# === === === === === === === === === ===
# Neural Network

In [3804]:
X_train_nn = X_train_std
X_test_nn = X_test_std

y_train_nn = y_train
y_test_nn = y_test

In [3808]:
first = [
    (1,),  (8,), (16,), (32,), (64,), (128,), (256,), (512,),
    (1, 1),  (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (256, 256), (512, 512),
    (1, 1, 1),  (8, 8, 8), (16, 16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128), (256, 256, 256), (512, 512, 512),
]

In [3805]:
parameter_space = {
    'hidden_layer_sizes': [
        (1,),  (8,), (16,), (32,), (64,), (128,), (256,), (512,),
        (1, 1),  (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (256, 256), (512, 512),
        (1, 1, 1),  (8, 8, 8), (16, 16, 16), (32, 32, 32), (64, 64, 64), (128, 128, 128), (256, 256, 256), (512, 512, 512),
    ],
    'activation': ['tanh', 'relu', 'logistic', 'identity'],
}

In [3806]:
mlp = MLPClassifier(max_iter=200)
clf = GridSearchCV(mlp, parameter_space, cv=3, scoring='accuracy')
clf.fit(X_train_nn, y_train_nn)



GridSearchCV(cv=3, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state...
             param_grid={'activation': ['tanh', 'relu', 'logistic', 'identity'],
                         'hidden_layer_sizes': [(1,), (8,), (16,), (32,), (64,),
                                                (128,), (256,), (512,), (1, 1),
                                        

In [3807]:
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'logistic', 'hidden_layer_sizes': (32, 32)}


In [None]:
"""

Best parameters found:
 {'activation': 'logistic', 'hidden_layer_sizes': (32, 32)}
 
"""

In [3809]:
clf.best_score_

0.03038067349926794

In [3813]:
predictions = clf.predict(X_test_nn)
predictions

array([ 473604,  845059, 1000000, ..., 1206600, 1000000, 2020200])

In [3816]:
print(confusion_matrix(y_test_nn, predictions))
print(classification_report(y_test_nn, predictions))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]]
              precision    recall  f1-score   support

        2853       0.00      0.00      0.00         1
        6140       0.00      0.00      0.00         1
        8819       0.00      0.00      0.00         1
        8950       0.00      0.00      0.00         1
       15800       0.00      0.00      0.00         1
       15982       0.00      0.00      0.00         1
       17546       0.00      0.00      0.00         1
       20000       0.00      0.00      0.00         1
       20133       0.00      0.00      0.00         1
       23852       0.00      0.00      0.00         1
       24300       0.00      0.00      0.00         1
       25000       0.00      0.00      0.00         3
       26007       0.00      0.00      0.00         1
       26316       0.00      0.00      0.00         1
       28191       0.00      0.00      0.00         1
       29843   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
