In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
from plotly.graph_objs import *
import plotly.tools as tls

# Plotly API credentials -- create an account at https://plot.ly if you do not have one
tls.set_credentials_file(username='xxxxxxx', api_key='xxxxxxxxxx')

# Read the player statistics workbook
df = pd.read_excel('2017-18_players_statistics.xlsx')

# New column: the player's name followed by the team he plays for ("<PLAYER> - <TEAM>")
df['player_team'] = df['PLAYER'].astype(str) + ' - ' + df['TEAM']

# Quick look at the first rows and at the available columns
print(df.head())
print(list(df.columns))

In [44]:
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

# X: every column except the player/team identifier columns.
# y: the "player - team" label of each row, used to identify the points when plotting.
X = df.drop(columns=['PLAYER', 'TEAM', 'player_team'])
y = df['player_team']

# Scale the data in order to improve the PCA performance
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Fit a two-component PCA and project the scaled data onto it
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [1]:
# Scatter plot of the PCA projection, one trace (legend entry) per player label.
# Adapted from https://plot.ly/ipython-notebooks/principal-component-analysis/

# Take the labels from `y` (the same series the masks below are built from) and
# de-duplicate them in order of appearance: a label present on several rows
# would otherwise produce one identical trace per row.
name_teams = list(y.unique())

traces = []
for name in name_teams:
    # Boolean row mask selecting the projected points that carry this label
    mask = (y == name).values

    trace = Scatter(
        x=Y_sklearn[mask, 0],
        y=Y_sklearn[mask, 1],
        mode='markers',
        name=name,
        # Plain dicts instead of the deprecated Marker/Line wrapper classes
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)

# A plain list of traces replaces the deprecated Data wrapper
data = traces
layout = Layout(xaxis=dict(title='PC1', showline=False),
                yaxis=dict(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)

In [1]:
#plotting the scale for the space interpretation
%matplotlib inline
import matplotlib.pyplot as plt

# #### Plotting the magnitude of each feature value for the first two principal components
# Grey-scale image of sklearn_pca.components_: one row per principal component,
# one column per feature of X.

# Keep the column names in a local list; assigning them as an attribute on the
# DataFrame (X.feature_names = ...) triggers a pandas UserWarning.
feature_names = list(X)

fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(sklearn_pca.components_, interpolation='none', cmap='gray')

# One tick per feature, placed on the left edge of its column (-0.5, 0.5, ...).
# The number of tick positions must equal the number of labels, otherwise
# recent matplotlib versions raise a ValueError in set_xticklabels.
ax.set_xticks(np.arange(len(feature_names)) - 0.5)
ax.set_yticks(np.arange(0.5, 2))
ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
ax.set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)

# Horizontal colour scale marked at the minimum, zero and the maximum
fig.colorbar(image, ax=ax, orientation='horizontal',
             ticks=[sklearn_pca.components_.min(), 0,
                    sklearn_pca.components_.max()],
             pad=0.65);

In [49]:
#The PCA has revealed interesting findings to be investigated further:
    #James Harden is not surrounded by other players -> perhaps a reason why he won the MVP award this year?
    #So far, he seems to be the most unique player this season. Let's investigate whether this is due to positive or negative statistics:
    # subset only the few players who are closest to his statistics and run PCA again in order to take a closer look at the results of the two components

In [1]:
# Subset of relevant players, identified by their "player - team" label
players_close_to_harden = [
    'Kevin Durant - GSW',
    'James Harden - HOU',
    'Russell Westbrook - OKC',
    'Stephen Curry - GSW',
    'LeBron James - CLE',
    'Damian Lillard - POR',
]

# Keep only the rows of the selected players and renumber them from 0
is_selected = df['player_team'].isin(players_close_to_harden)
df3 = df[is_selected].reset_index(drop=True)
df3

In [23]:
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

# X: every column of the subset except the identifier columns and AGE.
# y: the "player - team" label of each row, used to identify the points when plotting.
X = df3.drop(columns=['PLAYER', 'TEAM', 'player_team', 'AGE'])
y = df3['player_team']

# Scale the data in order to improve the PCA performance
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Fit a two-component PCA and project the scaled data onto it
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [1]:
# Scatter plot of the PCA projection of the subset, one trace (legend entry) per player label.
# Adapted from https://plot.ly/ipython-notebooks/principal-component-analysis/

# Take the labels from `y` (the same series the masks below are built from) and
# de-duplicate them in order of appearance: a label present on several rows
# would otherwise produce one identical trace per row.
name_teams = list(y.unique())

traces = []
for name in name_teams:
    # Boolean row mask selecting the projected points that carry this label
    mask = (y == name).values

    trace = Scatter(
        x=Y_sklearn[mask, 0],
        y=Y_sklearn[mask, 1],
        mode='markers',
        name=name,
        # Plain dicts instead of the deprecated Marker/Line wrapper classes
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)

# A plain list of traces replaces the deprecated Data wrapper
data = traces
layout = Layout(xaxis=dict(title='PC1', showline=False),
                yaxis=dict(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)

In [1]:
#plotting the scale for the space interpretation
%matplotlib inline
import matplotlib.pyplot as plt

# #### Plotting the magnitude of each feature value for the first two principal components
# Grey-scale image of sklearn_pca.components_: one row per principal component,
# one column per feature of X.

# Keep the column names in a local list; assigning them as an attribute on the
# DataFrame (X.feature_names = ...) triggers a pandas UserWarning.
feature_names = list(X)

fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(sklearn_pca.components_, interpolation='none', cmap='gray')

# One tick per feature, placed on the left edge of its column (-0.5, 0.5, ...).
# The number of tick positions must equal the number of labels, otherwise
# recent matplotlib versions raise a ValueError in set_xticklabels.
ax.set_xticks(np.arange(len(feature_names)) - 0.5)
ax.set_yticks(np.arange(0.5, 2))
ax.set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12)
ax.set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12)

# Horizontal colour scale marked at the minimum, zero and the maximum
fig.colorbar(image, ax=ax, orientation='horizontal',
             ticks=[sklearn_pca.components_.min(), 0,
                    sklearn_pca.components_.max()],
             pad=0.65);