In [None]:
# importing modules
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import iqr
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

try:
    from sklearn import cross_validation
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
    # exposes the same train_test_split, so alias it to keep old call sites working.
    from sklearn import model_selection as cross_validation

In [None]:
# Getting data: skater summary rows (gameTypeId=2, seasonId 20172018) from the
# NHL stats REST endpoint, plus player salaries from a local CSV file
skaters_url = 'http://www.nhl.com/stats/rest/skaters?isAggregate=false&reportType=basic&isGame=false&reportName=skatersummary&sort=[{%22property%22:%22points%22,%22direction%22:%22DESC%22},{%22property%22:%22goals%22,%22direction%22:%22DESC%22},{%22property%22:%22assists%22,%22direction%22:%22DESC%22}]&cayenneExp=gameTypeId=2%20and%20seasonId%3E=20172018%20and%20seasonId%3C=20172018'

# requests has no default timeout; without one a stalled server would hang the
# notebook indefinitely
response = requests.get(skaters_url, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON
response.raise_for_status()
playerData = pd.DataFrame(response.json()['data'])
salariesData = pd.read_csv('Salaries.csv')

In [None]:
# Listing the feature (column) names available in the player data
playerData.columns.tolist()

In [None]:
# Keeping only rows whose position code is neither 'D' (defenceman) nor 'G' (goalie)
excluded_positions = ['D', 'G']
playerData = playerData[~playerData['playerPositionCode'].isin(excluded_positions)]

In [None]:
# Features judged not to affect salary; removed in one go, then the remaining
# column names are listed to confirm the drop
unused_features = [
    'playerBirthCity', 'playerBirthCountry', 'playerBirthStateProvince',
    'playerInHockeyHof', 'playerFirstName', 'playerLastName', 'playerNationality', 'playerDraftYear',
    'playerDraftRoundNo', 'playerTeamsPlayedFor', 'seasonId', 'playerPositionCode', 'playerShootsCatches',
    'playerName', 'faceoffWinPctg', 'gameWinningGoals', 'points', 'penaltyMinutes', 'shiftsPerGame',
    'shPoints', 'shGoals', 'playerWeight', 'playerHeight',
]
playerData = playerData.drop(columns=unused_features)
playerData.columns.tolist()


In [None]:
# Keeping only active players (playerIsActive != 0); the flag column is then
# constant for our purposes, so it is dropped
is_active = playerData['playerIsActive'] != 0
playerData = playerData[is_active].drop(columns=['playerIsActive'])

In [None]:
# Using each player's unique NHL player id as the dataframe index
playerData = playerData.set_index(keys='playerId')
playerData.head(5)

In [None]:
# Counting missing values per column
playerData.isnull().sum()

In [None]:
# We can see that 69 forwards went undrafted, so we look up the highest
# overall draft pick number present in the data
maxDraftNum = playerData.playerDraftOverallPickNo.max()
print(maxDraftNum)

In [None]:
# Undrafted players get a pick number one past the highest real pick
undrafted_pick_no = maxDraftNum + 1
draft_picks = playerData['playerDraftOverallPickNo']
playerData['playerDraftOverallPickNo'] = draft_picks.fillna(value=undrafted_pick_no)

In [None]:
# Re-counting missing values per column to confirm the fill worked
playerData.isnull().sum()

In [None]:
# Checking the data type and format of playerBirthDate (dtype, then first row)
birth_dates = playerData['playerBirthDate']
print(birth_dates.dtype)
print(birth_dates.iloc[:1])

In [None]:
# Adding a rough age column: 2017 minus the birth year, where the birth year is
# the first four characters of playerBirthDate (*note player data is from the
# 2017-2018 season). The raw birth date is dropped afterwards.
birth_year = playerData['playerBirthDate'].str.slice(stop=4).astype(int)
playerData['age'] = 2017 - birth_year
playerData = playerData.drop(columns=['playerBirthDate'])
playerData.head(5)

In [None]:
# Keeping only players with more than 10 games played (players with exactly 10
# games are dropped too), then showing the smallest remaining value as a check
played_enough = playerData['gamesPlayed'].gt(10)
playerData = playerData.loc[played_enough]
playerData['gamesPlayed'].min()

In [None]:
# Calculating goals per game and assists per game, then dropping gamesPlayed.
# assign() builds a new frame instead of writing columns into the filtered
# frame produced by the games-played filter, which avoids pandas'
# SettingWithCopyWarning.
playerData = playerData.assign(
    goalsPerGame=playerData['goals'] / playerData['gamesPlayed'],
    assistsPerGame=playerData['assists'] / playerData['gamesPlayed'],
).drop(columns=['gamesPlayed'])

In [None]:
# Indexing the salary dataframe by the NHLid column so it lines up with the
# player dataframe's index
salariesData = salariesData.set_index(keys='NHLid')
salariesData.head(5)

In [None]:
# Counting missing values per column in the salary data
salariesData.isnull().sum()

In [None]:
# Removing rows with missing values, confirming none remain, and printing
# summary statistics for the salary data
# Note that Q3 is at 4,000,000
salariesData = salariesData.dropna()
remaining_missing = salariesData.isnull().sum()
print(remaining_missing)
print(salariesData.describe())

In [None]:
# Calculating the high-outlier fence (Q3 + 1.5 * IQR) for Salary and removing
# salaries at or above it; the fence is printed for reference.
# Q3 and the IQR are computed from the Salary column itself rather than being
# hard-coded / taken over the whole frame, and the result is stored under a new
# name so the imported iqr() function is not overwritten (re-running the cell
# would otherwise fail).
salaries = salariesData['Salary']
q3 = salaries.quantile(0.75)
salary_iqr = q3 - salaries.quantile(0.25)
fence = q3 + 1.5 * salary_iqr
print(fence)
salariesData = salariesData[salaries < fence]


In [None]:
# Column-wise maxima, to confirm the high salary outliers are gone
salariesData.max(axis=0)

In [None]:
# Rounding salaries to the nearest 100,000.
# assign() returns a new frame, so no column is written into the filtered
# frame left by the outlier removal (avoids pandas' SettingWithCopyWarning).
salariesData = salariesData.assign(Salary=salariesData['Salary'].round(decimals=-5))
salariesData.head(20)

In [None]:
# Left-joining the salary data onto the player data by index, then listing
# the combined column names to check the join
data = playerData.join(salariesData, how='left')
data.columns.tolist()

In [None]:
# Spot check: look up one known player by player id and eyeball the salary
known_player_id = 8474625
data.loc[[known_player_id]]

In [None]:
# Number of players that ended up without a salary after the join
data.Salary.isnull().sum()

In [None]:
# Keeping only rows that have a salary, then re-counting missing values per column
has_salary = data['Salary'].notna()
data = data[has_salary]
data.isnull().sum()

In [None]:
# Checking the data type of every column in the joined dataframe
data.dtypes

In [None]:
# Summary statistics for the Salary column
data['Salary'].describe()

In [None]:
# Row for the highest paid player in our dataset
top_paid_id = data['Salary'].idxmax()
data.loc[top_paid_id]

In [None]:
# Histogram of player counts per salary range; 20 bins for a little more detail
salary_bins = 20
plt.hist(data.Salary, bins=salary_bins)

In [None]:
# Salary plotted against points per game
plt.scatter(data['pointsPerGame'], data['Salary'], marker='.')
plt.xlabel('PPG')
plt.ylabel('Salary')

In [None]:
# Salary plotted against plus/minus
plt.scatter(data['plusMinus'], data['Salary'], marker='.')
plt.xlabel('+/-')
plt.ylabel('Salary')

In [None]:
# A player's +/- doesn't seem to have any relationship to salary, so drop it
data = data.drop('plusMinus', axis=1)
data.columns.tolist()

In [None]:
# Salary plotted against time on ice per game
plt.scatter(data['timeOnIcePerGame'], data['Salary'], marker='.')
plt.xlabel('TOI')
plt.ylabel('Salary')

In [None]:
# Salary plotted against assists
plt.scatter(data['assists'], data['Salary'], marker='.')
plt.xlabel('Assists')
plt.ylabel('Salary')

In [None]:
# Salary plotted against goals
plt.scatter(data['goals'], data['Salary'], marker='.')
plt.xlabel('Goals')
plt.ylabel('Salary')

In [None]:
# Salary plotted against goals per game
plt.scatter(data['goalsPerGame'], data['Salary'], marker='.')
plt.xlabel('Goals/Game')
plt.ylabel('Salary')

In [None]:
# Salary plotted against assists per game
plt.scatter(data['assistsPerGame'], data['Salary'], marker='.')
plt.xlabel('Assists/Game')
plt.ylabel('Salary')

In [None]:
# Salary plotted against overtime goals
plt.scatter(data['otGoals'], data['Salary'], marker='.')
plt.xlabel('OT Goals')
plt.ylabel('Salary')

In [None]:
# All players fall into just 5 otGoals categories, so the feature doesn't help much
data = data.drop('otGoals', axis=1)
data.columns.tolist()

In [None]:
# Salary plotted against overall draft pick number
plt.scatter(data['playerDraftOverallPickNo'], data['Salary'], marker='.')
plt.xlabel('Draft Number')
plt.ylabel('Salary')

In [None]:
# Draft number doesn't have much of a relationship with salary, so drop it
data = data.drop('playerDraftOverallPickNo', axis=1)
data.columns.tolist()

In [None]:
# Salary plotted against power-play points
plt.scatter(data['ppPoints'], data['Salary'], marker='.')
plt.xlabel('PP Points')
plt.ylabel('Salary')

In [None]:
# Salary plotted against power-play goals
plt.scatter(data['ppGoals'], data['Salary'], marker='.')
plt.xlabel('PP Goals')
plt.ylabel('Salary')

In [None]:
# Removing ppGoals from the feature set and listing what is left
data = data.drop('ppGoals', axis=1)
data.columns.tolist()

In [None]:
# Salary plotted against age
plt.scatter(data['age'], data['Salary'], marker='.')
plt.xlabel('Age')
plt.ylabel('Salary')

In [None]:
# Salary plotted against shooting percentage
plt.scatter(data['shootingPctg'], data['Salary'], marker='.')
plt.xlabel('Shooting %')
plt.ylabel('Salary')

In [None]:
# Removing shootingPctg from the feature set and listing what is left
data = data.drop('shootingPctg', axis=1)
data.columns.tolist()

In [None]:
# Salary plotted against shots
plt.scatter(data['shots'], data['Salary'], marker='.')
plt.xlabel('Shots')
plt.ylabel('Salary')

In [None]:
# Separating the features (everything except Salary) from the Salary target,
# then standardising the features with preprocessing.scale.
# A bare `data.reset_index(drop=True)` used to sit here; its result was never
# assigned, so it had no effect and has been removed. The existing index on
# `data` is therefore kept, exactly as before.
X = data.drop(columns=['Salary'])
y = data['Salary']
X = preprocessing.scale(X)

In [None]:
# Splitting our data into training and test sets (80% / 20%).
# Uses train_test_split from sklearn.model_selection (imported at the top);
# the old sklearn.cross_validation module was removed in scikit-learn 0.20.
# A fixed random_state makes the split, and every score below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Printing the shape of each split to make sure the data was divided correctly
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Writing the cleaned, joined dataset out as a UTF-8 CSV file
output_csv = 'NHL_Data.csv'
data.to_csv(output_csv, encoding='utf-8')

In [None]:
# Random forest classifier fitted on the training split, with each distinct
# Salary value treated as a class label.
# random_state is fixed so the forest, and therefore the printed score, is the
# same on every run.
rf = RandomForestClassifier(n_estimators=100, max_features=None, random_state=42)

rf.fit(X_train, y_train)

pred = rf.predict(X_test)

# Mean accuracy on the held-out test split
print(rf.score(X_test, y_test))

# First 20 predictions against the first 20 actual test salaries
plt.plot(pred[:20])
plt.plot(y_test.values[:20])
plt.xticks([])
plt.legend(['pred', 'test val'])

In [None]:
# Ordinary least-squares regression on the training split; `pred` holds the
# predictions for the test split and the cell displays the test-set score
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

lr.score(X_test, y_test)

In [None]:
# Side-by-side table of predicted and actual salaries for the test split; the
# index is moved into a regular column before showing the first 30 rows
res = pd.DataFrame({'Predicted': pred, 'Actual': y_test}).reset_index()
res.head(30)

In [None]:
# First 20 values of `pred` plotted against the first 20 actual test salaries
n_shown = 20
plt.plot(pred[:n_shown])
plt.plot(y_test.values[:n_shown])
plt.xticks([])
plt.legend(['pred', 'test val'])

In [None]:
# Logistic regression fitted on the training split; the cell displays its
# score on the test split
logr = LogisticRegression().fit(X_train, y_train)
logr.score(X_test, y_test)