# Проект работы по прогнозированию результативности игроков NHL

In [1]:
# Подключение библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Show every DataFrame column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the player statistics file (semicolon-separated CSV) from the mounted Drive.
# Source file: https://drive.google.com/file/d/1lqiVkcCoZdlPlM5WOj8jdtbeqQQd7OJH/view?usp=drive_link
stats_path = '/content/drive/My Drive/BigData_proj/NHL_Players_Statistics.csv'
stats_df = pd.read_csv(stats_path, sep=';')


In [4]:
# Preview the first rows of the raw statistics table.
stats_df.head()

Unnamed: 0,Name,Date_of_birth,SEASON_year,SEASON,TEAM,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Shooting_Percentage,PowerPlay_Goals,PowerPlay_Assists,Short_Goals,Short_Assists,Game_Winning_Goals,Game_Tying_Goals,Time_on_Ice_per_Game,Production,Number,Games_Started,Wins,Losses,Ties,Overtime_Losses,Goals_Against,Goals_Against_Average,Shots_Against,Saves,Save_Percentage,Shutouts,Position,Height,Weight,Body_mass_index,Place_of_birth,Age,Experience
0,Aaron Downey,1974-08-27,2000,'99-'00,BOS,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8:31,0:00,32.0,,,,,,,,,,,,Right_wing,185,98,28.6,"Shelburne, Ontario",26,1
1,Aaron Downey,1974-08-27,2001,'00-'01,CHI,3,0.0,0.0,0.0,-1.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5:30,0:00,32.0,,,,,,,,,,,,Right_wing,185,98,28.6,"Shelburne, Ontario",27,2
2,Aaron Downey,1974-08-27,2002,'01-'02,CHI,36,1.0,0.0,1.0,-2.0,76.0,10.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,5:06,183:38,32.0,,,,,,,,,,,,Right_wing,185,98,28.6,"Shelburne, Ontario",28,3
3,Aaron Downey,1974-08-27,2003,'02-'03,DAL,43,1.0,1.0,2.0,1.0,69.0,14.0,7.1,0.0,0.0,0.0,0.0,0.0,0.0,4:47,102:40,32.0,,,,,,,,,,,,Right_wing,185,98,28.6,"Shelburne, Ontario",29,4
4,Aaron Downey,1974-08-27,2004,'03-'04,DAL,37,1.0,1.0,2.0,2.0,77.0,11.0,9.1,0.0,0.0,0.0,0.0,1.0,0.0,4:30,83:19,32.0,,,,,,,,,,,,Right_wing,185,98,28.6,"Shelburne, Ontario",30,5


In [5]:
# List the distinct SEASON_year values present in the data, in ascending order.
np.sort(stats_df.SEASON_year.unique())

array([1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
       1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
       1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

## Гипотеза о возможности прогноза количества очков на следующий сезон по показателям текущего сезона

In [6]:
# Work on a copy of the raw table and remove the features that are presumed
# not to affect forecast accuracy.
unused_columns = [
    'Date_of_birth', 'SEASON', 'TEAM', 'Shooting_Percentage',
    'PowerPlay_Goals', 'PowerPlay_Assists', 'Short_Goals', 'Short_Assists',
    'Game_Winning_Goals', 'Game_Tying_Goals', 'Production', 'Number',
    'Games_Started', 'Wins', 'Losses', 'Ties', 'Overtime_Losses',
    'Goals_Against', 'Goals_Against_Average', 'Shots_Against',
    'Saves', 'Save_Percentage', 'Shutouts', 'Height', 'Weight',
    'Body_mass_index', 'Place_of_birth', 'Age', 'Time_on_Ice_per_Game',
]
stats_df_second = stats_df.copy().drop(columns=unused_columns)

In [14]:
# Check which position labels are present in the working table.
stats_df_second['Position'].unique()

array(['Forward', 'Defence'], dtype=object)

In [11]:
# Remove goaltenders from the sample.
# .copy() turns the filtered result into an independent frame, so later
# in-place writes (e.g. relabelling 'Position' through .loc) do not operate on
# a slice of the previous frame (SettingWithCopyWarning in pandas without
# copy-on-write).
stats_df_second = stats_df_second[stats_df_second['Position'] != 'Goaltender'].copy()

In [13]:
# Collapse the three forward positions ('Right_wing', 'Left_wing', 'Centre')
# into the single label 'Forward'.
forward_positions = ['Right_wing', 'Left_wing', 'Centre']
is_forward = stats_df_second['Position'].isin(forward_positions)
stats_df_second.loc[is_forward, 'Position'] = 'Forward'

In [18]:
# Combine rows of a player who appeared for several clubs within one season:
# counting stats are summed, 'Position' takes the first value and 'Experience'
# the maximum.
# NOTE(review): players are identified by 'Name' only, so distinct players
# sharing a name would be merged here - confirm names are unique in the data.
summed_columns = ['Games_Played', 'Goals', 'Assists', 'Points',
                  'PlusMinus_Ratings', 'Penalty_Minutes', 'Shots_on_Goal']
aggregations = {column: ['sum'] for column in summed_columns}
aggregations['Position'] = ['first']
aggregations['Experience'] = ['max']
stats_df_second_gr = stats_df_second.groupby(['Name', 'SEASON_year']).agg(aggregations)

In [23]:
# Turn the grouped result back into a regular flat DataFrame: remove the
# aggregation-name level from the columns, move the group keys out of the
# index, and discard 'SEASON_year'.

stats_df_second_gr = stats_df_second_gr.droplevel(level=1, axis='columns')
stats_df_second = stats_df_second_gr.reset_index().drop(columns='SEASON_year')
stats_df_second.head(10)

Unnamed: 0,Name,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Position,Experience
0,Aaron Downey,1,0.0,0.0,0.0,0.0,0.0,0.0,Forward,1
1,Aaron Downey,3,0.0,0.0,0.0,-1.0,6.0,2.0,Forward,2
2,Aaron Downey,36,1.0,0.0,1.0,-2.0,76.0,10.0,Forward,3
3,Aaron Downey,43,1.0,1.0,2.0,1.0,69.0,14.0,Forward,4
4,Aaron Downey,37,1.0,1.0,2.0,2.0,77.0,11.0,Forward,5
5,Aaron Downey,42,3.0,4.0,7.0,2.0,95.0,21.0,Forward,7
6,Aaron Downey,21,1.0,0.0,1.0,-6.0,48.0,10.0,Forward,8
7,Aaron Downey,56,0.0,3.0,3.0,0.0,116.0,15.0,Forward,9
8,Aaron Downey,4,1.0,1.0,2.0,0.0,7.0,2.0,Forward,10
9,Aaron Gagnon,2,0.0,0.0,0.0,0.0,0.0,2.0,Forward,1


In [24]:
# Separate frame used to shift each player's points by one season.

shift_columns = ['Name', 'Experience', 'Points']
stats_df_second_point = stats_df_second.loc[:, shift_columns].copy()
stats_df_second_point

Unnamed: 0,Name,Experience,Points
0,Aaron Downey,1,0.0
1,Aaron Downey,2,0.0
2,Aaron Downey,3,1.0
3,Aaron Downey,4,2.0
4,Aaron Downey,5,2.0
...,...,...,...
22452,Zigmund Palffy,8,89.0
22453,Zigmund Palffy,9,59.0
22454,Zigmund Palffy,10,85.0
22455,Zigmund Palffy,11,41.0


In [25]:
# Perform the shift itself: decrement 'Experience' by one so that a season's
# points line up with the preceding season's row, and rename 'Points' to
# 'NS_points' (next-season points). Resulting column order: Name, NS_points,
# Experience.

previous_season_experience = stats_df_second_point['Experience'] - 1
stats_df_second_point = (
    stats_df_second_point
    .drop(columns='Experience')
    .rename(columns={'Points': 'NS_points'})
    .assign(Experience=previous_season_experience)
)
stats_df_second_point.head()

Unnamed: 0,Name,NS_points,Experience
0,Aaron Downey,0.0,0
1,Aaron Downey,0.0,1
2,Aaron Downey,1.0,2
3,Aaron Downey,2.0,3
4,Aaron Downey,2.0,4


In [26]:
# Attach the shifted points to the base table. The left join keeps every base
# row; rows with no matching ('Name', 'Experience') pair get NaN in 'NS_points'.

stats_df_second = pd.merge(
    stats_df_second,
    stats_df_second_point,
    on=['Name', 'Experience'],
    how='left',
)
stats_df_second.head()


Unnamed: 0,Name,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Position,Experience,NS_points
0,Aaron Downey,1,0.0,0.0,0.0,0.0,0.0,0.0,Forward,1,0.0
1,Aaron Downey,3,0.0,0.0,0.0,-1.0,6.0,2.0,Forward,2,1.0
2,Aaron Downey,36,1.0,0.0,1.0,-2.0,76.0,10.0,Forward,3,2.0
3,Aaron Downey,43,1.0,1.0,2.0,1.0,69.0,14.0,Forward,4,2.0
4,Aaron Downey,37,1.0,1.0,2.0,2.0,77.0,11.0,Forward,5,


In [27]:
# Drop the rows whose 'NS_points' is NaN: they have no next-season value to
# learn from, so nothing can be done with them.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not write into a slice of the previous frame
# (SettingWithCopyWarning in pandas without copy-on-write).

stats_df_second = stats_df_second[stats_df_second['NS_points'].notna()].copy()

In [28]:
# Convert the float-typed counting stats and the target to integers.
# The frame is rebound to the result of DataFrame.astype instead of assigning
# into a column subset, so nothing is written into the filtered slice produced
# by the previous step (avoids SettingWithCopyWarning in pandas without
# copy-on-write).

int_columns = ['Goals', 'Assists', 'Points', 'PlusMinus_Ratings',
               'Penalty_Minutes', 'Shots_on_Goal', 'NS_points']
stats_df_second = stats_df_second.astype({column: int for column in int_columns})
stats_df_second.head()

Unnamed: 0,Name,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Position,Experience,NS_points
0,Aaron Downey,1,0,0,0,0,0,0,Forward,1,0
1,Aaron Downey,3,0,0,0,-1,6,2,Forward,2,1
2,Aaron Downey,36,1,0,1,-2,76,10,Forward,3,2
3,Aaron Downey,43,1,1,2,1,69,14,Forward,4,2
5,Aaron Downey,42,3,4,7,2,95,21,Forward,7,1


In [29]:
# Encode player name and position as integer codes (pd.factorize numbers the
# distinct values in order of first appearance).

name_codes, _ = pd.factorize(stats_df_second['Name'])
position_codes, _ = pd.factorize(stats_df_second['Position'])
stats_df_second['name_factor'] = name_codes
stats_df_second['Position'] = position_codes
stats_df_second

Unnamed: 0,Name,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Position,Experience,NS_points,name_factor
0,Aaron Downey,1,0,0,0,0,0,0,0,1,0,0
1,Aaron Downey,3,0,0,0,-1,6,2,0,2,1,0
2,Aaron Downey,36,1,0,1,-2,76,10,0,3,2,0
3,Aaron Downey,43,1,1,2,1,69,14,0,4,2,0
5,Aaron Downey,42,3,4,7,2,95,21,0,7,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
22462,Zigmund Palffy,50,22,28,50,-6,34,168,0,6,66,2505
22463,Zigmund Palffy,64,27,39,66,18,32,186,0,7,89,2505
22464,Zigmund Palffy,73,38,51,89,22,20,217,0,8,59,2505
22465,Zigmund Palffy,63,32,27,59,5,26,161,0,9,85,2505


In [31]:
# Model inputs: current-season stats plus experience, position code and player code.
# NOTE(review): 'name_factor' is an arbitrary integer code from pd.factorize, so a
# linear model treats it as an ordered quantity; and in the rows displayed earlier
# 'Points' equals 'Goals' + 'Assists', which would make those inputs collinear -
# consider dropping 'name_factor' and 'Points'; confirm before changing.
features = ['Games_Played', 'Goals', 'Assists', 'Points', 'PlusMinus_Ratings', 'Penalty_Minutes', 'Shots_on_Goal', 'Experience', 'Position', 'name_factor']
# Prediction target: points in the following season.
target = ['NS_points']

In [32]:

# Hold out 10% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the fitted coefficients and error metrics, reproducible between runs.
# NOTE(review): the split is row-wise, so seasons of the same player can land in
# both train and test - confirm this is acceptable for the evaluation.
train, test = train_test_split(stats_df_second, test_size=0.1, random_state=42)

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:
# Fit an ordinary least-squares model on the training rows.
X_train = train[features]
y_train = train[target]  # selected with a list, so this is a one-column frame
lin_r = LinearRegression()
lin_r.fit(X_train, y_train)

In [36]:
# Inspect the fitted coefficients of the linear model.
lin_r.coef_

array([[-1.51739498e-01,  1.87334965e-01,  2.93482895e-01,
         4.80817860e-01, -4.54663313e-03, -3.56560980e-03,
         5.55778951e-02, -3.18429584e-01, -2.07266671e+00,
         2.64351488e-04]])

In [37]:
# Predict next-season points for the held-out rows.
y_pred = lin_r.predict(test[features])

In [38]:
from sklearn.metrics import mean_squared_error

In [40]:
# Evaluate on the held-out rows: mean squared error and its square root (RMSE).
mse = mean_squared_error(y_true=test[target], y_pred=y_pred)
print(f" mse = {mse}")
print(f" rmse = {np.sqrt(mse)}")

 mse = 192.49941731093364
 rmse = 13.874415926839358


In [41]:
# Show the raw prediction array.
y_pred

array([[38.17484923],
       [16.90677343],
       [ 6.22358457],
       ...,
       [23.32753366],
       [42.50179869],
       [23.22644099]])

In [42]:
# Store the predictions next to the actual values.
# assign() returns a new frame instead of setting a column on the object
# returned by train_test_split (which raises SettingWithCopyWarning in pandas
# without copy-on-write); np.ravel flattens the (n, 1) prediction array so the
# column receives a plain 1-D vector.
test = test.assign(pred_points=np.ravel(y_pred))

In [43]:
# Show the held-out rows together with their predictions.
# To inspect a single player instead, filter by name,
# e.g. test[test['Name'].str.contains('Stam')]
test

Unnamed: 0,Name,Games_Played,Goals,Assists,Points,PlusMinus_Ratings,Penalty_Minutes,Shots_on_Goal,Position,Experience,NS_points,name_factor,pred_points
13659,Mathieu Perreault,62,18,23,41,7,38,129,0,6,41,1542,38.174849
10720,John Slaney,46,2,12,14,-12,14,84,1,6,5,1215,16.906773
13042,Mark Eaton,35,0,3,3,-6,16,22,1,8,3,1479,6.223585
18428,Ryan Jones,81,18,7,25,-5,34,126,0,3,33,2064,23.898355
4780,Dan Hamhuis,59,1,22,23,0,44,82,1,12,13,545,19.648638
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17038,Phil Kessel,82,37,45,82,-10,20,295,0,6,52,1919,74.336644
19475,Shayne Corson,80,16,31,47,-19,209,164,0,8,54,2169,41.284247
20093,Steve Smith,55,4,20,24,30,166,74,1,2,22,2239,23.327534
17682,Rick Nash,44,21,21,42,16,26,176,0,11,39,1988,42.501799
