In [1]:
import pandas as pd
import process_data

# Preprocess skater data

In [2]:
# Seasons to consider, each written as '<start year><end year>'.
# A longer range starting at '20142015' was used previously; the analysis is
# currently restricted to the four seasons below.
year_list = [
    '20202021',
    '20212022',
    '20222023',
    '20232024',
]

In [3]:
# Find every team that was active in at least one of the seasons in year_list.
# dict.fromkeys() removes duplicates while keeping first-seen order, and the
# loop variable no longer shadows the builtin `id`.
team_id_list = list(dict.fromkeys(
    team_id
    for season in year_list
    for team_id in process_data.get_season_teams(season)
))
team_id_list

['EDM',
 'BOS',
 'TOR',
 'COL',
 'CHI',
 'WPG',
 'PIT',
 'VGK',
 'FLA',
 'STL',
 'NYR',
 'CAR',
 'WSH',
 'MIN',
 'DAL',
 'LAK',
 'VAN',
 'SJS',
 'CGY',
 'TBL',
 'NYI',
 'MTL',
 'CBJ',
 'ARI',
 'PHI',
 'BUF',
 'DET',
 'OTT',
 'NJD',
 'ANA',
 'NSH',
 'SEA']

In [4]:
# number of distinct teams collected across the seasons in year_list
len(team_id_list)

32

In [5]:
# Collect the ids of all players listed for any team in team_id_list during
# any of the seasons in year_list.
# dict.fromkeys() de-duplicates in linear time while keeping first-seen order
# (team by team, then season by season), and the loop variable no longer
# shadows the builtin `id`.
player_id_list = list(dict.fromkeys(
    player_id
    for team in team_id_list
    for season in year_list
    for player_id in process_data.get_all_player_ids(season, team)
))
player_id_list

[8471707,
 8471729,
 8474068,
 8474589,
 8475163,
 8475178,
 8475179,
 8475197,
 8476326,
 8476454,
 8476457,
 8476886,
 8476913,
 8476915,
 8477498,
 8477934,
 8478021,
 8478402,
 8478451,
 8478452,
 8479344,
 8479466,
 8479977,
 8480802,
 8480803,
 8480946,
 8481638,
 8481813,
 8470281,
 8473544,
 8474089,
 8474098,
 8475169,
 8475786,
 8476495,
 8476879,
 8476967,
 8477943,
 8477998,
 8478442,
 8478585,
 8479338,
 8479347,
 8480041,
 8481598,
 8474218,
 8475218,
 8475760,
 8476850,
 8477406,
 8479576,
 8480011,
 8480468,
 8482077,
 8470621,
 8474040,
 8474641,
 8475842,
 8477015,
 8477454,
 8479442,
 8480274,
 8481534,
 8470638,
 8471276,
 8473419,
 8474000,
 8475186,
 8475225,
 8475735,
 8475745,
 8475780,
 8475791,
 8475797,
 8476191,
 8476374,
 8476422,
 8476891,
 8477320,
 8477365,
 8477508,
 8477941,
 8477956,
 8478075,
 8478131,
 8478415,
 8478443,
 8478468,
 8478485,
 8478498,
 8478888,
 8479325,
 8479365,
 8479546,
 8480001,
 8480021,
 8480901,
 8480944,
 8482072,
 8473422,


In [6]:
# number of distinct player ids collected
len(player_id_list)

1351

In [7]:
# Load one player object per id in player_id_list, restricted to the seasons
# in year_list. Loading is best-effort: a player that cannot be loaded is
# reported together with its id, recorded in failed_player_ids, and skipped
# so that a single bad record does not abort the whole run.
player_stats = []
failed_player_ids = []
for player_id in player_id_list:
    try:
        player_stats.append(process_data.load_player(player_id, year_list))
    except Exception as e:
        failed_player_ids.append(player_id)
        print(f'Could not load player {player_id}: {e!r}')


KeyboardInterrupt



In [None]:
# total number of players successfully loaded into player_stats
len(player_stats)

# Clean data

In [None]:
# Restrict to players who played at least MIN_SEASONS of the seasons in
# year_list and at least MIN_GAMES games in total over those seasons.
# The games threshold is inclusive ("at least 100"); the previous strict
# comparison dropped players with exactly 100 games.
MIN_SEASONS = 3
MIN_GAMES = 100

reduced_player_stats = []
for temp in player_stats:
    # seasons of interest in which this player has an entry
    played_seasons = [temp.seasons[year] for year in year_list if year in temp.seasons]
    games = sum(season_stats.n_games_played for season_stats in played_seasons)
    if len(played_seasons) >= MIN_SEASONS and games >= MIN_GAMES:
        reduced_player_stats.append(temp)

In [None]:
# number of players kept after the seasons/games filter
len(reduced_player_stats)

In [None]:
# Build the modelling rows. For every player and every window of three
# consecutive entries of year_list, the first two seasons provide the input
# statistics (suffixes '1' and '2') and the third season provides the target
# 'ppg_3'. Windows whose third season is missing for the player are skipped.
split_data = []
for player in reduced_player_stats:
    for first, second, target in zip(year_list, year_list[1:], year_list[2:]):
        if target not in player.seasons.keys():
            continue
        row = {
            'name': player.name,
            'role': player.role,
            'country': player.country,
            'age': player.age,
            'id': player.id,
            'height': player.height,
            'weight': player.weight,
        }
        row.update(process_data.get_year_data_skaters(player, first, '1'))
        row.update(process_data.get_year_data_skaters(player, second, '2'))
        row['season_1'] = first
        row['season_2'] = second
        row['season_3'] = target
        row['ppg_3'] = player.get_ratio_season_points(target)
        split_data.append(row)

In [None]:
# transform the data into a pandas DataFrame for processing
# (one row per player and three-season window)
df = pd.DataFrame(split_data)
df

In [None]:
# apply the skater post-processing implemented in the process_data module
# NOTE(review): df is overwritten with its own processed version, so re-running
# this cell processes the frame a second time — confirm that is harmless
df = process_data.process_data_skaters(df)

In [None]:
# Keep only the columns used for modelling: the statistics of the two input
# seasons, the player attributes, and the target 'ppg_3'.
model_columns = [
    'games_1', 'games_2', 'goals_1', 'goals_2',
    'height', 'pim_1',
    'pim_2', 'plus_minus_1', 'plus_minus_2', 'role',
    'ppg_3', 'shots_1', 'shots_2', 'time_1',
    'time_2', 'weight', 'points_1', 'points_2', 'age',
]
df_final = df[model_columns]

In [None]:
# transform the 'role' (position) column into one-hot encoded indicator columns
df_final = pd.get_dummies(df_final, columns=['role'])

In [None]:
# drop the rows where the player has zero games in both input seasons
no_games_in_either = (df_final['games_1'] == 0) & (df_final['games_2'] == 0)
df_final = df_final[~no_games_in_either]

In [None]:
# replace NaN by zero; the NaN values are assumed to come from seasons where
# the player did not play — TODO confirm against process_data
df_final = df_final.fillna(0)

In [None]:
# display the final modelling table
df_final

# Predict points per game

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [None]:
# split label column from the features
X = df_final.drop(['ppg_3'], axis=1)
y = df_final['ppg_3']

# split data into train (85%) and test (15%); the seed is fixed so that the
# split, and therefore every metric reported below, is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# report the size of each split
for label, subset in (('Number of training data:', X_train),
                      ('Number of testing data:', X_test)):
    print(label, len(subset))

In [None]:
# Mean and standard deviation, measured on the training split only, for the
# columns that were not normalized by the number of games.
columns_to_standardize = ['height', 'weight', 'age', 'plus_minus_1', 'plus_minus_2', 'time_1', 'time_2']
standardization = {
    name: {'mu': X_train[name].mean(), 'sig': X_train[name].std()}
    for name in columns_to_standardize
}

standardization

In [None]:
# function to normalize the data
def normalize_data(data, stats=None):
    """Return a standardized copy of ``data``.

    Each of the columns 'height', 'weight', 'age', 'plus_minus_1',
    'plus_minus_2', 'time_1' and 'time_2' is replaced by ``(x - mu) / sig``.
    All other columns are returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed above.
    stats : dict, optional
        Mapping ``column -> {'mu': mean, 'sig': std}``. When omitted, the
        notebook-level ``standardization`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input is not modified, which avoids
        SettingWithCopyWarning when ``data`` is a slice of another frame and
        keeps the caller's original values intact.
    """
    if stats is None:
        stats = standardization
    normalized = data.copy()
    for col in ['height', 'weight', 'age', 'plus_minus_1', 'plus_minus_2', 'time_1', 'time_2']:
        normalized[col] = (data[col] - stats[col]['mu']) / stats[col]['sig']
    return normalized

In [None]:
# Standardize both splits with normalize_data.
# Note: the results are stored back under the same names, so running this
# cell a second time would standardize already-standardized values.
X_train, X_test = (normalize_data(part) for part in (X_train, X_test))

In [None]:
# Benchmark: use the 'points_2' value of the most recent input season as the
# prediction for the target season.
# (assumes 'points_2' is already expressed per game — TODO confirm in process_data)
baseline_rmse = root_mean_squared_error(y_test, X_test['points_2'])
print('RMSE:', baseline_rmse)

In [None]:
# fit a linear regression on the training split ...
model = LinearRegression()
model.fit(X_train, y_train)

# ... and measure its error on the held-out test split
test_predictions = model.predict(X_test)
print('RMSE:', root_mean_squared_error(y_test, test_predictions))

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted versus actual points per game on the test split; the
# diagonal line marks where a perfect prediction would fall.
predicted_ppg = model.predict(X_test)
lowest, highest = y_test.min(), y_test.max()

fig, ax = plt.subplots()
ax.scatter(y_test, predicted_ppg, edgecolors=(0, 0, 0))
ax.plot([lowest, highest], [lowest, highest], 'k-', lw=2)
ax.set_xlabel('Target Pts/gm', size='x-large')
ax.set_ylabel('Predicted Pts/gm', size='x-large')
fig.savefig('plot.png')

In [None]:
# Build one feature row per loaded player from the two most recent seasons
# ('20222023' and '20232024') in order to predict '20242025'.
# Note: this iterates over every player in player_stats, not only the
# filtered reduced_player_stats.
years = ['20222023', '20232024', '20242025']
pred_data = []
for player in player_stats:
    row = dict(
        name=player.name,
        role=player.role,
        country=player.country,
        age=player.age,
        id=player.id,
        height=player.height,
        weight=player.weight,
    )
    row.update(process_data.get_year_data_skaters(player, years[0], '1'))
    row.update(process_data.get_year_data_skaters(player, years[1], '2'))
    row.update(season_1=years[0], season_2=years[1], season_3=years[2])
    pred_data.append(row)

In [None]:
# turn the prediction rows into a DataFrame (one row per player)
df_pred = pd.DataFrame(pred_data)
df_pred

In [None]:
# apply the same skater post-processing that was applied to the training frame
df_pred = process_data.process_data_skaters(df_pred)
# drop the rows whose role is the string 'N/A'
df_pred = df_pred[df_pred['role']!='N/A']

In [None]:
# Prepare the prediction features the same way as the training features:
# select the model columns (plus 'name' for identification, and without the
# unknown target), fill NaN with zero, one-hot encode 'role', then
# standardize with the training statistics.
prediction_columns = [
    'name', 'games_1', 'games_2', 'goals_1', 'goals_2',
    'height', 'pim_1',
    'pim_2', 'plus_minus_1', 'plus_minus_2', 'role',
    'shots_1', 'shots_2', 'time_1',
    'time_2', 'weight', 'points_1', 'points_2', 'age',
]

df_pred_final = (
    df_pred[prediction_columns]
    .fillna(0)
    .pipe(pd.get_dummies, columns=['role'])
    .pipe(normalize_data)
)

In [None]:
# Predict next-season points per game for every candidate player.
# The feature columns are aligned with the ones the model was fitted on: the
# one-hot 'role' columns built for the prediction set are not guaranteed to
# match the training set. A role column absent here is filled with 0, and a
# role column the model never saw is dropped.
prediction_features = (
    df_pred_final
    .drop('name', axis=1)
    .reindex(columns=model.feature_names_in_, fill_value=0)
)

# .copy() so that the new column is added to an independent frame rather than
# to a slice of df_pred_final (avoids SettingWithCopyWarning)
predictions = df_pred_final[['name']].copy()
predictions['ppg'] = model.predict(prediction_features)

# keep one row per name (the highest prediction if a name occurs several times)
predictions = predictions.groupby(['name']).max().reset_index()

In [None]:
# rank players from highest to lowest predicted points per game
predictions = predictions.sort_values('ppg', ascending=False)
predictions

In [None]:
# Attach the predicted points per game to the player objects and keep the
# players whose prediction is positive, in the order of `predictions`.
# Players are indexed by name once, which replaces the nested scan over
# reduced_player_stats and the per-match boolean-mask lookup.
# NOTE: matching is done on the name, so distinct players sharing a name
# receive the same predicted value. `predictions` is expected to hold one row
# per name.
players_by_name = {}
for player in reduced_player_stats:
    players_by_name.setdefault(player.name, []).append(player)

final_player_list = []
for player_name, predicted_ppg in zip(predictions['name'].tolist(), predictions['ppg'].tolist()):
    for player in players_by_name.get(player_name, ()):
        player.predict_points = predicted_ppg
        if player.predict_points > 0:
            final_player_list.append(player)

In [None]:
# Work on a shallow copy of the candidate list and report how many candidates
# there are per role code ('A', 'D', 'G').
final_player_list = list(final_player_list)
player_roles = [candidate.role for candidate in final_player_list]
n_a, n_d, n_g = (player_roles.count(code) for code in ('A', 'D', 'G'))
print("Players Composition", n_a, 'A,', n_d, 'D,', n_g, 'G')


# Branch-and-Bound Algorithm to choose the best pool team

In [None]:
from pool_classifier import team_optimization_branch_and_bound, solve_problem

In [None]:
# Solve the team selection with the LP library, to compare against the custom
# branch-and-bound result. A copy of the candidate list is passed in.
selected_players, total_ppg, total_salary = solve_problem(final_player_list.copy())
for label, value in (("Selected Players:", selected_players),
                     ("Total PPG:", total_ppg),
                     ("Total Salary:", total_salary)):
    print(label, value)

In [None]:
# Run the custom branch-and-bound optimizer on a copy of the candidate list,
# print a summary of the selected team, and export it to CSV.
best_team, best_ppg = team_optimization_branch_and_bound(final_player_list.copy())

selected_names = [member.name for member in best_team]
best_salary = sum(member.salary for member in best_team)
print("Selected Players:", selected_names)
print("Total PPG:", best_ppg)
print("Total Salary:", best_salary)

team_roles = [member.role for member in best_team]
count_a, count_d, count_g = (team_roles.count(code) for code in ('A', 'D', 'G'))
print("Team Composition", count_a, 'A,', count_d, 'D,', count_g, 'G')

# one CSV row per selected player
results = [
    [member.id, member.name, member.salary, member.predict_points]
    for member in best_team
]
results_frame = pd.DataFrame(results, columns=['id', 'name', 'salary', 'ppg'])
results_frame.to_csv('meilleure_solution.csv')