# Data Preparation

This parts aim to prepare the data before exploring it and building a prediction model. You must execute notebook `data_retrieval.ipynb` before executing this one.

In [13]:
import pandas as pd
import numpy as np
from datetime import datetime

In [44]:
games = pd.read_csv('data/game_scores.csv', parse_dates=['date'])
stats = pd.read_csv('data/games_statistics.csv')
raw_data = games.merge(stats, on='boxscore_url', how='inner')
raw_data_backup = raw_data.copy()

We see the All-Star game is included, we want to discard it as it is not a regular game:

In [45]:
teams = pd.concat([raw_data.home_team, raw_data.away_team], ignore_index=True).unique() #Important to take home + away teams if the algorithm is run early in the season
print(teams)
filtered_teams = ['All Star France', 'All Star Monde']
raw_data = raw_data[~raw_data.home_team.isin(filtered_teams)]
teams = list(filter(lambda team: team not in filtered_teams, teams))

# Validation
assert all(team not in raw_data['home_team'] and team not in raw_data['away_team'] for team in filtered_teams)
assert all(team not in teams for team in filtered_teams)

['Dijon' 'Châlons-Reims' 'Boulogne-Levallois' 'Monaco' 'Chalon/Saône'
 'Cholet' 'Boulazac' 'Bourg-en-Bresse' 'Lyon-Villeurbanne' 'Roanne'
 'Le Mans' 'Pau-Lacq-Orthez' 'Limoges' 'Strasbourg' 'Nanterre' 'Le Portel'
 'Gravelines-Dunkerque' 'Orléans' 'All Star France' 'All Star Monde']


Compute team wins/loss before each game:

In [17]:
teams_WL_tmp = pd.DataFrame(data=np.zeros((len(teams), 2)), dtype=np.int64, columns=['wins', 'losses'], index=teams) # Creating a temporary dataframe to hold current team win/loss
raw_data.sort_values(by='date', axis='index', ascending=True, inplace=True) # Sorting by ascending dates
raw_data['home_team_wins'] = 0
raw_data['home_team_losses'] = 0
raw_data['away_team_wins'] = 0
raw_data['away_team_losses'] = 0

In [46]:
for index, row in raw_data.iterrows():
    raw_data.at[index, 'home_team_wins'] = teams_WL_tmp.loc[row["home_team"]]['wins']
    raw_data.at[index, 'away_team_wins'] = teams_WL_tmp.loc[row["away_team"]]['wins']
    raw_data.at[index, 'home_team_losses'] = teams_WL_tmp.loc[row["home_team"]]['losses']
    raw_data.at[index, 'away_team_losses'] = teams_WL_tmp.loc[row["away_team"]]['losses']
    
    if row['home_score'] > row['away_score']:
        teams_WL_tmp.at[row["home_team"], 'wins'] = teams_WL_tmp.at[row["home_team"], 'wins'] + 1
        teams_WL_tmp.at[row["away_team"], 'losses'] = teams_WL_tmp.at[row["away_team"], 'losses'] + 1
    elif row['home_score'] < row['away_score']:
        teams_WL_tmp.at[row["away_team"], 'wins'] = teams_WL_tmp.at[row["away_team"], 'wins'] + 1
        teams_WL_tmp.at[row["home_team"], 'losses'] = teams_WL_tmp.at[row["home_team"], 'losses'] + 1

# Validation
display(raw_data.sample(n=5))

Unnamed: 0,date,home_team,home_score,away_team,away_score,qt_1_home_score,qt_1_away_score,qt_2_home_score,qt_2_away_score,qt_3_home_score,...,away_ftm,away_blk,away_stl,away_tov,away_pf,away_pfd,home_team_wins,away_team_wins,home_team_losses,away_team_losses
158,2020-01-11,Cholet,83,Gravelines-Dunkerque,65,18,27,24,13,17,...,9,4,3,13,19,13,25.0,12.0,18.0,30.0
37,2019-10-18,Châlons-Reims,98,Nanterre,91,36,23,28,25,14,...,18,1,10,16,27,23,11.0,16.0,18.0,14.0
92,2019-11-25,Lyon-Villeurbanne,83,Châlons-Reims,76,19,24,24,21,16,...,12,2,3,14,22,24,32.0,14.0,5.0,21.0
183,2020-02-01,Pau-Lacq-Orthez,74,Strasbourg,73,11,26,18,17,26,...,18,0,3,14,24,19,18.0,18.0,27.0,27.0
210,2020-02-14,Dijon,104,Bourg-en-Bresse,79,19,17,26,15,31,...,16,4,10,14,15,21,44.0,31.0,7.0,18.0


Computation of the number of possessions. See [this article](https://fansided.com/2015/12/21/nylon-calculus-101-possessions/).