# Data Preparation

This part prepares the data before exploring it and building a prediction model. You must run the `data_retrieval.ipynb` notebook before running this one.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List
import random

# Seed Python's built-in `random` module so the random row picked later in this notebook is reproducible.
random.seed(42)

In [2]:
# Game scores (with `date` parsed as datetime) and box-score statistics,
# both produced by the data retrieval notebook.
games = pd.read_csv('data/game_scores.csv', parse_dates=['date'])
stats = pd.read_csv('data/games_statistics.csv')

# Keep only the games found in both files, matched on their box-score URL.
raw_data = pd.merge(games, stats, how='inner', on='boxscore_url')

# Independent snapshot of the merged data, taken before any transformation.
raw_data_backup = raw_data.copy()

### Seasons

In [3]:
# Distinct season identifiers found in the data, as a plain Python list (order of first appearance).
seasons = raw_data['season'].unique().tolist()
seasons

[2016, 2017, 2018, 2019]

### Filter All-Star game
The All-Star game is included in the data; we discard it because it is not a regular game:

In [4]:
def get_teams(games_stats: pd.DataFrame) -> List[str]:
    """
    Given a game statistics dataframe, returns the unique list of teams.

    Both the home and the away columns are scanned, so a team that has only
    played at home (or only away) so far -- e.g. early in a season -- is
    still included.

    Parameters
    ----------
    games_stats : pd.DataFrame
        Frame with a `home_team` and an `away_team` column.

    Returns
    -------
    List[str]
        De-duplicated team names in order of first appearance (home teams
        first, then the away teams not already seen).
    """
    # Read from the frame passed in, not from the notebook-level `raw_data`.
    all_teams = pd.concat([games_stats['home_team'], games_stats['away_team']], ignore_index=True)
    return all_teams.unique().tolist()

In [5]:
teams = get_teams(raw_data)
print(teams)

# Team names that only appear in the All-Star game and must be discarded.
filtered_teams = ['All Star France', 'All Star Monde', 'Etrangers', 'Français']

# Drop a game as soon as EITHER side is one of those teams. `.copy()` yields an
# independent frame, so later column assignments do not target a slice.
is_all_star_game = raw_data['home_team'].isin(filtered_teams) | raw_data['away_team'].isin(filtered_teams)
raw_data = raw_data[~is_all_star_game].copy()
teams = get_teams(raw_data)
print(teams)

# Validation: test the column *values* (`x in series` would test the index labels instead).
assert not raw_data['home_team'].isin(filtered_teams).any()
assert not raw_data['away_team'].isin(filtered_teams).any()
assert all(team not in teams for team in filtered_teams)

['Lyon-Villeurbanne', 'Le Portel', 'Paris-Levallois', 'Hyères-Toulon', 'Pau-Lacq-Orthez', 'Châlons-Reims', 'Chalon/Saône', 'Le Mans', 'Limoges', 'Antibes', 'Nancy', 'Monaco', 'Cholet', 'Nanterre', 'Dijon', 'Strasbourg', 'Gravelines-Dunkerque', 'Orléans', 'Français', 'Levallois', 'Bourg-en-Bresse', 'Boulazac', 'Fos-sur-Mer', 'Boulogne-Levallois', 'Roanne', 'All Star France', 'Etrangers', 'All Star Monde']
['Lyon-Villeurbanne', 'Le Portel', 'Paris-Levallois', 'Hyères-Toulon', 'Pau-Lacq-Orthez', 'Châlons-Reims', 'Chalon/Saône', 'Le Mans', 'Limoges', 'Antibes', 'Nancy', 'Monaco', 'Cholet', 'Nanterre', 'Dijon', 'Strasbourg', 'Gravelines-Dunkerque', 'Orléans', 'Levallois', 'Bourg-en-Bresse', 'Boulazac', 'Fos-sur-Mer', 'Boulogne-Levallois', 'Roanne']


> *`Levallois`, `Boulogne-Levallois` and `Paris-Levallois` are not duplicates. The team changed its name throughout the seasons.*

### Win/Loss ratio computation

Compute each team's win/loss record before each game:

In [6]:
# Games must be processed chronologically so that each row only sees earlier results.
# Re-binding (instead of `inplace=True`) avoids mutating a filtered slice in place.
raw_data = raw_data.sort_values(by='date', axis='index', ascending=True)

# Pre-game record columns, filled in by the loop below.
for record_column in ('home_team_wins', 'home_team_losses', 'away_team_wins', 'away_team_losses'):
    raw_data[record_column] = 0

# Looping over seasons: the record restarts at 0-0 for every team each season.
for season in seasons:
    # Temporary dataframe holding the running win/loss count of every team for this season
    teams_WL_tmp = pd.DataFrame(data=np.zeros((len(teams), 2)), dtype=np.int64, columns=['wins', 'losses'], index=teams)

    for index, row in raw_data[raw_data['season'] == season].iterrows():
        home, away = row['home_team'], row['away_team']

        # Record as it stood BEFORE this game (the current result is not included).
        raw_data.at[index, 'home_team_wins'] = teams_WL_tmp.at[home, 'wins']
        raw_data.at[index, 'away_team_wins'] = teams_WL_tmp.at[away, 'wins']
        raw_data.at[index, 'home_team_losses'] = teams_WL_tmp.at[home, 'losses']
        raw_data.at[index, 'away_team_losses'] = teams_WL_tmp.at[away, 'losses']

        # Assessing who won and incrementing counts (equal scores update nothing).
        if row['home_score'] > row['away_score']:
            teams_WL_tmp.at[home, 'wins'] += 1
            teams_WL_tmp.at[away, 'losses'] += 1
        elif row['home_score'] < row['away_score']:
            teams_WL_tmp.at[away, 'wins'] += 1
            teams_WL_tmp.at[home, 'losses'] += 1

# Computing W/L ratio: wins / games played. A team with no game played yet gets NaN (0 / 0).
raw_data['home_wining_percentage'] = raw_data['home_team_wins'] / (raw_data['home_team_wins'] + raw_data['home_team_losses'])
raw_data['away_wining_percentage'] = raw_data['away_team_wins'] / (raw_data['away_team_wins'] + raw_data['away_team_losses'])

# Validation
display(raw_data.sample(n=5, random_state=42))

Unnamed: 0,season,date,home_team,home_score,away_team,away_score,home_qt_1_pts,away_qt_1_pts,home_qt_2_pts,away_qt_2_pts,...,away_stl,away_tov,away_pf,away_pfd,home_team_wins,home_team_losses,away_team_wins,away_team_losses,home_wining_percentage,away_wining_percentage
414,2017,2017-11-11,Nanterre,67,Gravelines-Dunkerque,89,20,19,16,23,...,5,14,22,19,6,3,3,5,0.666667,0.375
260,2016,2017-04-16,Pau-Lacq-Orthez,73,Monaco,85,18,20,23,17,...,16,18,28,20,20,9,27,3,0.689655,0.9
333,2016,2017-06-09,Strasbourg,70,Lyon-Villeurbanne,69,20,22,13,12,...,2,12,17,19,27,15,26,19,0.642857,0.577778
619,2017,2018-04-28,Chalon/Saône,73,Monaco,99,19,22,15,28,...,15,6,24,20,14,17,26,7,0.451613,0.787879
100,2016,2016-12-09,Châlons-Reims,87,Gravelines-Dunkerque,89,23,20,19,24,...,5,12,12,22,5,6,5,6,0.454545,0.454545


### Possessions & Pace

Computation of the number of possessions. We use the formula $FGA + 0.44 \times FTA - ORB + TOV$, where $FGA$ is the sum of the two-point and three-point attempts. See notes for more in-depth information. We can then infer the pace (possessions per 40 minutes) by computing $\dfrac{40 \times possessions}{minutes / 5}$ to see how fast a team plays.  
(*Note that the `minutes` column is divided by 5, as it is the total time played by all 5 players on the floor.*).

In [7]:
# Possessions per side: 2pt + 3pt attempts + 0.44 * free-throw attempts
# - offensive rebounds + turnovers, rounded to 2 decimals.
for side in ('home', 'away'):
    possessions_estimate = (
        raw_data[side + '_2pa']
        + raw_data[side + '_3pa']
        + 0.44 * raw_data[side + '_fta']
        - raw_data[side + '_orbd']
        + raw_data[side + '_tov']
    )
    raw_data[side + '_possessions'] = possessions_estimate.round(2)

# Pace: possessions scaled to 40 minutes. `minutes` is divided by 5 to get the game length.
game_length = raw_data['minutes'] / 5
for side in ('home', 'away'):
    raw_data[side + '_pace'] = ((raw_data[side + '_possessions'] * 40) / game_length).round(2)

# Sanity checks: with more than 200 total minutes the pace must be below the
# possessions; with exactly 200 minutes both must be equal.
over_200_minutes = raw_data[raw_data.minutes > 200]
exactly_200_minutes = raw_data[raw_data.minutes == 200]
for side in ('home', 'away'):
    assert all(np.greater(over_200_minutes[side + '_possessions'], over_200_minutes[side + '_pace']))
    assert all(np.equal(exactly_200_minutes[side + '_pace'], exactly_200_minutes[side + '_possessions']))

preview_columns = ['home_team', 'away_team', 'home_possessions', 'away_possessions', 'home_pace', 'away_pace']
display(raw_data[preview_columns].sample(n=5, random_state=42))

Unnamed: 0,home_team,away_team,home_possessions,away_possessions,home_pace,away_pace
414,Nanterre,Gravelines-Dunkerque,72.92,72.04,72.92,72.04
260,Pau-Lacq-Orthez,Monaco,82.64,84.12,82.64,84.12
333,Strasbourg,Lyon-Villeurbanne,68.4,65.6,68.4,65.6
619,Chalon/Saône,Monaco,79.88,76.6,79.88,76.6
100,Châlons-Reims,Gravelines-Dunkerque,75.2,73.32,75.2,73.32


### Offensive Rating (ORtg), Defensive Rating (DRtg) and Net rating (NRtg)

Now that we have the number of possessions, we can derive Offensive Rating, Defensive Rating and Net Rating. Offensive Rating is points scored per 100 possessions:  
$\dfrac{Pts * 100}{Poss}$  
  
Defensive rating is basically the opponent's offensive rating, and finally Net Rating is:  
$NRtg = ORtg - DRtg$.

In [8]:
matchup = (('home', 'away'), ('away', 'home'))

# Offensive rating: points scored per 100 possessions.
for side, _ in matchup:
    raw_data[side + '_ortg'] = raw_data[side + '_score'] * 100 / raw_data[side + '_possessions']

# Defensive rating: the opponent's offensive rating.
for side, opponent in matchup:
    raw_data[side + '_drtg'] = raw_data[opponent + '_ortg']

# Net rating: offensive rating minus defensive rating.
for side, _ in matchup:
    raw_data[side + '_nrtg'] = raw_data[side + '_ortg'] - raw_data[side + '_drtg']

rating_preview_columns = [
    'home_team', 'away_team',
    'home_score', 'away_score',
    'home_possessions', 'away_possessions',
    'home_ortg', 'away_ortg',
    'home_drtg', 'away_drtg',
    'home_nrtg', 'away_nrtg',
]
display(raw_data[rating_preview_columns].sample(n=5, random_state=42))

Unnamed: 0,home_team,away_team,home_score,away_score,home_possessions,away_possessions,home_ortg,away_ortg,home_drtg,away_drtg,home_nrtg,away_nrtg
414,Nanterre,Gravelines-Dunkerque,67,89,72.92,72.04,91.881514,123.542476,123.542476,91.881514,-31.660962,31.660962
260,Pau-Lacq-Orthez,Monaco,73,85,82.64,84.12,88.334947,101.046125,101.046125,88.334947,-12.711178,12.711178
333,Strasbourg,Lyon-Villeurbanne,70,69,68.4,65.6,102.339181,105.182927,105.182927,102.339181,-2.843746,2.843746
619,Chalon/Saône,Monaco,73,99,79.88,76.6,91.387081,129.24282,129.24282,91.387081,-37.855739,37.855739
100,Châlons-Reims,Gravelines-Dunkerque,87,89,75.2,73.32,115.691489,121.385706,121.385706,115.691489,-5.694217,5.694217


### Compute average team statistics before each game

Pseudo-code of the function that computes, for each game, each team's average of a statistic over all its previous games of the season:
```
raw_data <- Sort raw_data by game date

for season in seasons:
    acc_dataframe <- Create dataframe [team=list of unique teams, sum_stats=0, nb_games_played=0]
    
    for game in games of the season:
        for team in (home_team, away_team):
            if first game of team:
                game[team_avg_stat] <- NaN
            else:
                game[team_avg_stat] <- acc_dataframe[team, sum_stats] / acc_dataframe[team, nb_games_played]

            acc_dataframe[team, sum_stats] += game[team_stat]
            acc_dataframe[team, nb_games_played] += 1
```

In [9]:
def avg_stat(games_stats: pd.DataFrame, statistic: str):
    """
    Given a game statistics dataframe and a statistic category (rbd, blk, ast, etc.)
    computes the average for each team (home, away) before each game.

    For every game, the average covers the team's earlier games of the same
    season only (home and away games alike); the current game is excluded.
    A team's first game of a season gets NaN.

    The dataframe is modified in place: it is sorted by ascending `date` and
    gains the columns `home_avg_<statistic>` and `away_avg_<statistic>`.

    Parameters
    ----------
    games_stats : pd.DataFrame
        Frame with `season`, `date`, `home_team`, `away_team`,
        `home_<statistic>` and `away_<statistic>` columns.
    statistic : str
        Suffix of the per-game columns to average (e.g. 'ast' for
        `home_ast` / `away_ast`).

    Returns
    -------
    None
    """
    games_stats.sort_values(by='date', axis='index', ascending=True, inplace=True) # Sorting by ascending dates

    sides = ('home', 'away')

    # Create the result columns up front (home first, then away); NaN is the
    # value kept for a team's first game of a season.
    for side in sides:
        games_stats[side + '_avg_' + statistic] = np.nan

    # Looping season
    for season in games_stats['season'].unique():
        # Per-team running total of the statistic and number of games played.
        # Keyed by the teams met in the frame itself, so nothing outside
        # `games_stats` is needed.
        totals = {}
        games_played = {}

        # Looping games in season
        for idx, game in games_stats[games_stats['season'] == season].iterrows():
            for side in sides:
                team = game[side + '_team']
                played = games_played.get(team, 0)

                # Average over the previous games only (nothing to average on the first game).
                if played > 0:
                    games_stats.at[idx, side + '_avg_' + statistic] = totals[team] / played

                # Adding current game stats to the team accumulated stats.
                totals[team] = totals.get(team, 0.0) + games_stats.at[idx, side + '_' + statistic]
                games_played[team] = played + 1

In [10]:
# List the columns available so far, to pick the statistics to average in the next cell.
raw_data.columns

Index(['season', 'date', 'home_team', 'home_score', 'away_team', 'away_score',
       'home_qt_1_pts', 'away_qt_1_pts', 'home_qt_2_pts', 'away_qt_2_pts',
       'home_qt_3_pts', 'away_qt_3_pts', 'home_qt_4_pts', 'away_qt_4_pts',
       'home_ot_1_pts', 'away_ot_1_pts', 'home_ot_2_pts', 'away_ot_2_pts',
       'home_ot_3_pts', 'away_ot_3_pts', 'boxscore_url', 'minutes', 'home_ast',
       'home_drbd', 'home_orbd', 'home_2pa', 'home_2pm', 'home_3pa',
       'home_3pm', 'home_fta', 'home_ftm', 'home_blk', 'home_stl', 'home_tov',
       'home_pf', 'home_pfd', 'away_ast', 'away_drbd', 'away_orbd', 'away_2pa',
       'away_2pm', 'away_3pa', 'away_3pm', 'away_fta', 'away_ftm', 'away_blk',
       'away_stl', 'away_tov', 'away_pf', 'away_pfd', 'home_team_wins',
       'home_team_losses', 'away_team_wins', 'away_team_losses',
       'home_wining_percentage', 'away_wining_percentage', 'home_possessions',
       'away_possessions', 'home_pace', 'away_pace', 'home_ortg', 'away_ortg',
       'home_d

In [11]:
# Statistics for which a pre-game season average is computed. Each entry adds
# a `home_avg_<stat>` and an `away_avg_<stat>` column, in this order.
averaged_statistics = [
    'score',
    'qt_1_pts', 'qt_2_pts', 'qt_3_pts', 'qt_4_pts',
    'ast',
    'drbd', 'orbd',
    '2pm', '2pa',
    '3pm', '3pa',
    'ftm', 'fta',
    'blk', 'stl', 'tov',
    'pf', 'pfd',
    'possessions', 'pace',
    'ortg', 'drtg', 'nrtg',
]
for statistic in averaged_statistics:
    avg_stat(raw_data, statistic)

# NOTE: the score averages keep their generated names `home_avg_score` /
# `away_avg_score`. The previous `rename({...})` call had no `columns=` / `axis`
# argument, so it targeted index labels and never renamed anything; the rest of
# the notebook relies on the `*_avg_score` names.

# `random.randint` includes both bounds, hence the `- 1` to stay a valid position.
display(raw_data.iloc[random.randint(0, raw_data.shape[0] - 1)])

season                           2016
date              2017-04-01 00:00:00
home_team        Gravelines-Dunkerque
home_score                         72
away_team                      Monaco
                         ...         
away_avg_ortg                 115.119
home_avg_drtg                 104.129
away_avg_drtg                 98.7338
home_avg_nrtg                  3.5251
away_avg_nrtg                 16.3857
Name: 229, Length: 114, dtype: object

### Adding budgets and salary masses

In [12]:
budgets = (
    pd.read_csv('data/budgets.csv')
    # Keep the first 4 characters of the season label as an integer,
    # so it can be matched against the games' `season` column.
    .assign(season=lambda frame: frame['season'].str[:4].astype(int))
    # The two rank columns are not used afterwards.
    .drop(columns=['rank', 'rank.1'])
)
budgets.head()

Unnamed: 0,season,team,budget,salary_mass
0,2019,Boulazac,3794000,1509000
1,2019,Bourg-en-Bresse,5354000,1702000
2,2019,Châlons-Reims,4253000,1406000
3,2019,Chalon/Saône,5452000,1736000
4,2019,Cholet,4601000,1365000


In [13]:
# Attach the season budget and salary mass of the home team, then of the away team.
# Inner joins: a game is kept only if a budget row exists for the team that season.
for side in ('home', 'away'):
    raw_data = (
        raw_data
        .merge(budgets, how='inner', left_on=[side + '_team', 'season'], right_on=['team', 'season'])
        .drop(columns=['team'])
        .rename(columns={
            'budget': side + '_team_budget',
            'salary_mass': side + '_team_salary_mass',
        })
    )

raw_data.sample(n=5, random_state=42)

Unnamed: 0,season,date,home_team,home_score,away_team,away_score,home_qt_1_pts,away_qt_1_pts,home_qt_2_pts,away_qt_2_pts,...,home_avg_ortg,away_avg_ortg,home_avg_drtg,away_avg_drtg,home_avg_nrtg,away_avg_nrtg,home_team_budget,home_team_salary_mass,away_team_budget,away_team_salary_mass
1133,2019,2020-02-01,Limoges,77,Gravelines-Dunkerque,71,16,18,26,19,...,109.255038,108.69986,112.215692,115.14332,-2.960654,-6.44346,7011000,1950000,5971000,2066500
425,2017,2018-02-16,Monaco,93,Bourg-en-Bresse,66,26,12,27,16,...,114.736891,116.381238,104.71049,112.945456,10.026402,3.435782,6326000,2263000,4387000,1338000
78,2016,2017-01-21,Le Mans,71,Gravelines-Dunkerque,81,14,19,19,21,...,105.395963,109.280149,104.671548,105.939385,0.724415,3.340764,6058000,1787500,5571000,1656000
803,2018,2018-11-16,Antibes,83,Pau-Lacq-Orthez,73,30,14,14,21,...,94.76573,103.997351,119.667141,101.855528,-24.901411,2.141823,3410000,1017000,5161000,1353000
590,2017,2018-04-15,Lyon-Villeurbanne,76,Strasbourg,73,18,25,23,17,...,112.670392,113.345686,110.370822,105.093592,2.299571,8.252094,8272000,2684000,7748000,2585000


### Adding classification target variable

In [14]:
# Classification target: True when the home team scored more than the away team.
raw_data['home_team_win'] = raw_data['home_score'] > raw_data['away_score']

# Seeded like the other previews in this notebook, so the displayed rows are reproducible.
raw_data.sample(n=5, random_state=42)

Unnamed: 0,season,date,home_team,home_score,away_team,away_score,home_qt_1_pts,away_qt_1_pts,home_qt_2_pts,away_qt_2_pts,...,away_avg_ortg,home_avg_drtg,away_avg_drtg,home_avg_nrtg,away_avg_nrtg,home_team_budget,home_team_salary_mass,away_team_budget,away_team_salary_mass,home_team_win
1122,2019,2019-11-02,Gravelines-Dunkerque,92,Limoges,80,23,21,13,20,...,108.794032,111.690365,117.414319,-2.925294,-8.620287,5971000,2066500,7011000,1950000,True
601,2017,2018-05-26,Dijon,64,Limoges,81,17,19,15,22,...,109.159703,109.325066,108.182578,3.658563,0.977125,3963000,1170000,6150000,1874000,False
1130,2019,2019-10-11,Roanne,81,Gravelines-Dunkerque,109,12,29,21,33,...,109.530944,105.180952,123.018156,-12.911168,-13.487213,3752000,1339500,5971000,2066500,False
611,2017,2017-10-20,Châlons-Reims,89,Limoges,79,18,27,23,20,...,107.291643,112.630282,102.947918,-8.188552,4.343725,3883000,1285000,6150000,1874000,True
1185,2019,2020-01-18,Nanterre,85,Strasbourg,70,17,21,32,22,...,112.493379,115.83174,112.835764,-2.348097,-0.342385,5065000,1758500,7624000,2362500,True


### Removing/Replacing *Null* values

We will replace overtime `NaN` values with $0$. On the other hand, as we don't have team averages for a team's first game of each season, we will remove those rows.

In [15]:
# Names of the columns that contain at least one missing value.
raw_data.columns[raw_data.isna().any()]

Index(['home_ot_1_pts', 'away_ot_1_pts', 'home_ot_2_pts', 'away_ot_2_pts',
       'home_ot_3_pts', 'away_ot_3_pts', 'home_wining_percentage',
       'away_wining_percentage', 'home_avg_score', 'away_avg_score',
       'home_avg_qt_1_pts', 'away_avg_qt_1_pts', 'home_avg_qt_2_pts',
       'away_avg_qt_2_pts', 'home_avg_qt_3_pts', 'away_avg_qt_3_pts',
       'home_avg_qt_4_pts', 'away_avg_qt_4_pts', 'home_avg_ast',
       'away_avg_ast', 'home_avg_drbd', 'away_avg_drbd', 'home_avg_orbd',
       'away_avg_orbd', 'home_avg_2pm', 'away_avg_2pm', 'home_avg_2pa',
       'away_avg_2pa', 'home_avg_3pm', 'away_avg_3pm', 'home_avg_3pa',
       'away_avg_3pa', 'home_avg_ftm', 'away_avg_ftm', 'home_avg_fta',
       'away_avg_fta', 'home_avg_blk', 'away_avg_blk', 'home_avg_stl',
       'away_avg_stl', 'home_avg_tov', 'away_avg_tov', 'home_avg_pf',
       'away_avg_pf', 'home_avg_pfd', 'away_avg_pfd', 'home_avg_possessions',
       'away_avg_possessions', 'home_avg_pace', 'away_avg_pace',
       'home

In [16]:
# A missing average score means the home or away team is playing its first game
# of the season: those rows are removed.
average_score_columns = ['home_avg_score', 'away_avg_score']
raw_data = raw_data.dropna(axis='index', subset=average_score_columns)

# Columns still holding missing values after the drop.
display(raw_data.columns[raw_data.isna().any()])

# Every remaining NaN (overtimes) becomes 0.
raw_data = raw_data.fillna(0)
assert not raw_data.isna().any().any()

Index(['home_ot_1_pts', 'away_ot_1_pts', 'home_ot_2_pts', 'away_ot_2_pts',
       'home_ot_3_pts', 'away_ot_3_pts'],
      dtype='object')

### Removing current game data

Our models will take data known *before* a game as input, to predict if the home team will win or not. Thus we can remove all statistics of the current game from the dataset.

In [17]:
# Full list of column names, to identify the current-game columns dropped in the next cell.
list(raw_data.columns)

['season',
 'date',
 'home_team',
 'home_score',
 'away_team',
 'away_score',
 'home_qt_1_pts',
 'away_qt_1_pts',
 'home_qt_2_pts',
 'away_qt_2_pts',
 'home_qt_3_pts',
 'away_qt_3_pts',
 'home_qt_4_pts',
 'away_qt_4_pts',
 'home_ot_1_pts',
 'away_ot_1_pts',
 'home_ot_2_pts',
 'away_ot_2_pts',
 'home_ot_3_pts',
 'away_ot_3_pts',
 'boxscore_url',
 'minutes',
 'home_ast',
 'home_drbd',
 'home_orbd',
 'home_2pa',
 'home_2pm',
 'home_3pa',
 'home_3pm',
 'home_fta',
 'home_ftm',
 'home_blk',
 'home_stl',
 'home_tov',
 'home_pf',
 'home_pfd',
 'away_ast',
 'away_drbd',
 'away_orbd',
 'away_2pa',
 'away_2pm',
 'away_3pa',
 'away_3pm',
 'away_fta',
 'away_ftm',
 'away_blk',
 'away_stl',
 'away_tov',
 'away_pf',
 'away_pfd',
 'home_team_wins',
 'home_team_losses',
 'away_team_wins',
 'away_team_losses',
 'home_wining_percentage',
 'away_wining_percentage',
 'home_possessions',
 'away_possessions',
 'home_pace',
 'away_pace',
 'home_ortg',
 'away_ortg',
 'home_drtg',
 'away_drtg',
 'home_nrtg',
 

In [18]:
# Statistics describing the current game itself; each exists as `home_<stat>` and `away_<stat>`.
current_game_statistics = (
    ['score']
    + ['qt_' + str(quarter) + '_pts' for quarter in range(1, 5)]
    + ['ot_' + str(overtime) + '_pts' for overtime in range(1, 4)]
    + ['ast', 'drbd', 'orbd', '2pa', '2pm', '3pa', '3pm', 'fta', 'ftm', 'blk', 'stl', 'tov', 'pf', 'pfd']
    + ['possessions', 'pace', 'ortg', 'drtg', 'nrtg']
)

# Game-level columns plus the home/away version of every statistic above.
current_game_columns = ['boxscore_url', 'minutes'] + [
    side + '_' + statistic
    for statistic in current_game_statistics
    for side in ('home', 'away')
]

# The pre-game records, pre-game averages, budgets and the `home_team_win` target are kept.
raw_data = raw_data.drop(columns=current_game_columns)

### Saving processed data

In [19]:
# Write the prepared dataset to disk, without the dataframe index.
raw_data.to_csv('data/processed_data.csv', index=False)