# Data Processing

In [8]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'../data')


## games.csv

In [9]:
games = pd.read_csv(DATAPATH / "interim" /"games.csv")
games.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610613000.0,126.0,0.484,0.926,0.382,25.0,46.0,1610613000.0,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610613000.0,120.0,0.488,0.952,0.457,16.0,40.0,1610613000.0,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610613000.0,114.0,0.482,0.786,0.313,22.0,37.0,1610613000.0,106.0,0.47,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,1610612755,1610612765,2022,1610613000.0,113.0,0.441,0.909,0.297,27.0,49.0,1610613000.0,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,1610612737,1610612741,2022,1610613000.0,108.0,0.429,1.0,0.378,22.0,47.0,1610613000.0,110.0,0.5,0.773,0.292,20.0,47.0,0


In [10]:
games.shape

(27493, 21)

**Clean Data**

In [11]:
#remove preseason games (GAME_ID begins with a 1)
games = games[games['GAME_ID'] > 20000000]

#flag postseason games (GAME_ID begins with >2)
games['PLAYOFF'] = (games['GAME_ID'] >= 30000000).astype('int8')

#remove duplicates (each GAME_ID should be unique)
games = games[~games.duplicated(subset=['GAME_ID'])]

#drop unnecessary fields
drop_fields = ['GAME_STATUS_TEXT', 'TEAM_ID_home', 'TEAM_ID_away']
games = games.drop(drop_fields,axis=1)

games
    

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PLAYOFF
0,2022-12-22,22200477,1610612740,1610612759,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1,0
1,2022-12-22,22200478,1610612762,1610612764,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1,0
2,2022-12-21,22200466,1610612739,1610612749,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.470,0.682,0.433,20.0,46.0,1,0
3,2022-12-21,22200467,1610612755,1610612765,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1,0
4,2022-12-21,22200468,1610612737,1610612741,2022,108.0,0.429,1.000,0.378,22.0,47.0,110.0,0.500,0.773,0.292,20.0,47.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27488,2023-04-16 00:00:00,42200101,1610612749,1610612748,2022,117.0,49.500,72.700,24.400,29.0,38.0,130.0,59.500,71.400,60.000,32.0,36.0,0,1
27489,2023-04-15 00:00:00,42200131,1610612739,1610612752,2022,97.0,43.400,71.400,32.300,20.0,38.0,101.0,42.000,86.400,27.600,18.0,51.0,0,1
27490,2023-04-15 00:00:00,42200111,1610612738,1610612737,2022,112.0,47.700,83.300,39.400,25.0,58.0,99.0,38.800,81.800,17.200,23.0,45.0,1,1
27491,2023-04-15 00:00:00,42200121,1610612755,1610612751,2022,121.0,47.200,100.000,48.800,32.0,38.0,101.0,55.700,66.700,44.800,23.0,35.0,1,1


In [19]:
games['TARGET'] = games['HOME_TEAM_WINS']

games

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS,PLAYOFF,TARGET
0,2022-12-22,22200477,1610612740,1610612759,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1,0,1
1,2022-12-22,22200478,1610612762,1610612764,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1,0,1
2,2022-12-21,22200466,1610612739,1610612749,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.470,0.682,0.433,20.0,46.0,1,0,1
3,2022-12-21,22200467,1610612755,1610612765,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1,0,1
4,2022-12-21,22200468,1610612737,1610612741,2022,108.0,0.429,1.000,0.378,22.0,47.0,110.0,0.500,0.773,0.292,20.0,47.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27488,2023-04-16 00:00:00,42200101,1610612749,1610612748,2022,117.0,49.500,72.700,24.400,29.0,38.0,130.0,59.500,71.400,60.000,32.0,36.0,0,1,0
27489,2023-04-15 00:00:00,42200131,1610612739,1610612752,2022,97.0,43.400,71.400,32.300,20.0,38.0,101.0,42.000,86.400,27.600,18.0,51.0,0,1,0
27490,2023-04-15 00:00:00,42200111,1610612738,1610612737,2022,112.0,47.700,83.300,39.400,25.0,58.0,99.0,38.800,81.800,17.200,23.0,45.0,1,1,1
27491,2023-04-15 00:00:00,42200121,1610612755,1610612751,2022,121.0,47.200,100.000,48.800,32.0,38.0,101.0,55.700,66.700,44.800,23.0,35.0,1,1,1


**Save Train Data**

In [20]:
games.to_csv(DATAPATH / "interim" /"transformed.csv",index=False)