In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw NBA attendance CSV (file named for the 2018_2019 season) into a DataFrame.
raw_attendance_csv = '../../mast30034-project-1-dustintano10/data/raw/nba_attendance_2018_2019.csv'
nba_attendance = pd.read_csv(raw_attendance_csv)

In [3]:
# Preprocess the attendance data in one chain:
#   1. drop the columns at positions 6, 7 and 10 of the loaded frame, which the
#      analysis does not use (their names are not shown in this notebook, so
#      they are addressed by position -- NOTE(review): confirm and switch to names),
#   2. keep only the games where the home team is the New York Knicks,
#   3. rename 'Attend.' to 'Attendance'.
# Because step 1 is positional, this cell is meant to run once per freshly
# loaded frame.
nba_attendance = (
    nba_attendance
    .drop(columns=nba_attendance.columns[[6, 7, 10]])
    .loc[lambda games: games['Home/Neutral'] == 'New York Knicks']
    .rename(columns={'Attend.': 'Attendance'})
)

In [4]:
# Add a 'Win' column: 'Yes' when the home score (PTS.1) is strictly greater
# than the visitor score (PTS), otherwise 'No'. The frame was filtered to
# Knicks home games earlier, so the home side is the Knicks.
home_points = nba_attendance['PTS.1']
visitor_points = nba_attendance['PTS']
nba_attendance['Win'] = np.where(home_points.gt(visitor_points), 'Yes', 'No')

nba_attendance

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Attendance,Arena,Win
6,Wed Oct 17 2018,7:30p,Atlanta Hawks,107,New York Knicks,126,18249,Madison Square Garden (IV),Yes
27,Sat Oct 20 2018,7:30p,Boston Celtics,103,New York Knicks,101,19427,Madison Square Garden (IV),No
67,Fri Oct 26 2018,7:30p,Golden State Warriors,128,New York Knicks,100,19812,Madison Square Garden (IV),No
89,Mon Oct 29 2018,7:30p,Brooklyn Nets,96,New York Knicks,115,19221,Madison Square Garden (IV),Yes
106,Wed Oct 31 2018,8:00p,Indiana Pacers,107,New York Knicks,101,18295,Madison Square Garden (IV),No
142,Mon Nov 5 2018,7:30p,Chicago Bulls,116,New York Knicks,115,19812,Madison Square Garden (IV),No
185,Sun Nov 11 2018,7:30p,Orlando Magic,115,New York Knicks,89,19812,Madison Square Garden (IV),No
250,Tue Nov 20 2018,7:30p,Portland Trail Blazers,118,New York Knicks,114,19812,Madison Square Garden (IV),No
268,Fri Nov 23 2018,7:30p,New Orleans Pelicans,109,New York Knicks,114,18948,Madison Square Garden (IV),Yes
329,Sat Dec 1 2018,5:00p,Milwaukee Bucks,134,New York Knicks,136,19812,Madison Square Garden (IV),Yes


In [5]:
# Break the raw 'Date' text (e.g. 'Wed Oct 17 2018' or 'Mon Nov 5 2018') into
# separate Month / Day / Year string columns. A later cell reassembles them
# into a date key that has the same format as the taxi dataset.

# Remove the trailing a/p marker from the start time ('7:30p' -> '7:30').
# Anchoring on the marker, instead of cutting the last character, means that
# re-running this cell does not eat into the minutes.
nba_attendance['Start (ET)'] = nba_attendance['Start (ET)'].str.replace(r'[ap]$', '', regex=True)

# Parse the parts with a pattern rather than fixed character offsets: the day
# is one or two characters wide, so a fixed slice for the year picks up a
# leading space whenever the day has two digits (' 2018' instead of '2018').
# Rows whose 'Date' does not match the pattern get missing values.
date_parts = nba_attendance['Date'].str.extract(
    r'^\s*\S+\s+(?P<month>[A-Za-z]{3})\s+(?P<day>\d{1,2})\s+(?P<year>\d{4})\s*$'
)

# Three-letter month abbreviation, e.g. 'Oct' (converted to a number later).
nba_attendance['Month'] = date_parts['month']

# Day of month, zero-padded to two characters, e.g. '05'.
nba_attendance['Day'] = date_parts['day'].str.zfill(2)

# Four-digit year with no surrounding whitespace, e.g. '2018'.
nba_attendance['Year'] = date_parts['year']

In [6]:
# Lookup tables that convert the text month and the 12-hour start time into
# numeric-looking strings. Every value stays a string.

# Three-letter month abbreviation -> zero-padded month number. All twelve
# months are listed so that a month outside Oct-Apr is converted as well,
# instead of passing through `replace` unchanged.
month_to_num = {'Jan' : '01', 'Feb' : '02', 'Mar' : '03', 'Apr' : '04',
                'May' : '05', 'Jun' : '06', 'Jul' : '07', 'Aug' : '08',
                'Sep' : '09', 'Oct' : '10', 'Nov' : '11', 'Dec' : '12'}

# 12-hour start time (a/p marker already removed) -> 24-hour time, for every
# minute from 1:00 to 11:59, e.g. '7:30' -> '19:30'. Times in the 12 o'clock
# hour are not listed and therefore stay as they are.
# NOTE(review): like the original four-entry table, this assumes every start
# time is p.m. -- confirm there are no morning games in the data.
time_replace = {f'{hour}:{minute:02d}' : f'{hour + 12}:{minute:02d}'
                for hour in range(1, 12)
                for minute in range(60)}

# `replace` with a dict matches whole values only; anything not in the table
# is left untouched.
nba_attendance['Month'] = nba_attendance['Month'].replace(month_to_num)

nba_attendance['Start (ET)'] = nba_attendance['Start (ET)'].replace(time_replace)
nba_attendance

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Attendance,Arena,Win,Month,Day,Year
6,Wed Oct 17 2018,19:30,Atlanta Hawks,107,New York Knicks,126,18249,Madison Square Garden (IV),Yes,10,17,2018
27,Sat Oct 20 2018,19:30,Boston Celtics,103,New York Knicks,101,19427,Madison Square Garden (IV),No,10,20,2018
67,Fri Oct 26 2018,19:30,Golden State Warriors,128,New York Knicks,100,19812,Madison Square Garden (IV),No,10,26,2018
89,Mon Oct 29 2018,19:30,Brooklyn Nets,96,New York Knicks,115,19221,Madison Square Garden (IV),Yes,10,29,2018
106,Wed Oct 31 2018,20:00,Indiana Pacers,107,New York Knicks,101,18295,Madison Square Garden (IV),No,10,31,2018
142,Mon Nov 5 2018,19:30,Chicago Bulls,116,New York Knicks,115,19812,Madison Square Garden (IV),No,11,5,2018
185,Sun Nov 11 2018,19:30,Orlando Magic,115,New York Knicks,89,19812,Madison Square Garden (IV),No,11,11,2018
250,Tue Nov 20 2018,19:30,Portland Trail Blazers,118,New York Knicks,114,19812,Madison Square Garden (IV),No,11,20,2018
268,Fri Nov 23 2018,19:30,New Orleans Pelicans,109,New York Knicks,114,18948,Madison Square Garden (IV),Yes,11,23,2018
329,Sat Dec 1 2018,17:00,Milwaukee Bucks,134,New York Knicks,136,19812,Madison Square Garden (IV),Yes,12,1,2018


In [7]:
# Overwrite 'Date' with a compact key made by concatenating Year, Month and
# Day (in that order, no separators), each cast to the pandas 'string' dtype.
# The key is intentionally left as text: per the author, converting it to a
# real date here caused problems when merging with the taxi dataset, so that
# conversion is done in the taxi preprocessing instead.
year_text = nba_attendance.Year.astype('string')
month_text = nba_attendance.Month.astype('string')
day_text = nba_attendance.Day.astype('string')
nba_attendance = nba_attendance.assign(Date=year_text + month_text + day_text)

# Signed point difference from the home (Knicks) side: home points minus
# visitor points. A positive value is the margin of victory; a negative value
# is the number of points the Knicks lost by, i.e. what they needed to tie.
nba_attendance['margin_victory/loss'] = nba_attendance['PTS.1'].sub(nba_attendance['PTS'])

nba_attendance

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Attendance,Arena,Win,Month,Day,Year,margin_victory/loss
6,20181017,19:30,Atlanta Hawks,107,New York Knicks,126,18249,Madison Square Garden (IV),Yes,10,17,2018,19
27,20181020,19:30,Boston Celtics,103,New York Knicks,101,19427,Madison Square Garden (IV),No,10,20,2018,-2
67,20181026,19:30,Golden State Warriors,128,New York Knicks,100,19812,Madison Square Garden (IV),No,10,26,2018,-28
89,20181029,19:30,Brooklyn Nets,96,New York Knicks,115,19221,Madison Square Garden (IV),Yes,10,29,2018,19
106,20181031,20:00,Indiana Pacers,107,New York Knicks,101,18295,Madison Square Garden (IV),No,10,31,2018,-6
142,20181105,19:30,Chicago Bulls,116,New York Knicks,115,19812,Madison Square Garden (IV),No,11,5,2018,-1
185,20181111,19:30,Orlando Magic,115,New York Knicks,89,19812,Madison Square Garden (IV),No,11,11,2018,-26
250,20181120,19:30,Portland Trail Blazers,118,New York Knicks,114,19812,Madison Square Garden (IV),No,11,20,2018,-4
268,20181123,19:30,New Orleans Pelicans,109,New York Knicks,114,18948,Madison Square Garden (IV),Yes,11,23,2018,5
329,20181201,17:00,Milwaukee Bucks,134,New York Knicks,136,19812,Madison Square Garden (IV),Yes,12,1,2018,2


In [8]:
# Keep only the columns needed from here on. They are selected by name rather
# than dropping the others by position, so the result does not depend on the
# column order produced by earlier cells and the cell can be re-run safely
# (a positional drop fails once the frame has already been trimmed).
columns_to_keep = ['Date', 'Start (ET)', 'Attendance', 'Win', 'margin_victory/loss']

nba_attendance = nba_attendance.loc[:, columns_to_keep].copy()

nba_attendance

Unnamed: 0,Date,Start (ET),Attendance,Win,margin_victory/loss
6,20181017,19:30,18249,Yes,19
27,20181020,19:30,19427,No,-2
67,20181026,19:30,19812,No,-28
89,20181029,19:30,19221,Yes,19
106,20181031,20:00,18295,No,-6
142,20181105,19:30,19812,No,-1
185,20181111,19:30,19812,No,-26
250,20181120,19:30,19812,No,-4
268,20181123,19:30,18948,Yes,5
329,20181201,17:00,19812,Yes,2


In [9]:
# Write the curated table to disk without the index, then read the file back
# and display it to check what was saved.
curated_attendance_csv = '../../mast30034-project-1-dustintano10/data/curated/nba_attendance_new.csv'
nba_attendance.to_csv(curated_attendance_csv, index=False)
nba_attendance_2 = pd.read_csv(curated_attendance_csv)
nba_attendance_2

Unnamed: 0,Date,Start (ET),Attendance,Win,margin_victory/loss
0,20181017,19:30,18249,Yes,19
1,20181020,19:30,19427,No,-2
2,20181026,19:30,19812,No,-28
3,20181029,19:30,19221,Yes,19
4,20181031,20:00,18295,No,-6
5,20181105,19:30,19812,No,-1
6,20181111,19:30,19812,No,-26
7,20181120,19:30,19812,No,-4
8,20181123,19:30,18948,Yes,5
9,20181201,17:00,19812,Yes,2
