In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw NBA attendance CSV (2018-2019 per the file name) into a DataFrame.
nba_attendance = pd.read_csv(
    filepath_or_buffer='../../mast30034-project-1-dustintano10/data/raw/nba_attendance_2018_2019.csv'
)

In [3]:
# Row count of the attendance data as loaded, before any preprocessing.
len(nba_attendance)

1278

In [4]:
# Preprocess the attendance data:
#   1. drop three columns (selected by position) that are not needed downstream,
#   2. keep only the rows whose 'Home/Neutral' team is the New York Knicks,
#   3. rename 'Attend.' to the more readable 'Attendance'.
nba_attendance = nba_attendance.drop(nba_attendance.columns[[6, 7, 10]], axis=1)

# Take an explicit copy of the filtered rows: the following cells add columns to
# this frame, and assigning into a boolean-filtered slice of the raw data
# triggers pandas' SettingWithCopyWarning.
nba_attendance = nba_attendance[nba_attendance['Home/Neutral'] == 'New York Knicks'].copy()

# Reassign rather than use inplace=True so the rename never mutates a slice.
nba_attendance = nba_attendance.rename(columns={'Attend.': 'Attendance'})

In [5]:
# Add a 'Win' flag: 'Yes' when 'PTS.1' is strictly greater than 'PTS'
# (i.e. the Knicks won, per the original author's note), otherwise 'No'.
nba_attendance['Win'] = np.where(
    nba_attendance['PTS'] < nba_attendance['PTS.1'],
    'Yes',
    'No',
)

In [6]:
# Break the textual 'Date' column into separate Month / Day / Year string
# columns. The purpose is to rebuild a date key later that has the same format
# as the taxi dataset.

# Drop the last character of the start time.
# NOTE(review): presumably the am/pm suffix (e.g. the 'p' in '7:30p') - confirm
# against the raw data. This line is not idempotent: re-running the cell strips
# one more character each time.
nba_attendance['Start (ET)'] = nba_attendance['Start (ET)'].str[:-1]

# 'Date' is whitespace-separated text of the form '<3 chars> <Mon> <day> <year>'
# (e.g. 'Wed Oct 17 2018', as implied by the fixed-width slices used before).
# Splitting on whitespace handles one- and two-digit days uniformly; the former
# fixed slice [10:15] left a leading space in 'Year' whenever the day had two
# digits, which then leaked into the rebuilt date key.
date_parts = nba_attendance['Date'].str.split()

# Three-letter month abbreviation (second token).
nba_attendance['Month'] = date_parts.str[1]

# Day of month (third token), left-padded to two digits ('1' -> '01').
nba_attendance['Day'] = date_parts.str[2].str.zfill(2)

# Four-digit year (fourth token), free of surrounding whitespace.
nba_attendance['Year'] = date_parts.str[3]

In [7]:
# Lookup tables used to normalise two string columns (results stay strings):
#   month_to_num - three-letter month abbreviation -> zero-padded month number
#   time_replace - start time as stored in the data -> 24-hour clock time
month_to_num = dict(
    Oct='10',
    Nov='11',
    Dec='12',
    Jan='01',
    Feb='02',
    Mar='03',
    Apr='04',
)

time_replace = {
    '7:30': '19:30',
    '8:00': '20:00',
    '5:00': '17:00',
    '1:00': '13:00',
}

# Whole-value substitution; entries that are not keys of a table stay unchanged.
nba_attendance['Month'] = nba_attendance['Month'].replace(to_replace=month_to_num)

nba_attendance['Start (ET)'] = nba_attendance['Start (ET)'].replace(to_replace=time_replace)

In [8]:
# Overwrite 'Date' with the concatenation Year + Month + Day (in that order,
# no separator). The value is deliberately kept as text: the author reports
# problems merging with the taxi dataset when it was a real date type, so the
# conversion is done in the taxi preprocessing instead.
nba_attendance = nba_attendance.assign(
    Date=lambda frame: (
        frame['Year'].astype('string')
        + frame['Month'].astype('string')
        + frame['Day'].astype('string')
    )
)

# Point difference 'PTS.1' - 'PTS' for each game, showing how large the wins
# and losses were. A negative value is the number of points the Knicks lost by,
# i.e. how many more points they needed to tie the score.
home_points = nba_attendance['PTS.1']
nba_attendance['margin_victory/loss'] = home_points - nba_attendance['PTS']

In [9]:
# Keep only the columns that make up the final dataset, selected by name.
# Selecting by name (instead of dropping by position) does not depend on the
# column order produced by the earlier cells and is safe to re-run: the
# positional drop raised an IndexError on a second execution because the
# dropped positions no longer existed.
# The list matches the columns, and their order, of the final table displayed
# in the next cell.
nba_attendance = nba_attendance[
    ['Date', 'Start (ET)', 'Attendance', 'Win', 'margin_victory/loss']
]

In [10]:
# final product after preprocessing
# (a bare expression as the last line of a cell makes the notebook render the
# DataFrame as the cell output)

nba_attendance

Unnamed: 0,Date,Start (ET),Attendance,Win,margin_victory/loss
6,20181017,19:30,18249,Yes,19
27,20181020,19:30,19427,No,-2
67,20181026,19:30,19812,No,-28
89,20181029,19:30,19221,Yes,19
106,20181031,20:00,18295,No,-6
142,20181105,19:30,19812,No,-1
185,20181111,19:30,19812,No,-26
250,20181120,19:30,19812,No,-4
268,20181123,19:30,18948,Yes,5
329,20181201,17:00,19812,Yes,2


In [12]:
# Row count of the dataset once preprocessing is complete.
len(nba_attendance)

41

In [13]:
# Write the preprocessed data to the curated CSV; the row index is not written.
nba_attendance.to_csv(
    path_or_buf='../../mast30034-project-1-dustintano10/data/curated/nba_attendance_new.csv',
    index=False,
)