In [49]:
# Import libraries for analysis
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier


In [4]:
# Import dataset
dodgers = pd.read_csv(r'dodgers-2022.csv')

In [5]:
# View the first five rows of the data
dodgers.head()

Unnamed: 0,month,day,attend,day_of_week,opponent,temp,skies,day_night,cap,shirt,fireworks,bobblehead
0,APR,10,56000,Tuesday,Pirates,67,Clear,Day,NO,NO,NO,NO
1,APR,11,29729,Wednesday,Pirates,58,Cloudy,Night,NO,NO,NO,NO
2,APR,12,28328,Thursday,Pirates,57,Cloudy,Night,NO,NO,NO,NO
3,APR,13,31601,Friday,Padres,54,Cloudy,Night,NO,NO,YES,NO
4,APR,14,46549,Saturday,Padres,57,Cloudy,Night,NO,NO,NO,NO


In [55]:
plt.rcParams["figure.figsize"] = (12,9)

In [56]:
dodgers_1 = pd.get_dummies(dodgers, dtype = int)

In [57]:
dodgers_1.head()

Unnamed: 0,attend,temp,skies_Clear,skies_Cloudy,day_night_Day,day_night_Night,cap_NO,cap_YES,shirt_NO,shirt_YES,fireworks_NO,fireworks_YES,bobblehead_NO,bobblehead_YES
0,56000,67,1,0,1,0,1,0,1,0,1,0,1,0
1,29729,58,0,1,0,1,1,0,1,0,1,0,1,0
2,28328,57,0,1,0,1,1,0,1,0,1,0,1,0
3,31601,54,0,1,0,1,1,0,1,0,0,1,1,0
4,46549,57,0,1,0,1,1,0,1,0,1,0,1,0


In [6]:
# Remove unneeded features
dodgers.drop(columns=['month', 'day', 'day_of_week', 'opponent'], inplace=True)

In [7]:
# View data to ensure features are removed
dodgers.head()

Unnamed: 0,attend,temp,skies,day_night,cap,shirt,fireworks,bobblehead
0,56000,67,Clear,Day,NO,NO,NO,NO
1,29729,58,Cloudy,Night,NO,NO,NO,NO
2,28328,57,Cloudy,Night,NO,NO,NO,NO
3,31601,54,Cloudy,Night,NO,NO,YES,NO
4,46549,57,Cloudy,Night,NO,NO,NO,NO


In [8]:
# Split data into X and y variables
X = dodgers.drop(('attend'), axis=1)
y = dodgers['attend']

In [9]:
# Apply one-hot-encoding to X variable
X = pd.get_dummies(X)

In [10]:
# Convert True and False to 1 and 0
X = X.replace({True:1, False:0})

  X = X.replace({True:1, False:0})


In [11]:
# View the first five rows to ensure one-hot-encoding is properly applied
X.head()

Unnamed: 0,temp,skies_Clear,skies_Cloudy,day_night_Day,day_night_Night,cap_NO,cap_YES,shirt_NO,shirt_YES,fireworks_NO,fireworks_YES,bobblehead_NO,bobblehead_YES
0,67,1,0,1,0,1,0,1,0,1,0,1,0
1,58,0,1,0,1,1,0,1,0,1,0,1,0
2,57,0,1,0,1,1,0,1,0,1,0,1,0
3,54,0,1,0,1,1,0,1,0,0,1,1,0
4,57,0,1,0,1,1,0,1,0,1,0,1,0


<h2><center>Standard Linear Regression</center></h2>

In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

In [13]:
# View the dimension of X train
X_train.shape

(64, 13)

In [14]:
# View the dimension of y train
y_train.shape

(64,)

In [15]:
# Create linear regression object
lm = LinearRegression()

In [16]:
# Fit the data to the linear regression
model = lm.fit(X_train, y_train)

In [17]:
# Create prediction object
y_pred = model.predict(X_test)

In [18]:
r_2 = r2_score(y_test, y_pred)
print(f"R-Squared: {r_2: .2f}")

R-Squared:  0.38


<h2><center>Linear Regression with Standard Scaler</center></h2>

In [19]:
scaler = StandardScaler()

In [20]:
X_train_S = scaler.fit_transform(X_train)

In [21]:
X_test_S = scaler.fit_transform(X_test)

In [22]:
lm_S = LinearRegression()

In [23]:
model_S= lm_S.fit(X_train_S, y_train)

In [24]:
y_pred_S = model_S.predict(X_test_S)

In [25]:
r_2_S = r2_score(y_test, y_pred_S)

In [26]:
r_2_S
print(f"R-Squared: {r_2_S: .2f}")

R-Squared:  0.37


<h2><center>Linear Regression with Min/Max Scaling</center></h2>

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=15)

In [28]:
X_train.shape

(64, 13)

In [29]:
X_test.shape

(17, 13)

In [30]:
min_max= MinMaxScaler()

In [31]:
X_train_m = min_max.fit_transform(X_train)

In [32]:
X_test_m = min_max.fit_transform(X_test)

In [33]:
lm_m = LinearRegression()

In [34]:
model_m = lm_m.fit(X_train_m, y_train)

In [35]:
y_pred_m = model_m.predict(X_test_m)

In [36]:
r_2_m = r2_score(y_test, y_pred_m)
print(f"R-Squared: {r_2_m: .2f}")

R-Squared:  0.06


<h2><center>K-Nearest-Neighbor with Min/Max Scaling</center></h2>

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=30)

In [42]:
scale= MinMaxScaler()

In [43]:
X_train_l = scale.fit_transform(X_train)

In [44]:
X_test_l = scale.fit_transform(X_test)

In [45]:
lr = LogisticRegression()

In [46]:
model_l = lr.fit(X_train_l, y_train)

In [47]:
y_pred_l = model_l.predict(X_test_l)

In [53]:
r2_l = r2_score(y_test, y_pred_l)

In [54]:
print(f"Accuracy: {r2_l: .2f}")

Accuracy:  0.19
