## Data Prep 

Import Needed libraries

In [208]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
np.random.seed(42)

In [209]:
#Import dataset
baseball = pd.read_csv("baseball.csv")


In [210]:
baseball.head()

Unnamed: 0,attendance_binary,previous_attendance,previous_away_team_errors,previous_away_team_hits,previous_away_team_runs,game_type,previous_game_type,previous_home_team_errors,previous_home_team_hits,previous_home_team_runs,game_day,previous_game_day,temperature,wind_speed,sky,previous_game_duration,previous_homewin
0,0,43683,2,6,2,Night Game,Day Game,0,6,6,Wednesday,Monday,55,24,Overcast,2.933333,1
1,0,45785,0,7,2,Night Game,Day Game,0,10,3,Wednesday,Monday,48,7,Unknown,2.8,1
2,0,48282,0,8,4,Night Game,Day Game,2,4,3,Wednesday,Monday,65,10,Cloudy,3.383333,0
3,0,21830,0,9,6,Day Game,Night Game,0,15,11,Wednesday,Tuesday,77,0,In Dome,3.233333,1
4,0,49289,2,4,2,Night Game,Day Game,1,1,3,Tuesday,Monday,81,12,Cloudy,2.633333,1


## Splitting Data into Train and Test

In [211]:
train_set, test_set = train_test_split(baseball, test_size=0.3)

In [212]:
#Dropping usless columns, since this column is not useful for binary classification
train = train_set
test = test_set

## Checking our dataset for missing values

In [213]:
train_set.isna().sum()

attendance_binary            0
previous_attendance          0
previous_away_team_errors    0
previous_away_team_hits      0
previous_away_team_runs      0
game_type                    0
previous_game_type           0
previous_home_team_errors    0
previous_home_team_hits      0
previous_home_team_runs      0
game_day                     0
previous_game_day            0
temperature                  0
wind_speed                   0
sky                          0
previous_game_duration       0
previous_homewin             0
dtype: int64

In [214]:
test_set.isna().sum()

attendance_binary            0
previous_attendance          0
previous_away_team_errors    0
previous_away_team_hits      0
previous_away_team_runs      0
game_type                    0
previous_game_type           0
previous_home_team_errors    0
previous_home_team_hits      0
previous_home_team_runs      0
game_day                     0
previous_game_day            0
temperature                  0
wind_speed                   0
sky                          0
previous_game_duration       0
previous_homewin             0
dtype: int64

#### Since we have no missing values we do not need to performe any cleaning process

# Data Preperation

In [215]:
# Importing needed libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Seperating target variable to prevent any transformation on it

In [216]:
train_y = train[['attendance_binary']]
test_y = test[['attendance_binary']]

train_inputs = train.drop(['attendance_binary'], axis=1)
test_inputs = test.drop(['attendance_binary'], axis=1)

### Identifying and Seperating Numeric and Categorical and Binary columns

In [217]:
train_inputs.dtypes

previous_attendance            int64
previous_away_team_errors      int64
previous_away_team_hits        int64
previous_away_team_runs        int64
game_type                     object
previous_game_type            object
previous_home_team_errors      int64
previous_home_team_hits        int64
previous_home_team_runs        int64
game_day                      object
previous_game_day             object
temperature                    int64
wind_speed                     int64
sky                           object
previous_game_duration       float64
previous_homewin               int64
dtype: object

In [218]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

# Identify the binary columns so we can pass them through without transforming
binary_columns = ['previous_homewin']

In [219]:
numeric_columns

['previous_attendance',
 'previous_away_team_errors',
 'previous_away_team_hits',
 'previous_away_team_runs',
 'previous_home_team_errors',
 'previous_home_team_hits',
 'previous_home_team_runs',
 'temperature',
 'wind_speed',
 'previous_game_duration',
 'previous_homewin']

In [220]:
categorical_columns

['game_type', 'previous_game_type', 'game_day', 'previous_game_day', 'sky']

In [221]:
binary_columns

['previous_homewin']

In [222]:
# Removing binary columns from numeric columns
for col in binary_columns:
    numeric_columns.remove(col)

In [223]:
numeric_columns

['previous_attendance',
 'previous_away_team_errors',
 'previous_away_team_hits',
 'previous_away_team_runs',
 'previous_home_team_errors',
 'previous_home_team_hits',
 'previous_home_team_runs',
 'temperature',
 'wind_speed',
 'previous_game_duration']

# Piplining

In [224]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [225]:
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [226]:
# Creating our Preprocessor
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)],
        remainder='passthrough')

# Transform

### Fit_Transform() for Train set

In [227]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-1.12371621,  0.57666325, -0.20719118, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.05787587, -0.72716391, -0.78017264, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.38394295,  0.57666325, -1.35315411, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.52534761,  0.57666325, -0.78017264, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.94392488, -0.72716391,  0.65228102, ...,  0.        ,
         0.        ,  0.        ],
       [-0.98407336,  1.88049041, -0.49368191, ...,  1.        ,
         0.        ,  1.        ]])

In [228]:
train_x.shape

(1698, 37)

### Transform() For Test set

In [229]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 0.0814842 ,  1.88049041, -1.06666338, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.17001331, -0.72716391, -1.63964484, ...,  0.        ,
         0.        ,  0.        ],
       [-0.29905768, -0.72716391, -0.20719118, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.49582715, -0.72716391,  1.22526249, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.26445059, -0.72716391, -0.20719118, ...,  0.        ,
         0.        ,  1.        ],
       [-1.71775245, -0.72716391, -1.63964484, ...,  0.        ,
         0.        ,  0.        ]])

In [230]:
test_x.shape

(729, 37)

#### Number of Columns in both Train and Test are equal which means we can start building our models

## Find the Baseline

### First we need to find Base Line Accuracy so we can compare the accuracy of our model against baseline

In [231]:
# Find majority class
train_y.value_counts()

attendance_binary
1                    873
0                    825
dtype: int64

#### The majority is 1

In [232]:
# Find percentage
train_y.value_counts()/len(train_y)

attendance_binary
1                    0.514134
0                    0.485866
dtype: float64

#### According to our  result the base line is 52%. So any model that does not perform better than 52% is not a valueable model

# Section 2: (3 points in total)

Build three different SVM models (by changing the kernels, regularization, etc.). Generate their training and test values. Each model is worth 1 point. 

(Add cells as needed)

## SVM Model 1: linearSVC

In [233]:
# Importing SVM from sklearn
from sklearn.svm import LinearSVC 

In [388]:
# Creating our model
svm_linear = LinearSVC(C=1)

svm_linear.fit(train_x, train_y)

  return f(*args, **kwargs)


LinearSVC(C=1)

### Calculating the accuracy of our model

In [389]:
# Importing accuracy score from sklearn
from sklearn.metrics import accuracy_score

In [392]:
#Predict the train values
train_y_pred = svm_linear.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8439340400471143

In [394]:
#Predict the test values
test_y_pred = svm_linear.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8148148148148148

## Confusion Matrix

In [242]:
# We can use Confusion matrix to check the accuracy of our model further
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[273,  70],
       [ 66, 320]], dtype=int64)

## Classification Report

In [243]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       343
           1       0.82      0.83      0.82       386

    accuracy                           0.81       729
   macro avg       0.81      0.81      0.81       729
weighted avg       0.81      0.81      0.81       729



#### Our model performs better than baseline and there is no overfitting. however there might be underfitting, we can check that by creating more models

## SVM Model 2: Linear SVC with Polynomial Terms

In [244]:
from sklearn.preprocessing import PolynomialFeatures

# Create second degree terms
poly_features = PolynomialFeatures(degree=1, include_bias=False)

train_x_poly = poly_features.fit_transform(train_x)

#Don't forget to transform the test set
test_x_poly = poly_features.transform(test_x)


In [245]:
pol_svm = LinearSVC(C=1)

pol_svm.fit(train_x_poly, train_y)

  return f(*args, **kwargs)


LinearSVC(C=1)

In [395]:
#Predict the train values
train_y_poly_pred = pol_svm.predict(train_x_poly)

#Train accuracy
accuracy_score(train_y, train_y_poly_pred)

0.8439340400471143

In [396]:
#Predict the test values
test_y_poly_pred = pol_svm.predict(test_x_poly)

#Test accuracy
accuracy_score(test_y, test_y_poly_pred)

0.8148148148148148

#### Eventhough our polynomial model performs better (99%) with degree of 3 on train set but we will have overfitting since our test set accuracy will be 76%. So by reducing degree to 1 and c value to 1, we have a better accuracy on test set (81%). By doing so we will deal with overfitting issue

## SVM Model 3: SVC(kernel='linear')


In [250]:
from sklearn.svm import SVC
 
lin_svm2 = SVC(kernel="linear")

lin_svm2.fit(train_x, train_y)

  return f(*args, **kwargs)


SVC(kernel='linear')

In [397]:
#Predict the train values
train_y_pred = lin_svm2.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8445229681978799

In [398]:
#Predict the test values
test_y_pred = lin_svm2.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8175582990397805

#### Our model does not perform better that linear one

## SVM Model 4: SVC(kernel='rbf')


In [377]:
rbf_svm = SVC(kernel="rbf", C=10, gamma='scale')

rbf_svm.fit(train_x, train_y)

  return f(*args, **kwargs)


SVC(C=10)

In [400]:
#Predict the train values
train_y_pred = rbf_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9705535924617197

In [401]:
#Predict the test values
test_y_pred = rbf_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.7997256515775034

#### In rbf model with C value of 10 we face overfitting issue since our train set accuracy is 97% but the test accuracy is only 79%. We can reduce C to 1 which will give us 88% accuracy on train and 82% accuracy on test set. the amount of overfitting drastically reduced.

# Section 3: (3 points in total)

Build two different SGD models (by changing the penalty, etc. or adding polynomial terms) and one LogisticRregression model. Generate their training and test values. Each model is worth 1 point.

(Add cells as needed)

## SGD Model 1:

In [418]:
from sklearn.linear_model import SGDClassifier 

# tol = stopping criterion
# eta0 = learning rate
# penalty = regularization term
# max_iter = number of passes over training data (i.e., epochs)

sgd_logreg = SGDClassifier(random_state=1 ,max_iter=1000, penalty=None, eta0=0.0001, tol=0.0001) 

sgd_logreg.fit(train_x, train_y)

  return f(*args, **kwargs)


SGDClassifier(eta0=0.0001, penalty=None, random_state=1, tol=0.0001)

In [419]:
#Predict the train values
train_y_pred = sgd_logreg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)


0.8286219081272085

In [420]:
#Predict the test values
test_y_pred = sgd_logreg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.803840877914952

#### Our SGD model accuracy is 81% for train and 79% for test which is not better than our previous models so eventhough this model is performing better than baseline and there is no overfitting but we have underfitting.

## SGD Model 2: penalty='elasticnet'

In [374]:
from sklearn.linear_model import SGDClassifier 

# tol = stopping criterion
# eta0 = learning rate
# penalty = regularization term
# max_iter = number of passes over training data (i.e., epochs)

sgd_logreg2 = SGDClassifier(max_iter=1000, penalty='elasticnet', eta0=0.01, tol=0.00001) 

sgd_logreg2.fit(train_x, train_y)

  return f(*args, **kwargs)


SGDClassifier(eta0=0.01, penalty='elasticnet', tol=1e-05)

In [375]:
#Predict the train values
train_y_pred = sgd_logreg2.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)


0.8244994110718492

In [376]:
#Predict the test values
test_y_pred = sgd_logreg2.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.7846364883401921

#### this models did not perform better than previus ones.

## LogisticRegression Model:

In [367]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='none')

log_reg.fit(train_x, train_y)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(penalty='none')

In [368]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8468786808009423

In [369]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8161865569272977

#### Eventhough there is a slight overfitting problem but the logistic regressionmodel did not performe better.

# Discussion (3 points in total)


## List the train and test values of each model you built (1 point)

## Which model performs the best and why? (0.5 points) How does it compare to baseline? (0.5 points)

Hint: The best model is the one that has the highest TEST score (regardless of any of the training values). If you select your model based on TRAIN values, you will lose points.

## Is there any evidence of overfitting in the best model, why or why not? If there is, what did you do about it? (0.5 points)

## Is there any evidence of overfitting in the other models (besides the best model), why or why not? If there is, what did you do about it? (0.5 points)