In [None]:
# Import dependencies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Global random seed: passed as `random_state` to train_test_split and to every
# GradientBoostingClassifier below so the split and the fitted models are repeatable
yogi = 8

In [None]:
# Load the pitch data from CSV and discard every row that has a missing value.
# dropna() keeps the surviving rows' original labels, so the index may have gaps.
file_path = '../Data/Pitchers/cole.csv'
cole_df = pd.read_csv(file_path).dropna()

# Preview the first ten remaining rows
cole_df.head(10)

In [None]:
# Remove the 'Unnamed: 0' and 'pitch_name' columns
cole_df = cole_df.drop(columns = ['Unnamed: 0', 'pitch_name'])

In [None]:
# Collect the names of all object-dtype (categorical) columns ...
pitch_cat = [name for name, kind in cole_df.dtypes.items() if kind == 'object']
# ... except the pitcher name, which is not one-hot encoded
pitch_cat.remove('player_name')
pitch_cat

In [None]:
# Create a OneHotEncoder instance that returns a dense array.
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so try the current
# name first and fall back to the old one on older installs.
try:
    enc = OneHotEncoder(sparse_output = False)
except TypeError:
    enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cole_df[pitch_cat]))

# Add the encoded variable names to the DataFrame.
# `get_feature_names` was superseded by `get_feature_names_out` (added in
# scikit-learn 1.0, old method removed in 1.2); use whichever is available.
if hasattr(enc, 'get_feature_names_out'):
    encode_df.columns = enc.get_feature_names_out(pitch_cat)
else:
    encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

In [None]:
# Merge one-hot encoded features and drop the originals.
#
# encode_df was built from a bare array, so it carries a fresh 0..n-1 index,
# whereas cole_df kept its original row labels when dropna() removed rows.
# Merging those two indexes as-is would pair encoded values with the wrong
# pitches and silently discard rows whose label has no match. The encoded rows
# are in the same order as cole_df, so re-label them with cole_df's index first
# (set_index raises if the two lengths ever differ).
#
# The columns to drop are passed by keyword: the positional `axis` argument of
# DataFrame.drop was removed in pandas 2.0.
cole_df = cole_df.merge(
    encode_df.set_index(cole_df.index),
    left_index = True,
    right_index = True
).drop(columns = pitch_cat)

cole_df.head()

In [None]:
# Drop the 'stand_L' dummy column
# (presumably redundant with the remaining 'stand_*' indicator - TODO confirm)
cole_df = cole_df.drop(columns = ['stand_L'])
cole_df.head()


### Compiling, Training, and Testing Data

In [None]:
# Feature matrix: every column except the target ('zone'), the pitcher name,
# and the other columns listed here
non_feature_cols = ['player_name', 'zone', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp']
X = cole_df.drop(columns = non_feature_cols)
X.head()

In [None]:
# Target vector: the values of the 'zone' column
y = cole_df.loc[:, 'zone'].values
y[:5]

In [None]:
# Split the data into training and testing sets with a fixed seed.
# No `stratify` argument is passed, so this is a plain random split;
# test_size = 0.25 is scikit-learn's default written out explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = yogi
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only,
# then the same fitted transform is applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

### Find Best Learning Rate

In [None]:
# Fit one gradient-boosting model per candidate learning rate and remember the
# rate with the highest accuracy on the held-out split.
# NOTE(review): the rate is selected on the same X_test_scaled / y_test split
# that the later cells use to report the final metrics, so those metrics are
# optimistically biased; a separate validation split or cross-validation would
# avoid that.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)

    # Score each split exactly once and reuse the values below, instead of
    # re-running prediction over the test set for the comparison and the update
    train_acc = classifier.score(X_train_scaled, y_train)
    test_acc = classifier.score(X_test_scaled, y_test)

    # Report the scores
    print(f'Learning Rate: {learning_rate}')
    print('Accuracy Score (training): {0:.3f}'.format(train_acc))
    print('Accuracy Score (validation): {0:.3f}'.format(test_acc))
    print()

    # Strict '>' keeps the earliest rate in the list when accuracies tie
    if test_acc > best_acc:
        best_acc = test_acc
        best_rate = learning_rate

### Create Model with Best Learning Rate

In [None]:
# Re-create the classifier with the learning rate chosen by the search above.
# Every other hyperparameter must match the ones used in that search (it ran
# with max_depth = 4); otherwise best_rate was tuned for a different model than
# the one evaluated here.
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 4,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions on the held-out split and show the first 20 next to the
# actual values
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

### Evaluate Model

In [None]:
# Fraction of held-out rows whose predicted zone equals the actual zone
acc_score = accuracy_score(y_true = y_test, y_pred = predictions)
print(f'Accuracy Score: {acc_score}')

In [None]:
# Print the header, then the text report that classification_report builds
# for the held-out predictions
print('Classification Report')
report_text = classification_report(y_true = y_test, y_pred = predictions)
print(report_text)