In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import string
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [2]:
# Load artifacts from file
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


## Pre-processed pitchers full dataset
pit = _read_pickle('Artifacts/pit_data.pkl')

## Pre-processed hitters full dataset
hit = _read_pickle('Artifacts/hit_data.pkl')

## Various tfidf vectorized datasets (pitchers, then hitters)
pit_tfidf1_features = _read_pickle('Artifacts/pit_tfidf1_features.pkl')
pit_tfidf2_features = _read_pickle('Artifacts/pit_tfidf2_features.pkl')
pit_tfidf3_features = _read_pickle('Artifacts/pit_tfidf3_features.pkl')
hit_tfidf1_features = _read_pickle('Artifacts/hit_tfidf1_features.pkl')
hit_tfidf2_features = _read_pickle('Artifacts/hit_tfidf2_features.pkl')
hit_tfidf3_features = _read_pickle('Artifacts/hit_tfidf3_features.pkl')

# Pitcher Models

In [3]:
# Pitchers, 1gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf1_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_pit_1gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_1gram_1_predictions = logit_pit_1gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_1gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_1gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_1gram_1_predictions))

Accuracy: 0.804
[[104  10   1]
 [ 12  11   0]
 [  4   2   4]]
              precision    recall  f1-score   support

         1.0       0.87      0.90      0.89       115
         2.0       0.48      0.48      0.48        23
         3.0       0.80      0.40      0.53        10

    accuracy                           0.80       148
   macro avg       0.71      0.59      0.63       148
weighted avg       0.80      0.80      0.80       148



In [4]:
# Pitchers, 1gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf1_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_pit_1gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_1gram_2_predictions = logit_pit_1gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_1gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_1gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_1gram_2_predictions))

Accuracy: 0.804
[[109   6   0]
 [ 13  10   0]
 [  4   6   0]]
              precision    recall  f1-score   support

         1.0       0.87      0.95      0.90       115
         2.0       0.45      0.43      0.44        23
         3.0       0.00      0.00      0.00        10

    accuracy                           0.80       148
   macro avg       0.44      0.46      0.45       148
weighted avg       0.74      0.80      0.77       148



  'precision', 'predicted', average, warn_for)


In [5]:
# Pitchers, 2gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf2_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_pit_2gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_2gram_1_predictions = logit_pit_2gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_2gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_2gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_2gram_1_predictions))

Accuracy: 0.811
[[104  10   1]
 [ 11  12   0]
 [  3   3   4]]
              precision    recall  f1-score   support

         1.0       0.88      0.90      0.89       115
         2.0       0.48      0.52      0.50        23
         3.0       0.80      0.40      0.53        10

    accuracy                           0.81       148
   macro avg       0.72      0.61      0.64       148
weighted avg       0.81      0.81      0.81       148



In [6]:
# Pitchers, 2gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf2_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_pit_2gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_2gram_2_predictions = logit_pit_2gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_2gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_2gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_2gram_2_predictions))

Accuracy: 0.777
[[97 17  1]
 [ 9 14  0]
 [ 3  3  4]]
              precision    recall  f1-score   support

         1.0       0.89      0.84      0.87       115
         2.0       0.41      0.61      0.49        23
         3.0       0.80      0.40      0.53        10

    accuracy                           0.78       148
   macro avg       0.70      0.62      0.63       148
weighted avg       0.81      0.78      0.79       148





In [7]:
# Pitchers, 3gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf3_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_pit_3gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_3gram_1_predictions = logit_pit_3gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_3gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_3gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_3gram_1_predictions))

Accuracy: 0.804
[[103  11   1]
 [ 11  12   0]
 [  3   3   4]]
              precision    recall  f1-score   support

         1.0       0.88      0.90      0.89       115
         2.0       0.46      0.52      0.49        23
         3.0       0.80      0.40      0.53        10

    accuracy                           0.80       148
   macro avg       0.71      0.61      0.64       148
weighted avg       0.81      0.80      0.80       148



In [8]:
# Pitchers, 3gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf3_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_pit_3gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_pit_3gram_2_predictions = logit_pit_3gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_pit_3gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_pit_3gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_pit_3gram_2_predictions))

Accuracy: 0.764
[[95 19  1]
 [ 8 14  1]
 [ 3  3  4]]
              precision    recall  f1-score   support

         1.0       0.90      0.83      0.86       115
         2.0       0.39      0.61      0.47        23
         3.0       0.67      0.40      0.50        10

    accuracy                           0.76       148
   macro avg       0.65      0.61      0.61       148
weighted avg       0.80      0.76      0.78       148





# Hitter Models

In [9]:
# Hitters, 1gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf1_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_hit_1gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_1gram_1_predictions = logit_hit_1gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_1gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_1gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_1gram_1_predictions))

Accuracy: 0.617
[[70 21  8]
 [17 26  3]
 [ 3 10  4]]
              precision    recall  f1-score   support

         1.0       0.78      0.71      0.74        99
         2.0       0.46      0.57      0.50        46
         3.0       0.27      0.24      0.25        17

    accuracy                           0.62       162
   macro avg       0.50      0.50      0.50       162
weighted avg       0.63      0.62      0.62       162



In [10]:
# Hitters, 1gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf1_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_hit_1gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_1gram_2_predictions = logit_hit_1gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_1gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_1gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_1gram_2_predictions))

Accuracy: 0.617
[[72 27  0]
 [18 28  0]
 [ 4 13  0]]
              precision    recall  f1-score   support

         1.0       0.77      0.73      0.75        99
         2.0       0.41      0.61      0.49        46
         3.0       0.00      0.00      0.00        17

    accuracy                           0.62       162
   macro avg       0.39      0.45      0.41       162
weighted avg       0.59      0.62      0.60       162



  'precision', 'predicted', average, warn_for)


In [11]:
# Hitters, 2gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf2_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_hit_2gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_2gram_1_predictions = logit_hit_2gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_2gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_2gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_2gram_1_predictions))

Accuracy: 0.630
[[70 24  5]
 [16 28  2]
 [ 2 11  4]]
              precision    recall  f1-score   support

         1.0       0.80      0.71      0.75        99
         2.0       0.44      0.61      0.51        46
         3.0       0.36      0.24      0.29        17

    accuracy                           0.63       162
   macro avg       0.53      0.52      0.52       162
weighted avg       0.65      0.63      0.63       162



In [12]:
# Hitters, 2gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf2_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_hit_2gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_2gram_2_predictions = logit_hit_2gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_2gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_2gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_2gram_2_predictions))

Accuracy: 0.617
[[70 26  3]
 [17 29  0]
 [ 5 11  1]]
              precision    recall  f1-score   support

         1.0       0.76      0.71      0.73        99
         2.0       0.44      0.63      0.52        46
         3.0       0.25      0.06      0.10        17

    accuracy                           0.62       162
   macro avg       0.48      0.47      0.45       162
weighted avg       0.62      0.62      0.60       162





In [13]:
# Hitters, 3gram tfidf features, lbfgs solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf3_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator
logit_hit_3gram_1 = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    penalty='l2',
    class_weight='balanced',
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_3gram_1_predictions = logit_hit_3gram_1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_3gram_1_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_3gram_1_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_3gram_1_predictions))

Accuracy: 0.636
[[70 24  5]
 [15 29  2]
 [ 2 11  4]]
              precision    recall  f1-score   support

         1.0       0.80      0.71      0.75        99
         2.0       0.45      0.63      0.53        46
         3.0       0.36      0.24      0.29        17

    accuracy                           0.64       162
   macro avg       0.54      0.52      0.52       162
weighted avg       0.66      0.64      0.64       162



In [14]:
# Hitters, 3gram tfidf features, saga solver

# Hold out 25% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf3_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Build and fit in one expression; fit() hands back the fitted estimator.
# This variant is allowed up to 1000 iterations.
logit_hit_3gram_2 = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
).fit(X_train, y_train)

# Predict on the held-out samples and tabulate predictions against the labels
logit_hit_3gram_2_predictions = logit_hit_3gram_2.predict(X_test)
cnf_matrix = confusion_matrix(y_test, logit_hit_3gram_2_predictions)

# Report accuracy, the confusion matrix and the per-class report
print("Accuracy: %0.3f" % accuracy_score(y_test, logit_hit_3gram_2_predictions))
print(cnf_matrix)
print(classification_report(y_test, logit_hit_3gram_2_predictions))

Accuracy: 0.636
[[73 25  1]
 [16 30  0]
 [ 4 13  0]]
              precision    recall  f1-score   support

         1.0       0.78      0.74      0.76        99
         2.0       0.44      0.65      0.53        46
         3.0       0.00      0.00      0.00        17

    accuracy                           0.64       162
   macro avg       0.41      0.46      0.43       162
weighted avg       0.60      0.64      0.61       162





In [30]:
# Save best model(s)
# Persist the fitted pitcher 2gram / lbfgs classifier as a pickle artifact.

with open('Artifacts/logit_pit_2gram_1.pkl', 'wb') as model_file:
    pickle.dump(logit_pit_2gram_1, model_file)