In [1]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import string
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [2]:
#Load artifacts from file
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    # NOTE: pickle.load can execute arbitrary code; only point this at
    # artifact files produced by a trusted source.
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


##Pre-Processed pitchers full dataset
pit = _read_pickle('Artifacts/pit_data.pkl')

##Pre-Processed hitters full dataset
hit = _read_pickle('Artifacts/hit_data.pkl')

##Various tfidf vectorized datasets (suffix 1/2/3 matches the 1gram/2gram/3gram cells below)
pit_tfidf1_features = _read_pickle('Artifacts/pit_tfidf1_features.pkl')
pit_tfidf2_features = _read_pickle('Artifacts/pit_tfidf2_features.pkl')
pit_tfidf3_features = _read_pickle('Artifacts/pit_tfidf3_features.pkl')
hit_tfidf1_features = _read_pickle('Artifacts/hit_tfidf1_features.pkl')
hit_tfidf2_features = _read_pickle('Artifacts/hit_tfidf2_features.pkl')
hit_tfidf3_features = _read_pickle('Artifacts/hit_tfidf3_features.pkl')

# Pitcher Models

In [66]:
#1gram

# Hold out 25% of the pitcher 1gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf1_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_pit_1gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=0.9,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_pit_1gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_pit_1gram_1_predictions = svm_pit_1gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_pit_1gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_pit_1gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_pit_1gram_1_predictions))

Accuracy: 0.811
[[109   5   1]
 [ 15   8   0]
 [  4   3   3]]
              precision    recall  f1-score   support

         1.0       0.85      0.95      0.90       115
         2.0       0.50      0.35      0.41        23
         3.0       0.75      0.30      0.43        10

    accuracy                           0.81       148
   macro avg       0.70      0.53      0.58       148
weighted avg       0.79      0.81      0.79       148



In [62]:
#2gram

# Hold out 25% of the pitcher 2gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf2_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_pit_2gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=1.3,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_pit_2gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_pit_2gram_1_predictions = svm_pit_2gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_pit_2gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_pit_2gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_pit_2gram_1_predictions))

Accuracy: 0.791
[[106   8   1]
 [ 15   8   0]
 [  4   3   3]]
              precision    recall  f1-score   support

         1.0       0.85      0.92      0.88       115
         2.0       0.42      0.35      0.38        23
         3.0       0.75      0.30      0.43        10

    accuracy                           0.79       148
   macro avg       0.67      0.52      0.56       148
weighted avg       0.78      0.79      0.77       148



In [75]:
#3gram

# Hold out 25% of the pitcher 3gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    pit_tfidf3_features,
    pit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_pit_3gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=0.9,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_pit_3gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_pit_3gram_1_predictions = svm_pit_3gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_pit_3gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_pit_3gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_pit_3gram_1_predictions))

Accuracy: 0.791
[[108   6   1]
 [ 17   6   0]
 [  3   4   3]]
              precision    recall  f1-score   support

         1.0       0.84      0.94      0.89       115
         2.0       0.38      0.26      0.31        23
         3.0       0.75      0.30      0.43        10

    accuracy                           0.79       148
   macro avg       0.66      0.50      0.54       148
weighted avg       0.76      0.79      0.77       148



# Hitter Models

In [87]:
#1gram

# Hold out 25% of the hitter 1gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf1_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_hit_1gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=1.4,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_hit_1gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_hit_1gram_1_predictions = svm_hit_1gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_hit_1gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_hit_1gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_hit_1gram_1_predictions))

Accuracy: 0.630
[[75 17  7]
 [20 22  4]
 [ 4  8  5]]
              precision    recall  f1-score   support

         1.0       0.76      0.76      0.76        99
         2.0       0.47      0.48      0.47        46
         3.0       0.31      0.29      0.30        17

    accuracy                           0.63       162
   macro avg       0.51      0.51      0.51       162
weighted avg       0.63      0.63      0.63       162



In [103]:
#2gram

# Hold out 25% of the hitter 2gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf2_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_hit_2gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=1.1,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_hit_2gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_hit_2gram_1_predictions = svm_hit_2gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_hit_2gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_hit_2gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_hit_2gram_1_predictions))

Accuracy: 0.636
[[74 19  6]
 [17 25  4]
 [ 5  8  4]]
              precision    recall  f1-score   support

         1.0       0.77      0.75      0.76        99
         2.0       0.48      0.54      0.51        46
         3.0       0.29      0.24      0.26        17

    accuracy                           0.64       162
   macro avg       0.51      0.51      0.51       162
weighted avg       0.64      0.64      0.64       162



In [113]:
#3gram

# Hold out 25% of the hitter 3gram tfidf rows as a test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    hit_tfidf3_features,
    hit['FV_r'],
    test_size=0.25,
    random_state=123,
)

# Linear SVM: L2 penalty, hinge loss, one-vs-rest, balanced class weights.
svm_hit_3gram_1 = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=1.0,
    tol=1e-4,
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
svm_hit_3gram_1.fit(X_train, y_train)

# Predict labels for the held-out rows.
svm_hit_3gram_1_predictions = svm_hit_3gram_1.predict(X_test)

# Report accuracy, the confusion matrix, and the per-class report.
print("Accuracy: %0.3f" % accuracy_score(y_test, svm_hit_3gram_1_predictions))
cnf_matrix = confusion_matrix(y_test, svm_hit_3gram_1_predictions)
print(cnf_matrix)
print(classification_report(y_test, svm_hit_3gram_1_predictions))

Accuracy: 0.648
[[76 16  7]
 [17 25  4]
 [ 5  8  4]]
              precision    recall  f1-score   support

         1.0       0.78      0.77      0.77        99
         2.0       0.51      0.54      0.53        46
         3.0       0.27      0.24      0.25        17

    accuracy                           0.65       162
   macro avg       0.52      0.52      0.52       162
weighted avg       0.65      0.65      0.65       162



In [114]:
#Save best model(s)

# Pickle the fitted hitter 3gram LinearSVC into the Artifacts folder.
# NOTE(review): this cell saves only the hitter model; no pitcher model is
# written here - confirm that is intended.
with open('Artifacts/svm_hit_3gram_1.pkl', 'wb') as model_file:
    pickle.dump(svm_hit_3gram_1, model_file)