In [2]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import string
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import fasttext
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [3]:
#Load artifacts from file
##Pre-Processed pitchers full dataset
with open('Artifacts/pit_data.pkl', 'rb') as filehandle:
    pit = pickle.load(filehandle)
    
##Pre-Processed hitters full dataset
with open('Artifacts/hit_data.pkl', 'rb') as filehandle:
    hit = pickle.load(filehandle)

In [4]:
#Transform label + report into format for fasttext
pit['ft_Report'] = pit.apply(lambda r: '__label__' + str(r['FV_r']) + ' ' + str(r['Report']), axis=1)
hit['ft_Report'] = hit.apply(lambda r: '__label__' + str(r['FV_r']) + ' ' + str(r['Report']), axis=1)

In [5]:
#Functions for extracting label
def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]

In [6]:
#Resampling
pit_1 = pit[pit['FV_r'] == 1]
pit_2 = pit[pit['FV_r'] == 2]
pit_3 = pit[pit['FV_r'] == 3]
#Upsampling
pit_2_over = pit_2.sample(len(pit_1), replace=True)
pit_3_over = pit_3.sample(len(pit_1), replace=True)
pit_upsample = pd.concat([pit_1, pit_2_over, pit_3_over], axis=0)
#Downsampling
pit_1_under = pit_1.sample(len(pit_3), replace=True)
pit_2_under = pit_2.sample(len(pit_3), replace=True)
pit_downsample = pd.concat([pit_1_under, pit_2_under, pit_3], axis=0)

# Pitcher Model

In [7]:
#Model 1: 1gram, default parameters

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(pit_upsample['ft_Report'], pit_upsample['FV_r'], test_size=0.25, random_state=123)
X_train.to_csv('fasttext_pit_train.csv', index=False, header=False)

ft_pit_1gram_1 = fasttext.train_supervised(input='fasttext_pit_train.csv',
                                     label_prefix='__label__',
                                     wordNgrams=1,
                                     loss='hs')

evaluations = []
for i in X_test:
    i = i.replace('\n', '')
    poop = ft_pit_1gram_1.predict(i)
    evaluations.append(poop)
    
X_test_confusion = []
for i in X_test:
    j = left(i, 10)
    k = right(j, 1)
    X_test_confusion.append(k)
    
evaluations_confusion = []
for i in evaluations:
    j = i[0][0]
    k = left(j, 10)
    l = right(k, 1)
    evaluations_confusion.append(l)
    
model_matrix = confusion_matrix(X_test_confusion, evaluations_confusion)
print(model_matrix)

[[121   0   0]
 [109   0   0]
 [ 97   0   0]]
