In [2]:
from torchtext import data
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
import numpy as np
import random
import datetime
from math import floor
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

seed = 1

In [3]:
blurb_field = data.Field(sequential=True, use_vocab=True, lower=True, tokenize="spacy")
state_field = data.LabelField(sequential=False, use_vocab=False, preprocessing=lambda x:1 if x=='successful' else -1) #-1 for svm
dataset = data.TabularDataset(path='df_text_eng.csv',format='csv',skip_header=True,fields=[('Unnamed: 0', None),('blurb', blurb_field),('state', state_field)])

In [4]:
train, test, validation = dataset.split(random_state=random.seed(seed), split_ratio=[70,15,15])
print("Training Set Size: ", len(train))
print("Test Set Size: ", len(test))
print("Validation Set Size: ", len(validation))

Training Set Size:  150859
Test Set Size:  32327
Validation Set Size:  32327


In [5]:
#words that appear less than 3 times (2 or less) will be considered unknown words with tag "<unk>", 
#they will have the same word embedding
#vocabulary of the training set will only be used to emulate real world situtaions when the test set is unknown
blurb_field.build_vocab(train, min_freq=3) #no pads
print("Vocabulary size used: ",len(blurb_field.vocab))
word_to_ix = dict(blurb_field.vocab.stoi)

Vocabulary size used:  28760


In [6]:
def create_sparse_BOW(dataset, vocabulary_dict):
    indptr = [0]
    indices = []
    data = []
    for sentence in dataset:
        for term in sentence.blurb:
            try:
                index = word_to_ix[term]
            except:
                index = word_to_ix["<unk>"]
            indices.append(index)
            data.append(1)
        indptr.append(len(indices))
    sparse_data = csr_matrix((data, indices, indptr), dtype=int, shape=(len(dataset),len(vocabulary_dict)))
    y = [dataset[i].state for i in range(len(dataset))]
    return sparse_data,np.asarray(y)

In [7]:
def sigmoid(x):
    return (1 / (1 + np.exp(-x)))

In [8]:
def calc_accuracy(predictions, y):
    predictions = np.round(sigmoid(predictions))
    correct = float(sum(predictions == y))
    accuracy = correct/len(predictions)
    return accuracy

In [9]:
train_BOW, train_y = create_sparse_BOW(train, word_to_ix)
#we won't use cross validation to be able to compare this method with the other methods using the same training data
validation_BOW, validation_y = create_sparse_BOW(validation, word_to_ix)  
test_BOW, test_y = create_sparse_BOW(test, word_to_ix)

In [270]:
#Linear SVM
#Choose hyperparameters using Validation set
print("Linear SVM:")
init_time = datetime.datetime.now()

parameters = {'C':[np.power(2.,i) for i in range(-6,16,2)]} #2^-6, 2^-4, ..., 2^14

linear_classifier = LinearSVC(dual=False,random_state=seed)

clf = GridSearchCV(linear_classifier, parameters, verbose=2)
clf.fit(validation_BOW,validation_y)

linear_classifier = clf.best_estimator_
validation_predictions = linear_classifier.predict(validation_BOW)
validation_accuracy = calc_accuracy(validation_predictions,validation_y)

current_time = datetime.datetime.now()
total_time = (current_time-init_time).total_seconds()

print(f'Final Validation Accuracy: {validation_accuracy*100:.2f}%, Chosen C: {linear_classifier.C}')
print(f'Validation Time: {floor(total_time/3600)} hours, {floor(total_time/60)%60} minutes, {total_time%60:.2f} seconds')

Linear SVM:
Fitting 3 folds for each of 11 candidates, totalling 33 fits
[CV] C=0.015625 ......................................................
[CV] ....................................... C=0.015625, total=   0.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] C=0.015625 ......................................................
[CV] ....................................... C=0.015625, total=   0.9s
[CV] C=0.015625 ......................................................
[CV] ....................................... C=0.015625, total=   0.8s
[CV] C=0.0625 ........................................................
[CV] ......................................... C=0.0625, total=   1.4s
[CV] C=0.0625 ........................................................
[CV] ......................................... C=0.0625, total=   1.3s
[CV] C=0.0625 ........................................................
[CV] ......................................... C=0.0625, total=   1.4s
[CV] C=0.25 ..........................................................
[CV] ........................................... C=0.25, total=   1.8s
[CV] C=0.25 ..........................................................
[CV] ........................................... C=0.25, total=   1.7s
[CV] C

[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed: 14.9min finished


Final Validation Accuracy: 39.21%, Chosen C: 0.015625
Validation Time: 0 hours, 14 minutes, 55.43 seconds


In [10]:
linear_classifier = LinearSVC(dual=False,random_state=seed, C=0.015625)

In [11]:
#Linear SVM
#Train using chosen classifier
print("Linear SVM:")
init_time = datetime.datetime.now()

linear_classifier.fit(train_BOW,train_y)

train_predictions = linear_classifier.predict(train_BOW)
train_accuracy = calc_accuracy(train_predictions,train_y)

test_predictions = linear_classifier.predict(test_BOW)
test_accuracy = calc_accuracy(test_predictions,test_y)

current_time = datetime.datetime.now()
total_time = (current_time-init_time).total_seconds()

print(f'Train Accuracy: {train_accuracy*100:.2f}%, Test Accuracy: {test_accuracy*100:.2f}%')
print(f'Training Time: {floor(total_time/3600)} hours, {floor(total_time/60)%60} minutes, {total_time%60:.2f} seconds')

Linear SVM:
Train Accuracy: 37.11%, Test Accuracy: 34.80%
Training Time: 0 hours, 0 minutes, 8.90 seconds


In [None]:
#RBF SVM
#Choose hyperparameters using Validation set
print("RBF SVM:")
init_time = datetime.datetime.now()

parameters = {'C':[np.power(2.,i) for i in range(-6,16,2)], 'gamma':[np.power(2.,i) for i in range(-17,3,2)]}
rbf_classifier = SVC(random_state=seed)

clf = GridSearchCV(rbf_classifier, parameters, verbose=2)
clf.fit(validation_BOW,validation_y)

rbf_classifier = clf.best_estimator_
validation_predictions = rbf_classifier.predict(validation_BOW)
validation_accuracy = calc_accuracy(validation_predictions,validation_y)

current_time = datetime.datetime.now()
total_time = (current_time-init_time).total_seconds()

print(f'Final Validation Accuracy: {validation_accuracy*100:.2f}%, Chosen C: {classifier.C}')
print(f'Validation Time: {floor(total_time/3600)} hours, {floor(total_time/60)%60} minutes, {total_time%60:.2f} seconds')

In [13]:
#RBF SVM
#Train using chosen classifier (Default values were used this time due to time limitations)
print("RBF SVM:")
init_time = datetime.datetime.now()

rbf_classifier.fit(train_BOW,train_y)

train_predictions = rbf_classifier.predict(train_BOW)
train_accuracy = calc_accuracy(train_predictions,train_y)

test_predictions = rbf_classifier.predict(test_BOW)
test_accuracy = calc_accuracy(test_predictions,test_y)

current_time = datetime.datetime.now()
total_time = (current_time-init_time).total_seconds()

print(f'Train Accuracy: {train_accuracy*100:.2f}%, Test Accuracy: {test_accuracy*100:.2f}%')
print(f'Training Time: {floor(total_time/3600)} hours, {floor(total_time/60)%60} minutes, {total_time%60:.2f} seconds')

RBF SVM:
Train Accuracy: 36.56%, Test Accuracy: 37.06%
Training Time: 4 hours, 5 minutes, 53.02 seconds
