## Linear SVM

In [1]:
import helpers as helpers
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/griffinmichalak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data, labels = helpers.get_data()
TDT_SPLIT = "80/10/10"
train_text, dev_text, test_text = helpers.split(data=data, dist=TDT_SPLIT)
train_dep, dev_dep, test_dep = helpers.split(data=labels, dist=TDT_SPLIT)

model = CountVectorizer(max_features=50_000, ngram_range=(1,2), min_df=2)
model.fit(train_text)
train_vectors = model.transform(train_text)
dev_vectors = model.transform(dev_text)
test_vectors = model.transform(test_text)

# SVC expects normalized data, but it has a negligible effect
scaler = Normalizer(norm='l2')
scaler.fit(train_vectors)
train_vectors = scaler.transform(train_vectors)
dev_vectors = scaler.transform(dev_vectors)
test_vectors = scaler.transform(test_vectors)

Completing 80/10/10 split
Completing 80/10/10 split


In [3]:
# print(train_vectors.shape)
# total = 0
# for row in train_vectors:
#     total += sum(row.toarray())
# print(sum(total))

In [4]:
estimator = LinearSVC()

param_grid = {
    'penalty': ['l1', 'l2'],
    'tol': [.000005, .00001, .00005, .0001],
    'C': [0.1, 1, 10, 100]
}

grid_search = GridSearchCV(estimator, param_grid, scoring='f1', n_jobs=-1)


In [None]:
grid_search.fit(train_vectors, train_dep)

In [6]:
best_estimator = grid_search.best_estimator_

In [7]:
dev_predictions = best_estimator.predict(dev_vectors)

precision = precision_score(dev_dep, dev_predictions)
recall = recall_score(dev_dep, dev_predictions)
f1 = f1_score(dev_dep, dev_predictions)
accuracy = accuracy_score(dev_dep, dev_predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.9813829787234043
Recall: 0.9413265306122449
F1: 0.9609375
Accuracy: 0.9611901681759379


In [8]:
best_estimator.score(test_vectors, test_dep)

0.9599483204134367