In [72]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import utils.text_processing as util
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from scipy import sparse, stats
import matplotlib.pyplot as plt

In [73]:
#Parameters
OUTPUT_CLASSES = 22
FILE_NAME = "../Data/processed_data.csv"
TEST_SIZE_PERCENTAGE = 0.2
Y_LABEL_NAME = "username"
TEXT_LABEL_NAME ="raw_text"
NUMBER_K_FOLDS = 3

In [74]:
data = pd.read_csv(FILE_NAME)
util.down_nltk_stopwords()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leobl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
#Get text and y label
text = data[TEXT_LABEL_NAME]
author = data[Y_LABEL_NAME]

In [76]:
text_train, text_test, author_train, author_test = train_test_split(text, author, test_size = TEST_SIZE_PERCENTAGE, random_state = 5)

In [77]:
# Process data subsets
processed_train = util.process_data(text_train)
processed_test = util.process_data(text_test)

In [78]:
# Create bag of words features
## Fit Tfidf Vectorizer
vectorizer = TfidfVectorizer(strip_accents = 'ascii', stop_words = 'english', min_df = 6)
vectorizer.fit(processed_train)

# Get size of vocabulary
print('Vocabulary size: ', len(vectorizer.vocabulary_))

# Create feature vectors
words_train = vectorizer.transform(processed_train)
words_test = vectorizer.transform(processed_test)

Vocabulary size:  3144


In [79]:
np.random.seed(28)
kf = KFold(n_splits = NUMBER_K_FOLDS)

#Metrics 
test_accuracy_list = []
prec_list = []
recall_list = []
f1_list = []
training_time_list = []
prediction_time_list = []

run = 0

# Convert sparse matrix to array
words_train_np = words_train.toarray()

for train_inds, _ in kf.split(words_train):
    run += 1
    print('Run:', run)
       
    # Create data subsets
    train_x = np.array([words_train_np[i] for i in train_inds])
    train_y = [author_train.to_numpy()[i] for i in train_inds]
    
    # Convert train_x back to sparse matrix
    train_x = sparse.csr_matrix(train_x)
    
    # Fit model
    t0 = time.time()
    model = SVC(C = 5, kernel = 'linear')
    model.fit(train_x, train_y)

    t1 = time.time()
    # Predict values for test set
    author_pred2 = model.predict(words_test)

    t2 = time.time()

    # Evaluate
    test_accuracy = accuracy_score(author_test, author_pred2)
    precision, recall, f1, support = score(author_test, author_pred2)
    ave_precision = np.average(precision, weights = support/np.sum(support))
    ave_recall = np.average(recall, weights = support/np.sum(support))
    ave_f1 = np.average(f1, weights = support/np.sum(support))
    training_time = (t1 - t0)
    prediction_time = (t2 - t1)

    test_accuracy_list.append(test_accuracy)
    prec_list.append(ave_precision)
    recall_list.append(ave_recall)
    f1_list.append(ave_f1)
    training_time_list.append(training_time)
    prediction_time_list.append(prediction_time)

print("Test Accuracy:", test_accuracy_list)
print("Ave. Precision:", prec_list)
print("Ave. Recall:", recall_list)
print("Ave. F1 Score:", f1_list)

Run: 1


  _warn_prf(average, modifier, msg_start, len(result))


Run: 2


  _warn_prf(average, modifier, msg_start, len(result))


Run: 3
Accuracy: [0.6059192825112107, 0.6019730941704036, 0.6019730941704036]
Ave. Precision: [0.6495125484319788, 0.6458531115290364, 0.6477799180744149]
Ave. Recall: [0.605919282511211, 0.6019730941704037, 0.6019730941704037]
Ave. F1 Score: [0.6207107857393509, 0.6167052283679811, 0.6171412293046584]


  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
#Get averages for kfold runs
mean_accuracy_across_kfold = np.mean(test_accuracy_list)
mean_percision_across_kfold = np.mean(prec_list)
mean_recall_across_kfold = np.mean(recall_list)
mean_f1_across_kfold = np.mean(f1_list)


print("Average Accuracy across K-folds:", mean_accuracy_across_kfold)
print("Average Precision across K-folds:", mean_percision_across_kfold)
print("Average Recall across K-folds:", mean_recall_across_kfold)
print("Average F1 Score across K-folds:", mean_f1_across_kfold)
print("Average Training Time across K-folds:", (t1 - t0), "seconds")
print("Prediction Time across K-folds:", (t2 - t1), "seconds")



Average Accuracy across K-folds: 0.603288490284006
Average Precision across K-folds: 0.6477151926784767
Average Recall across K-folds: 0.603288490284006
Average F1 Score across K-folds: 0.6181857478039968
Average Training Time across K-folds: 8.776155948638916 seconds
Prediction Time across K-folds: 2.631319046020508 seconds
