In [114]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import utils.text_processing as util
import time
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from scipy import sparse, stats

In [115]:
#Parameters
OUTPUT_CLASSES = 22
FILE_NAME = "../Data/processed_data.csv"
TEST_SIZE_PERCENTAGE = 0.2
CANDIDATES = [1, 10, 100]
Y_LABEL_NAME = "username"
TEXT_LABEL_NAME ="raw_text"

In [116]:
data = pd.read_csv(FILE_NAME)
util.down_nltk_stopwords()
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leobl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,raw_text,username,syllables,periods,hyphens,commas,exclamations,questions,quotes,...,replies,retweets,links,smiles,bigsmiles,winks,bigwinks,unsures,semicolons,hashtags
0,4,"""Appreciate a pair of nice titties",0laotan,11,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,6,"""son",0laotan,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,10,me nutting in her means no one else can get h...,0laotan,21,2,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,12,Her being a good person means she's entitled ...,0laotan,24,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,13,this. https://t.co/WiUKzhqXp1,0laotan,3,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [117]:
#Get text and y label
text = data[TEXT_LABEL_NAME]
author = data[Y_LABEL_NAME]

In [118]:
text_train, text_test, author_train, author_test = train_test_split(text, author, test_size = TEST_SIZE_PERCENTAGE, random_state = 5)

In [119]:
# Process data subsets
processed_train = util.process_data(text_train)
processed_test = util.process_data(text_test)

print(processed_train[0:5])

['@lumen wait is it realli', 'not even safe get educ anymor https://t.co/yhsotejuyz', '"man camilla!soleil like', "most peopl gonna wait jan. 2 start make differ 2016. i'm next year mode now. #letsgo #letsgetit #dreamchas", 'hella stupid boy 😅']


In [120]:
# Create bag of words features
## Fit Tfidf Vectorizer
vectorizer = TfidfVectorizer(strip_accents = 'ascii', stop_words = 'english', min_df = 6)
vectorizer.fit(processed_train)

# Get size of vocabulary
print('Vocabulary size: ', len(vectorizer.vocabulary_))

# Create feature vectors
words_train = vectorizer.transform(processed_train)
words_test = vectorizer.transform(processed_test)

Vocabulary size:  3144


In [121]:
# Define grid search object
svm = SVC()
params = {'kernel': ['linear'], 'C':[1, 10, 100]}
scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(svm, params, scoring = scorer, verbose = 50)

In [122]:
# Fit bag of words svm
np.random.seed(6)
word_svm = grid_obj.fit(words_train, author_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START C=1, kernel=linear..........................................
[CV 1/5; 1/3] END ...........C=1, kernel=linear;, score=0.639 total time=  14.7s
[CV 2/5; 1/3] START C=1, kernel=linear..........................................
[CV 2/5; 1/3] END ...........C=1, kernel=linear;, score=0.641 total time=  14.8s
[CV 3/5; 1/3] START C=1, kernel=linear..........................................
[CV 3/5; 1/3] END ...........C=1, kernel=linear;, score=0.638 total time=  14.9s
[CV 4/5; 1/3] START C=1, kernel=linear..........................................
[CV 4/5; 1/3] END ...........C=1, kernel=linear;, score=0.635 total time=  14.8s
[CV 5/5; 1/3] START C=1, kernel=linear..........................................
[CV 5/5; 1/3] END ...........C=1, kernel=linear;, score=0.622 total time=  14.7s
[CV 1/5; 2/3] START C=10, kernel=linear.........................................
[CV 1/5; 2/3] END ..........C=10, kernel=linear;,

In [123]:
print(word_svm.best_estimator_)

SVC(C=1, kernel='linear')


In [124]:
print(word_svm.cv_results_)

{'mean_fit_time': array([12.35447154, 12.21911421, 14.97774739]), 'std_fit_time': array([0.04520915, 0.17658086, 0.45789872]), 'mean_score_time': array([2.52233334, 2.44381447, 2.45191398]), 'std_score_time': array([0.02708318, 0.03170739, 0.03723776]), 'param_C': masked_array(data=[1, 10, 100],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'linear', 'linear'],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1, 'kernel': 'linear'}, {'C': 10, 'kernel': 'linear'}, {'C': 100, 'kernel': 'linear'}], 'split0_test_score': array([0.63946188, 0.59484305, 0.56524664]), 'split1_test_score': array([0.64125561, 0.59820628, 0.56950673]), 'split2_test_score': array([0.63811659, 0.59147982, 0.5706278 ]), 'split3_test_score': array([0.63534425, 0.58914555, 0.56290648]), 'split4_test_score': array([0.62233685, 0.5913882 , 0.56492487]), 'mean_test_score': a

In [125]:
# Fit and evaluate Model 2 (Bag of words SVM)
np.random.seed(28)

t0 = time.time()

# Fit model
model2 = SVC(C = 1, kernel = 'linear')
model2.fit(words_train, author_train)

t1 = time.time()

# Predict values for test set
author_pred2 = model2.predict(words_test)

t2 = time.time()

# Evaluate
accuracy = accuracy_score(author_test, author_pred2)
precision, recall, f1, support = score(author_test, author_pred2)
ave_precision = np.average(precision, weights = support/np.sum(support))
ave_recall = np.average(recall, weights = support/np.sum(support))
ave_f1 = np.average(f1, weights = support/np.sum(support))
confusion = confusion_matrix(author_test, author_pred2, labels =  data[Y_LABEL_NAME].unique())

print("Accuracy:", accuracy)
print("Ave. Precision:", ave_precision)
print("Ave. Recall:", ave_recall)
print("Ave. F1 Score:", ave_f1)
print("Training Time:", (t1 - t0), "seconds")
print("Prediction Time:", (t2 - t1), "seconds")
print("Confusion Matrix:\n", confusion)

Accuracy: 0.6514798206278027
Ave. Precision: 0.7022408047742655
Ave. Recall: 0.6514798206278029
Ave. F1 Score: 0.6592978441766544
Training Time: 18.214348793029785 seconds
Prediction Time: 3.7535555362701416 seconds
Confusion Matrix:
 [[ 22   1   3  31   0  16  19   3   1   0   0   0   3   0   7   0   2   0
    0   1   7   4]
 [  0 234   1   4   0  14   2   4   0   0   0   0   2   0   7   0   0   2
    0   0   2   0]
 [  2   0  60  14   0  36  14   6   1   0   0   0  12   0  11   0   0   5
    0   0  13   7]
 [  2   1   5 147   1  42  18   5   0   0   0   0   8   0  16   0   5   1
    0   0   8   2]
 [  1   2   1  26  10  13  15   2   0   0   0   0   5   0   4   0   2   1
    0   2   1   4]
 [  0   2   9  39   2 141  27  13   2   0   0   0  17   0  30   0   0   3
    0   1  16  10]
 [  0   1   2  34   0  48 421   4   1   0   0   0  14   0  10   0   0   6
    0  10  19   4]
 [  0   1   8  40   1  58  16 119   0   0   0   0  12   0  15   0   0   2
    0   0   4   4]
 [  4   2   3  13   0

  _warn_prf(average, modifier, msg_start, len(result))


In [126]:
kf = KFold(n_splits = 5)

accuracy_list = []
prec_list = []
recall_list = []
f1_list = []
cnt = 0

# Convert sparse matrix to array
words_train_np = words_train.toarray()

for train_inds, _ in kf.split(words_train):
    cnt += 1
    print('Run:', cnt)
       
    # Create data subsets
    train_x = np.array([words_train_np[i] for i in train_inds])
    train_y = [author_train.to_numpy()[i] for i in train_inds]
    
    # Convert train_x back to sparse matrix
    train_x = sparse.csr_matrix(train_x)
    
    # Fit model
    model2 = SVC(C = 1, kernel = 'linear')
    model2.fit(train_x, train_y)


    # Predict values for test set
    author_pred2 = model2.predict(words_test)

    # Evaluate
    accuracy = accuracy_score(author_test, author_pred2)
    precision, recall, f1, support = score(author_test, author_pred2)
    ave_precision = np.average(precision, weights = support/np.sum(support))
    ave_recall = np.average(recall, weights = support/np.sum(support))
    ave_f1 = np.average(f1, weights = support/np.sum(support))
    
    accuracy_list.append(accuracy)
    prec_list.append(ave_precision)
    recall_list.append(ave_recall)
    f1_list.append(ave_f1)

print("Accuracy:", accuracy_list)
print("Ave. Precision:", prec_list)
print("Ave. Recall:", recall_list)
print("Ave. F1 Score:", f1_list)

Run: 1


  _warn_prf(average, modifier, msg_start, len(result))


Run: 2


  _warn_prf(average, modifier, msg_start, len(result))


Run: 3


  _warn_prf(average, modifier, msg_start, len(result))


Run: 4


  _warn_prf(average, modifier, msg_start, len(result))


Run: 5
Accuracy: [0.6403587443946188, 0.6414349775784753, 0.6376681614349776, 0.6376681614349776, 0.6398206278026906]
Ave. Precision: [0.688486115848937, 0.689682217453364, 0.6816022483052767, 0.6853305337173741, 0.691684844720376]
Ave. Recall: [0.640358744394619, 0.6414349775784755, 0.6376681614349777, 0.6376681614349777, 0.6398206278026908]
Ave. F1 Score: [0.6466686600827178, 0.6482960247153159, 0.6379516056399197, 0.6444719897149782, 0.6464050860871541]


  _warn_prf(average, modifier, msg_start, len(result))


In [127]:
def calculate_averages(true, pred, text):
    """Calculate average length of correctly and incorrectly classified examples
    
    Args:
    true: list. List of correct labels.
    pred: list. List of predicted labels.
    text: list. List of text excerpts.
    
    Returns:
    correct_ave_chars: float. Average length of correctly classified examples in characters.
    incorrect_ave_chars: float. Average length of incorrectly classified examples in characters.
    correct_ave_words: float. Average length of correctly classified examples in characters.
    incorrect_ave_words: float. Average length of incorrectly classified examples in characters.
    """
    
    correct_len_chars = []
    incorrect_len_chars = []
    correct_len_words = []
    incorrect_len_words = []

    
    for i in range(len(true)):
        if true[i] == pred[i]:
            correct_len_chars.append(len(text[i]))
            correct_len_words.append(len(text[i].split()))
        else:
            incorrect_len_chars.append(len(text[i]))
            incorrect_len_words.append(len(text[i].split()))
    
    correct_ave_chars = np.mean(correct_len_chars)
    correct_ave_words = np.mean(correct_len_words)
    incorrect_ave_chars = np.mean(incorrect_len_chars)
    incorrect_ave_words = np.mean(incorrect_len_words)
    
    # Conduct two sample t-test
    print('Character t-test')
    print(stats.ttest_ind(correct_len_chars, incorrect_len_chars, equal_var = False))
    
    print('\nWord t-test')
    print(stats.ttest_ind(correct_len_words, incorrect_len_words, equal_var = False))
    
    return correct_ave_chars, correct_ave_words, incorrect_ave_chars, incorrect_ave_words

In [128]:
len(author_test.to_numpy()[1].split())

1

In [129]:
# Calculate averages for Model 2
correct_ave_chars2, correct_ave_words2, incorrect_ave_chars2, incorrect_ave_words2 = calculate_averages(author_test.to_numpy(), author_pred2, text_test.to_numpy())

Character t-test
Ttest_indResult(statistic=13.92926125327792, pvalue=3.2217212181006924e-43)

Word t-test
Ttest_indResult(statistic=9.893607634357823, pvalue=7.734767972916657e-23)
