In [120]:
#### SETUP ####

import csv
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import numpy as np
np.random.seed(42)
import random
random.seed(42)

# READ IN THE TRAINING DATA

X_txt = []
y = []
with open('./train.tsv', encoding = 'utf-8') as in_file:
    iCSV = csv.reader(in_file, delimiter='\t')
    for row in iCSV:
        X_txt.append(row[1])
        y.append(row[2])

# SPLIT THE TRAINING DATA INTO TRAINING (80%) AND VALIDATION (20%) SUBSETS        

X_txt_train, X_txt_val, y_train, y_val = train_test_split(X_txt, y, test_size=0.2, random_state=42)
print(len(X_txt_train))
print(len(X_txt_val))
print(len(y_train))
print(len(y_val))

8473
2119
8473
2119


In [2]:
# What is the breakdown of the 3 classes in our training data?

import pandas
pandas.Series(y).value_counts()

NOT    7069
TIN    3102
UNT     421
dtype: int64

In [3]:
#### TRAIN SOME BASELINE MODELS ####

# LINEAR SVC

pipeline = Pipeline([
    ('vec', CountVectorizer()), 
    ('clf', LinearSVC(random_state=42))])

params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          'vec__min_df':[1, 5, 10],
          'clf__C':[0.01, 0.1, 1.]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=3, verbose=1)
clf.fit(X_txt_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.9min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vec', CountVectorizer()),
                                       ('clf', LinearSVC(random_state=42))]),
             param_grid={'clf__C': [0.01, 0.1, 1.0], 'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [4]:
print("Best LinearSVC Micro F1: {:.4f}".format(clf.best_score_))
print("Best LinearSVC Parameters:", clf.best_params_)

preds = clf.predict(X_txt_val)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.7324
Best LinearSVC Parameters: {'clf__C': 0.1, 'vec__min_df': 1, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}
Validation LinearSVC Micro F1: 0.7268
Validation LinearSVC Macro F1: 0.4487


In [5]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('vec', CountVectorizer()), 
    ('clf', RandomForestClassifier(random_state=42))])

params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          'vec__min_df':[1, 5, 10],
          'clf__n_estimators':[100, 200, 300]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=3, verbose=1)
clf.fit(X_txt_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 14.7min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vec', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'clf__n_estimators': [100, 200, 300],
                         'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [6]:
print("Best RandomForest Micro F1: {:.4f}".format(clf.best_score_))
print("Best RandomForest Parameters:", clf.best_params_)

preds = clf.predict(X_txt_val)
print("Validation RandomForest Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation RandomForest Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best RandomForest Micro F1: 0.7355
Best RandomForest Parameters: {'clf__n_estimators': 300, 'vec__min_df': 5, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}
Validation RandomForest Micro F1: 0.7348
Validation RandomForest Macro F1: 0.4699


In [10]:
#print(clf.cv_results_)

In [9]:
#plot.grid_search(clf.cv_results_, change='n_estimators', kind='bar')

In [14]:
# Examine some example tweets and their classes
count = 0
for a,b in zip(X_txt, y):
    print(a,b)
    count += 1
    if count == 10:
        break

@USER She should ask a few native Americans what their take on this is. UNT
@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL TIN
Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT NOT
@USER Someone should'veTaken" this piece of shit to a volcano. 😂" UNT
@USER @USER Obama wanted liberals &amp; illegals to move into red states NOT
@USER Liberals are all Kookoo !!! TIN
@USER @USER Oh noes! Tough shit. UNT
@USER was literally just talking about this lol all mass shootings like that have been set ups. it’s propaganda used to divide us on major issues like gun control and terrorism TIN
@USER Buy more icecream!!! NOT
@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo TIN


In [77]:
# Add lexicon-based features

class LexiconClassifier():
    def __init__(self):
        self.bad_words = set()
        with open('bad-words.txt', encoding = 'utf-8') as iFile:
            for row in iFile:
                self.bad_words.add(row.strip())

    def count_bad_words(self, sentence):
        num_bad_words = 0
        for word in sentence.lower().split():
            # strip away anything not typical for a word 
            # (i.e., digits, most punctuation)
            word = re.findall(r"[A-Za-z'-]+", word)
            if len(word) > 0: 
                word = word[0]
            else:
                word = ""
            if word in self.bad_words:
                num_bad_words += 1
        return num_bad_words

In [165]:
import re

lc = LexiconClassifier()
X_train_lexicon_features = [] # Initialize to an empty list. 
                              # This will be a list of lists
X_val_lexicon_features = [] # Initialize to an empty list.
                             # This will be a list of lists
    
for string in X_txt_val:
    bad_count = lc.count_bad_words(string)  # Count bad words
    message_length = len(string)  # Characters in tweet
    words = string.split()  # Split into words
    avg_word_length = 0
    max_word_length = 0
    for word in words:
        avg_word_length += len(word)/len(words)  # Average word length
        if len(word) > max_word_length:
            max_word_length = len(word)  # Max word length
    num_unique = len(set(string))  # Number of unique characters
    prop_unique = num_unique/message_length  # Proportion of unique characters
    num_letters = len(re.findall(r'[A-Za-z]',string))  # Number of letters
    prop_letters = num_letters/message_length  # Proportion of letters
    num_digits = len(re.findall(r'[0-9]',string))  # Number of digits
    prop_digits = num_digits/message_length  # Proportion of digits
    num_punct = len(re.findall(r'[:punct:]',string))  # Number of punctuation
    prop_punct = num_punct/message_length  # Proportion of punctuation
    num_space = len(re.findall(r'[ ]',string))  # Number of spaces
    prop_space = num_space/message_length  # Proportion of spaces
    num_atUSER = len(re.findall(r'@USER',string))  # Number of @USER
    num_CAPS = len(re.findall(r'[A-Z]',string)) - 4*num_atUSER  
        # Number of capital letters (removing @USER)
    prop_CAPS = num_CAPS/message_length  # Proportion of capital letters
    num_at = len(re.findall(r'[@]',string))  # Number of @
    prop_at = num_at/message_length  # Proportion of @
    num_exclam = len(re.findall(r'[!]',string))  # Number of !
    prop_exclam = num_exclam/message_length  # Proportion of !
    num_question = len(re.findall(r'[?]',string))  # Number of ?
    prop_question = num_question/message_length  # Proportion of ?
    num_exclam_seq = len(re.findall(r'[!]{2,}',string))  # Number of ! sequences 2+
    num_quest_seq = len(re.findall(r'[?]{2,}',string))  # Number of ? sequences 2+
    num_exclam_quest_seq = len(re.findall(r'([!]|[?])*(([!][?])|([?][!]))([!]|[?])*',string))  # Number of !/? sequences 2+
    X_val_lexicon_features.append([bad_count, message_length, avg_word_length, max_word_length, 
                                   num_unique, prop_unique, num_letters, prop_letters, num_digits, 
                                   prop_digits, num_punct, prop_punct, num_space, prop_space, num_CAPS,
                                   prop_CAPS, num_at, prop_at, num_exclam, prop_exclam, num_question,
                                   prop_question, num_exclam_seq, num_quest_seq, num_exclam_quest_seq])

for string in X_txt_train:
    bad_count = lc.count_bad_words(string)  # Count bad words
    message_length = len(string)  # Characters in tweet
    words = string.split()  # Split into words
    avg_word_length = 0
    max_word_length = 0
    for word in words:
        avg_word_length += len(word)/len(words)  # Average word length
        if len(word) > max_word_length:
            max_word_length = len(word)  # Max word length
    num_unique = len(set(string))  # Number of unique characters
    prop_unique = num_unique/message_length  # Proportion of unique characters
    num_letters = len(re.findall(r'[A-Za-z]',string))  # Number of letters
    prop_letters = num_letters/message_length  # Proportion of letters
    num_digits = len(re.findall(r'[0-9]',string))  # Number of digits
    prop_digits = num_digits/message_length  # Proportion of digits
    num_punct = len(re.findall(r'[:punct:]',string))  # Number of punctuation
    prop_punct = num_punct/message_length  # Proportion of punctuation
    num_space = len(re.findall(r'[ ]',string))  # Number of spaces
    prop_space = num_space/message_length  # Proportion of spaces
    num_atUSER = len(re.findall(r'@USER',string))  # Number of @USER
    num_CAPS = len(re.findall(r'[A-Z]',string)) - 4*num_atUSER  
        # Number of capital letters (removing @USER)
    prop_CAPS = num_CAPS/message_length  # Proportion of capital letters
    num_at = len(re.findall(r'[@]',string))  # Number of @
    prop_at = num_at/message_length  # Proportion of @
    num_exclam = len(re.findall(r'[!]',string))  # Number of !
    prop_exclam = num_exclam/message_length  # Proportion of !
    num_question = len(re.findall(r'[?]',string))  # Number of ?
    prop_question = num_question/message_length  # Proportion of ?
    num_exclam_seq = len(re.findall(r'[!]{2,}',string))  # Number of ! sequences 2+
    num_quest_seq = len(re.findall(r'[?]{2,}',string))  # Number of ? sequences 2+
    num_exclam_quest_seq = len(re.findall(r'([!]|[?])*(([!][?])|([?][!]))([!]|[?])*',string))  # Number of !/? sequences 2+
    X_train_lexicon_features.append([bad_count, message_length, avg_word_length, max_word_length, 
                                     num_unique, prop_unique, num_letters, prop_letters, num_digits, 
                                     prop_digits, num_punct, prop_punct, num_space, prop_space, num_CAPS,
                                     prop_CAPS, num_at, prop_at, num_exclam, prop_exclam, num_question,
                                     prop_question, num_exclam_seq, num_quest_seq, num_exclam_quest_seq])

In [166]:
print(X_txt_train[0])
print(X_train_lexicon_features[0])
print(X_txt_train[1])
print(X_train_lexicon_features[1])
print(X_txt_train[2])
print(X_train_lexicon_features[2])
print(X_txt_train[10])
print(X_train_lexicon_features[10])

@USER @USER @USER You are grossly misinterpreting who is against this. People across all parties condemn this action. Look no further than Bill Davis and Brian Mulroney to see that even conservatives think this is a horrible decision.
[0, 234, 5.1842105263157885, 15, 35, 0.14957264957264957, 191, 0.8162393162393162, 0, 0.0, 41, 0.1752136752136752, 37, 0.1581196581196581, 7, 0.029914529914529916, 3, 0.01282051282051282, 0, 0.0, 0, 0.0, 0, 0, 0]
@USER Children  should be seen and not heard!!!
[0, 47, 4.875, 8, 21, 0.44680851063829785, 35, 0.7446808510638298, 0, 0.0, 6, 0.1276595744680851, 8, 0.1702127659574468, 1, 0.02127659574468085, 1, 0.02127659574468085, 3, 0.06382978723404255, 0, 0.0, 1, 0, 0]
@USER They've morphed into Antifa. Antifa gets better coverage and that all important Soros cash! ie: I'm so upset about you know racist stuff and Trump and children in cages and everything the like that, ya know?""
[1, 215, 4.837837837837838, 10, 38, 0.17674418604651163, 169, 0.78604651162790

In [167]:
import scipy.sparse as sp

vec = CountVectorizer(ngram_range=(1,1), min_df=1, stop_words='english')
X_train = vec.fit_transform(X_txt_train) # This should be a matrix 
X_val = vec.transform(X_txt_val) # This should be a matrix

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_val_lexicon_features = np.array(X_val_lexicon_features)
X_train_w_lex = sp.hstack([X_train_lexicon_features, X_train])
X_val_w_lex = sp.hstack([X_val_lexicon_features, X_val])

In [168]:
#### TRAIN SOME MODELS WITH MORPHOLOGICAL FEATURES ADDED ####

# LINEAR SVC

params = {'C':[0.0001, 0.001, 0.01, 0.1, 1., 10., 100.]}

lsvc = LinearSVC(random_state=42)
grid = GridSearchCV(lsvc, params, scoring="f1_micro", cv=5, verbose = 1)
grid.fit(X_train_w_lex, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  1.0min finished


GridSearchCV(cv=5, estimator=LinearSVC(random_state=42),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
             scoring='f1_micro', verbose=1)

In [169]:
print("Best LinearSVC Micro F1: {:.4f}".format(grid.best_score_))
print("Best LinearSVC Parameters:", grid.best_params_)

preds = grid.predict(X_val_w_lex)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.7133
Best LinearSVC Parameters: {'C': 0.01}
Validation LinearSVC Micro F1: 0.7324
Validation LinearSVC Macro F1: 0.4385


In [170]:
import scipy.sparse as sp

vec = CountVectorizer(ngram_range=(1,1), min_df=5, stop_words=None)
X_train = vec.fit_transform(X_txt_train) # This should be a matrix 
X_val = vec.transform(X_txt_val) # This should be a matrix

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_val_lexicon_features = np.array(X_val_lexicon_features)
X_train_w_lex = sp.hstack([X_train_lexicon_features, X_train])
X_val_w_lex = sp.hstack([X_val_lexicon_features, X_val])

In [171]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

params = {'n_estimators':[100, 200, 300]}

rfc = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rfc, params, scoring="f1_micro", cv=5, verbose = 1)
grid.fit(X_train_w_lex, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.7min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'n_estimators': [100, 200, 300]}, scoring='f1_micro',
             verbose=1)

In [172]:
print("Best RandomForest Micro F1: {:.4f}".format(grid.best_score_))
print("Best RandomForest Parameters:", grid.best_params_)

preds = grid.predict(X_val_w_lex)
print("Validation RandomForest Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation RandomForest Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best RandomForest Micro F1: 0.7221
Best RandomForest Parameters: {'n_estimators': 300}
Validation RandomForest Micro F1: 0.7263
Validation RandomForest Macro F1: 0.4347


In [57]:
# Playing around with morphological features
# Not part of workflow

string = X_txt_val[2]
print(string)
print(len(string))
words = string.split()
avg_word_length = 0
max_word_length = 0
for word in words:
    avg_word_length += len(word)/len(words)
    if len(word) > max_word_length:
        max_word_length = len(word)
print(avg_word_length)
print(max_word_length)
print(len(re.findall(r'[A-Za-z]',string)))
print(len(re.findall(r'[0-9]',string)))
print(len(re.findall(r'[:punct:]',string)))
print(len(re.findall(r'[ ]',string)))
print(len(re.findall(r'[A-Z]',string)))
print(len(re.findall(r'[@]',string)))
print(len(re.findall(r'[!]',string)))

@USER i’m weak 😂😂.. Shit must have been really painful ..
57
4.272727272727273
7
39
0
7
10
5
1
0


In [97]:
# Playing around with morphological features
# Not part of workflow
string = "So happy!?!? How about you???????????!"
num_exclam_seq = len(re.findall(r'[!]{2,}',string))  # Number of ! sequences 2+
num_quest_seq = len(re.findall(r'[?]{2,}',string))  # Number of ? sequences 2+
num_exclam_quest_seq = len(re.findall(r'([!]|[?])*(([!][?])|([?][!]))([!]|[?])*',string))  # Number of !/? sequences 2+
print(num_exclam_seq)
print(num_quest_seq)
print(num_exclam_quest_seq)

0
1
2


In [119]:
string = "@poop#%&%"
lc.count_bad_words(string)

1

In [158]:
#### TRY TFIDF INSTEAD OF COUNT ####

# LINEAR SVC

pipeline = Pipeline([
    ('vec', TfidfVectorizer()), 
    ('clf', LinearSVC(random_state=42))])

params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          'vec__min_df':[1, 5, 10],
          'clf__C':[0.01, 0.1, 1.]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=5, verbose=1)
clf.fit(X_txt_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   46.2s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf', LinearSVC(random_state=42))]),
             param_grid={'clf__C': [0.01, 0.1, 1.0], 'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [159]:
print("Best LinearSVC Micro F1: {:.4f}".format(clf.best_score_))
print("Best LinearSVC Parameters:", clf.best_params_)

preds = clf.predict(X_txt_val)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.7329
Best LinearSVC Parameters: {'clf__C': 1.0, 'vec__min_df': 1, 'vec__ngram_range': (1, 2), 'vec__stop_words': 'english'}
Validation LinearSVC Micro F1: 0.7282
Validation LinearSVC Macro F1: 0.4815


In [156]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('vec', TfidfVectorizer()), 
    ('clf', RandomForestClassifier(random_state=42))])

params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          'vec__min_df':[1, 5, 10],
          'clf__n_estimators':[100, 200, 300]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=5, verbose=1)
clf.fit(X_txt_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 29.5min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'clf__n_estimators': [100, 200, 300],
                         'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [157]:
print("Best RandomForest Micro F1: {:.4f}".format(clf.best_score_))
print("Best RandomForest Parameters:", clf.best_params_)

preds = clf.predict(X_txt_val)
print("Validation RandomForest Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation RandomForest Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best RandomForest Micro F1: 0.7398
Best RandomForest Parameters: {'clf__n_estimators': 200, 'vec__min_df': 5, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}
Validation RandomForest Micro F1: 0.7381
Validation RandomForest Macro F1: 0.4876


In [153]:
import scipy.sparse as sp

vec = TfidfVectorizer(ngram_range=(1,1), min_df=1, stop_words='english')
X_train = vec.fit_transform(X_txt_train) # This should be a matrix 
X_val = vec.transform(X_txt_val) # This should be a matrix

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_val_lexicon_features = np.array(X_val_lexicon_features)
X_train_w_lex = sp.hstack([X_train_lexicon_features, X_train])
X_val_w_lex = sp.hstack([X_val_lexicon_features, X_val])

In [154]:
#### TFIDF WITH MORPHOLOGICAL FEATURES ADDED ####

# LINEAR SVC

params = {'C':[0.0001, 0.001, 0.01, 0.1, 1., 10., 100.]}

lsvc = LinearSVC(random_state=42)
grid = GridSearchCV(lsvc, params, scoring="f1_micro", cv=5, verbose = 1)
grid.fit(X_train_w_lex, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  1.0min finished


GridSearchCV(cv=5, estimator=LinearSVC(random_state=42),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
             scoring='f1_micro', verbose=1)

In [155]:
print("Best LinearSVC Micro F1: {:.4f}".format(grid.best_score_))
print("Best LinearSVC Parameters:", grid.best_params_)

preds = grid.predict(X_val_w_lex)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.6812
Best LinearSVC Parameters: {'C': 0.01}
Validation LinearSVC Micro F1: 0.6947
Validation LinearSVC Macro F1: 0.3818


In [173]:
import scipy.sparse as sp

vec = TfidfVectorizer(ngram_range=(1,1), min_df=5, stop_words='english')
X_train = vec.fit_transform(X_txt_train) # This should be a matrix 
X_val = vec.transform(X_txt_val) # This should be a matrix

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_val_lexicon_features = np.array(X_val_lexicon_features)
X_train_w_lex = sp.hstack([X_train_lexicon_features, X_train])
X_val_w_lex = sp.hstack([X_val_lexicon_features, X_val])

In [174]:
# RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

params = {'n_estimators':[100, 200, 300]}

rfc = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rfc, params, scoring="f1_micro", cv=5, verbose = 1)
grid.fit(X_train_w_lex, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.6min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'n_estimators': [100, 200, 300]}, scoring='f1_micro',
             verbose=1)

In [176]:
print("Best RandomForest Micro F1: {:.4f}".format(grid.best_score_))
print("Best RandomForest Parameters:", grid.best_params_)

preds = grid.predict(X_val_w_lex)
print("Validation RandomForest Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation RandomForest Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best RandomForest Micro F1: 0.7282
Best RandomForest Parameters: {'n_estimators': 200}
Validation RandomForest Micro F1: 0.7272
Validation RandomForest Macro F1: 0.4335


In [142]:
# MULTI-LAYER PERCEPTRON (NEURAL NETWORK)

## THIS CODE DOES NOT WORK

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
#    ('vec', TfidfVectorizer()), 
    ('scaler', StandardScaler()),
    ('clf', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=42))])

params = {#'vec__ngram_range':[(1,1),(1,2)],
          #'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          #'vec__min_df':[1, 5, 10],
          'clf__hidden_layer_sizes':[(5,2)]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=3, verbose=1)
#clf.fit(X_txt_train, y_train)
#clf.fit(X_train_lexicon_features, y_train)

In [138]:
# MULTI-LAYER PERCEPTRON (NEURAL NETWORK)

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

mlp = MLPClassifier(random_state=42, hidden_layer_sizes=(15,5), max_iter=300)
mlp.fit(X_train_w_lex, y_train)

MLPClassifier(hidden_layer_sizes=(15, 5), max_iter=300, random_state=42)

In [139]:
preds = mlp.predict(X_val_w_lex)
print("Validation MLP Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation MLP Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Validation MLP Micro F1: 0.6583
Validation MLP Macro F1: 0.4745


In [143]:
#### EXPLORE USING SelectKBest ####

from sklearn.feature_selection import SelectKBest, chi2

# LINEAR SVC

pipeline = Pipeline([
    ('vec', TfidfVectorizer()), 
    ('skbest', SelectKBest(chi2)),
    ('clf', LinearSVC(random_state=42))])

params = {'vec__ngram_range':[(1,1),(1,2)],
          'vec__stop_words':['english', None], 
          #'vec__lowercase':[False, True],
          'vec__min_df':[1, 5, 10],
          'skbest__k':[10,100,1000,'all'],
          'clf__C':[0.01, 0.1, 1.]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=3, verbose=1)
clf.fit(X_txt_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Users/Ben/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 693, in f

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('skbest',
                                        SelectKBest(score_func=<function chi2 at 0x7fb9944f35e0>)),
                                       ('clf', LinearSVC(random_state=42))]),
             param_grid={'clf__C': [0.01, 0.1, 1.0],
                         'skbest__k': [10, 100, 1000, 'all'],
                         'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [144]:
print("Best LinearSVC Micro F1: {:.4f}".format(clf.best_score_))
print("Best LinearSVC Parameters:", clf.best_params_)

preds = clf.predict(X_txt_val)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.7354
Best LinearSVC Parameters: {'clf__C': 1.0, 'skbest__k': 1000, 'vec__min_df': 1, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}
Validation LinearSVC Micro F1: 0.7253
Validation LinearSVC Macro F1: 0.4443


In [147]:
# LINEAR SVC

import scipy.sparse as sp

vec = TfidfVectorizer(ngram_range=(1,1), min_df=1, stop_words='english')
X_train = vec.fit_transform(X_txt_train) # This should be a matrix 
X_val = vec.transform(X_txt_val) # This should be a matrix

X_train_lexicon_features = np.array(X_train_lexicon_features)
X_val_lexicon_features = np.array(X_val_lexicon_features)
X_train_w_lex = sp.hstack([X_train_lexicon_features, X_train])
X_val_w_lex = sp.hstack([X_val_lexicon_features, X_val])

pipeline = Pipeline([
    ('skbest', SelectKBest(chi2)),
    ('clf', LinearSVC(random_state=42, max_iter=2000))])

params = {'skbest__k':[10,100,1000,'all'],
          'clf__C':[0.001, 0.01, 0.1, 1., 10., 100.]}
clf = GridSearchCV(pipeline, params, scoring="f1_micro", cv=3, verbose=1)
clf.fit(X_train_w_lex, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  2.5min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('skbest',
                                        SelectKBest(score_func=<function chi2 at 0x7fb9944f35e0>)),
                                       ('clf',
                                        LinearSVC(max_iter=2000,
                                                  random_state=42))]),
             param_grid={'clf__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
                         'skbest__k': [10, 100, 1000, 'all']},
             scoring='f1_micro', verbose=1)

In [149]:
print("Best LinearSVC Micro F1: {:.4f}".format(clf.best_score_))
print("Best LinearSVC Parameters:", clf.best_params_)

preds = clf.predict(X_val_w_lex)
print("Validation LinearSVC Micro F1: {:.4f}".format(f1_score(y_val, preds, average='micro')))
print("Validation LinearSVC Macro F1: {:.4f}".format(f1_score(y_val, preds, average='macro')))

Best LinearSVC Micro F1: 0.6941
Best LinearSVC Parameters: {'clf__C': 0.1, 'skbest__k': 'all'}
Validation LinearSVC Micro F1: 0.7244
Validation LinearSVC Macro F1: 0.4357
