In [1]:
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordsegment import load, segment
import emoji
from preprocessing import process_single_tweet, process_train_data, process_test_data, process_trial_data
from utils import load_train_data, load_test_data_a, load_trial_data
from feature_embedding import get_all_character_ngrams_of_sentence, build_ngrams_dataset, build_glove_featurized_dataset
from ml_classifiers import MLDetector, run_logistic_regression

In [2]:
# Load the labelled training tweets through the project helper from utils.
# NOTE(review): the "number of training data" line in the cell output
# presumably is printed by this loader — confirm in utils.
train_data = load_train_data()
# Bare last expression so the notebook renders the first rows as a table.
train_data.head()

number of training data: 13240


Unnamed: 0,tweet,subtask_a
0,@USER She should ask a few native Americans wh...,OFF
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,Amazon is investigating Chinese employees who ...,NOT
3,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,@USER @USER Obama wanted liberals &amp; illega...,NOT


## 1. Pre-processing

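Result from the NLTK Twitter tokenizer (note in the output that the `@USER`/`URL` placeholders are kept as tokens and the flag emoji is split into two symbols):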

In [3]:
# Baseline: tokenize one raw training tweet with NLTK's TweetTokenizer so it
# can be compared with the project's own pre-processing in the next cell.
twt_tokenizer = TweetTokenizer()
example = train_data["tweet"][1]  # the tweet at index 1 of the training frame
tokenized = twt_tokenizer.tokenize(example)
# Show the raw text first, then the resulting token list.
print(example)
print(tokenized)

@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL
['@USER', '@USER', 'Go', 'home', 'you', '’', 're', 'drunk', '!', '!', '!', '@USER', '#MAGA', '#Trump2020', '👊', '🇺', '🇸', '👊', 'URL']


Result from our pre-processing:

In [4]:
# Print the raw tweet followed by the result of the project's
# process_single_tweet helper, for comparison with the plain tokenizer output.
# `example` must already be defined by the tokenizer cell (run it first).
print(example)
print(process_single_tweet(example))

@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL
Creating dictionary...
total words processed: 223118
total unique words in corpus: 223118
total items in dictionary (corpus words and deletions): 6778287
  edit distance for deletions: 3
  length of longest word in corpus: 15
go home you be drink ! ! ! magna trump 2020 oncoming fist unite state oncoming fist


## 2. Features

In [3]:
# Load the subtask-A test split, then rebind the name to its processed version.
# NOTE(review): the cell output suggests process_test_data reuses an existing
# processed file when one is found — confirm in the preprocessing module.
test_data = load_test_data_a()
test_data = process_test_data(test_data)
# Bare last expression: display the first processed rows.
test_data.head()

number of test data A: 860
Processed data already exists. Direct load it.
number of processed data: 860


Unnamed: 0,subtask_a,tweet
0,OFF,who be q where the server dump nike dec las fi...
1,NOT,constitution day be revere by conservatives ha...
2,NOT,foxnews era magna pot trump and amendment rnc ...
3,NOT,watch boomer get the news that she be still up...
4,OFF,no saharan unity demo to oppose the far right ...


### Character n-grams

In [3]:
# Toy sentence used to show which character n-grams are extracted and counted.
# Note: this rebinds `example`, which earlier cells used for a training tweet.
example = "hhhhh so funny !"
print(example)
# Bare last expression: the value returned by the helper (shown in the output
# as an n-gram -> count mapping) becomes the cell's displayed result.
get_all_character_ngrams_of_sentence(example)    

hhhhh so funny !


{'!': 1,
 'f': 1,
 'fu': 1,
 'fun': 1,
 'funn': 1,
 'h': 5,
 'hh': 4,
 'hhh': 3,
 'hhhh': 2,
 'n': 2,
 'nn': 1,
 'nny': 1,
 'ny': 1,
 'o': 1,
 's': 1,
 'so': 1,
 'u': 1,
 'un': 1,
 'unn': 1,
 'unny': 1,
 'y': 1}

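Build a training set based on character n-grams. For illustration only, the first 10 processed test tweets serve as a toy training sample and the last 5 as a toy test sample: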

In [5]:
# Toy split for illustration: first 10 processed test tweets act as the
# "training" sample, the last 5 as the "test" sample used in the next cell.
sample_train, sample_test = test_data.head(10), test_data.tail(5)

# Featurize the toy training sample with character n-grams. The returned
# mapping is read below through its 'X' (feature matrix) and 'y' (labels) keys.
train_set = build_ngrams_dataset(sample_train)

print(train_set['X'])
print("Shape of X is {}".format(train_set['X'].shape))
print(train_set['y'])

[[3. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
Shape of X is (10, 892)
[1, 0, 0, 0, 1, 1, 0, 1, 0, 0]


Build the test data with the same vectorizer that was fitted while building the training data:

In [6]:
# Reuse the vectorizer stored with the toy training set instead of creating a
# new one; this is intended to keep the test matrix in the same feature space
# as the training matrix.
vectorizer = train_set['vectorizer']
test_set = build_ngrams_dataset(sample_test, vectorizer=vectorizer)
# Show the resulting feature matrix and the label list.
print(test_set['X'])
print(test_set['y'])

[[0. 1. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[1, 0, 1, 0, 0]


### GloVe embedding

Let's use a 25d embedding for illustration:

In [9]:
# Featurize the same toy training sample with GloVe vectors. The second
# argument (25) selects the 25d embedding mentioned above; the printed shape
# shows one 25-value row per tweet.
train_set2 = build_glove_featurized_dataset(sample_train, 25)
print("Shape of X is {}".format(train_set2['X'].shape))
print(train_set2['y'])

GloVe loaded. Vocabulary size: 1193514
Shape of X is (10, 25)
[1, 0, 0, 0, 1, 1, 0, 1, 0, 0]


## 3. Models

### Logistic regression + character n-grams

To illustrate and test the pipeline quickly, we use the real test dataset (860 tweets) as our training set and the real trial dataset (320 tweets) as our test set.

In [2]:
# Load and pre-process the trial split first, then the subtask-A test split
# (same call order as before, so the log output is unchanged).
trial_data = process_trial_data(load_trial_data())

# Role swap for this demo: the processed test split becomes the training set
# and the processed trial split becomes the test set.
train_data = process_test_data(load_test_data_a())
test_data = trial_data

print("Train data: {}; Test data: {}.".format(len(train_data), len(test_data)))

number of trial data: 320
Processed data already exists. Direct load it.
number of processed data: 320
number of test data A: 860
Processed data already exists. Direct load it.
number of processed data: 860
Train data: 860; Test data: 320.


Get training X, y, and testing X, y:

In [3]:
# Build the character n-gram training set; keep its features, labels and the
# vectorizer that was used so the test set can be built with the same one.
train_set_ngram = build_ngrams_dataset(train_data)
train_X, train_y = train_set_ngram['X'], train_set_ngram['y']
vectorizer = train_set_ngram['vectorizer']
print("Shape of train_X: {}".format(train_X.shape))

# Build the test set by passing the training vectorizer through.
test_set_ngram = build_ngrams_dataset(test_data, vectorizer=vectorizer)
test_X, test_y = test_set_ngram['X'], test_set_ngram['y']
print("Shape of test_X: {}".format(test_X.shape))

Shape of train_X: (860, 10569)
Shape of test_X: (320, 10569)


Tune hyperparameters and select the best model:

In [None]:
# Tune the logistic-regression detector (the run log shows a 5-fold search).
# The search must only see the training split: tuning on test_X/test_y leaks
# the held-out data into model selection and leaves nothing untouched for the
# final evaluation, so train_X/train_y from the previous cell are used here.
lr_classifier = MLDetector('LR')
# Candidate grid: only the L2 penalty is searched.
params_set = {'penalty': ['l2']}
lr_tune = lr_classifier.hyper_tune(train_X, train_y, params_set, best_only=False)
print('Hyperparameter Tuning: ', lr_tune)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] penalty=l2 ......................................................
[CV] penalty=l2 ......................................................
[CV] penalty=l2 ......................................................
[CV] penalty=l2 ......................................................
[CV] penalty=l2 ......................................................
