In [2]:
import pandas as pd
import os
import re
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordsegment import load, segment
import emoji
from preprocessing import process_single_tweet, process_train_data, process_test_data
from utils import load_train_data, load_test_data_a
from feature_embedding import get_all_character_ngrams_of_sentence, build_ngrams_dataset

In [3]:
# Load the raw training split (the loader prints the number of rows it read).
train_data = load_train_data()
# Preview the first five rows; 5 is also the default for DataFrame.head.
train_data.head(n=5)

number of training data: 13240


Unnamed: 0,tweet,subtask_a
0,@USER She should ask a few native Americans wh...,OFF
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,Amazon is investigating Chinese employees who ...,NOT
3,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,@USER @USER Obama wanted liberals &amp; illega...,NOT


## Tokenizer

Result from the NLTK Twitter tokenizer (`TweetTokenizer`):

In [4]:
# Running example: the tweet stored under index label 1. It mixes mentions,
# hashtags, a curly apostrophe and emoji, which makes it a good stress test.
example = train_data.loc[1, "tweet"]
print(example)

# Baseline for comparison: NLTK's Twitter-aware tokenizer, default settings.
twt_tokenizer = TweetTokenizer()
tokenized = twt_tokenizer.tokenize(example)
print(tokenized)

@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL
['@USER', '@USER', 'Go', 'home', 'you', '’', 're', 'drunk', '!', '!', '!', '@USER', '#MAGA', '#Trump2020', '👊', '🇺', '🇸', '👊', 'URL']


Result from our tokenizer:

In [5]:
# Raw tweet first, for side-by-side comparison with the cleaned version.
print(example)
# Same tweet after the project's own preprocessing (see `preprocessing`).
cleaned_example = process_single_tweet(example)
print(cleaned_example)

@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL
go home you are drunk ! ! ! maga trump 2020 oncoming fist united states oncoming fist


## Features

In [6]:
# Load test set A and preprocess it in a single step, so `test_data` only
# ever refers to the processed frame.
# NOTE(review): the printed log suggests process_test_data reuses a cached
# result when one already exists -- confirm in `preprocessing`.
test_data = process_test_data(load_test_data_a())
test_data.head(n=5)

number of test data A: 860
Processed data already exists. Direct load it.
number of processed data: 860


Unnamed: 0,tweet,subtask_a
0,who is q wheres the server dump nike dec las f...,OFF
1,constitution day is revered by conservatives h...,NOT
2,foxnews nra maga potus trump 2nd amendment rnc...,NOT
3,watching boomer getting the news that she is s...,NOT
4,no pasaran unity demo to oppose the far right ...,OFF


### Character n-grams

In [8]:
# Toy sentence for illustrating the character n-gram counts.
# It gets its own name so that `example` keeps pointing at the tweet used by
# the tokenizer cells above; rebinding `example` here would make those cells
# print this sentence instead if they were re-run afterwards.
ngram_example = "hhhhh so funny !"
print(ngram_example)
# The last expression is the cell output: a mapping from n-gram to its count.
get_all_character_ngrams_of_sentence(ngram_example)

hhhhh so funny !


{'!': 1,
 'f': 1,
 'fu': 1,
 'fun': 1,
 'funn': 1,
 'h': 5,
 'hh': 4,
 'hhh': 3,
 'hhhh': 2,
 'n': 2,
 'nn': 1,
 'nny': 1,
 'ny': 1,
 'o': 1,
 's': 1,
 'so': 1,
 'u': 1,
 'un': 1,
 'unn': 1,
 'unny': 1,
 'y': 1}

Build training data based on character n-grams:

In [10]:
# Small slices keep the demo fast and the printed matrices readable.
# NOTE(review): both samples are cut from `test_data` (first 10 / last 5
# rows), not from the training frame -- presumably intentional for this
# illustration; confirm before reusing these names for real training.
sample_train = test_data.head(10)
sample_test = test_data.tail(5)

# Called without a vectorizer; the returned dict carries one under
# 'vectorizer' alongside the feature matrix 'X' and the labels 'y'.
train_set = build_ngrams_dataset(sample_train)

print(train_set['X'])
print("Shape of X is %s" % (train_set['X'].shape,))
print(train_set['y'])

[[3. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
Shape of X is (10, 976)
[1, 0, 0, 0, 1, 1, 0, 1, 0, 0]


Build the test data, reusing the vectorizer that was created when building the training data:

In [11]:
# Reuse the vectorizer returned with the training set so the test matrix is
# built with that same vectorizer rather than a new one.
vectorizer = train_set['vectorizer']
test_set = build_ngrams_dataset(sample_test, vectorizer=vectorizer)

# Print the feature matrix first, then the labels.
for part in ('X', 'y'):
    print(test_set[part])

[[0. 1. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 1.]
 [0. 1. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[1, 0, 1, 0, 0]
