In [195]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
from sklearn.model_selection import train_test_split
import re
import unicodedata

In [196]:
df = pd.read_csv('data/twitter/train.csv')

X_data = df['tweet'].values
y_data = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

df_train = pd.DataFrame({'tweet': X_train, 'label': y_train})
df_test = pd.DataFrame({'tweet': X_test, 'label': y_test})
df_val = pd.read_csv('data/twitter/test.csv')

df_train[df_train['label'] == 1]

Unnamed: 0,tweet,label
6,christmas eve? what about christmas adam? this...,1
15,"@user #allahsoil like all religions, islam can...",1
29,here's a ?for the day the government and state...,1
30,scapelliti: progresverebel: ha! good riddance...,1
34,@user racist attack on three muslim women in #...,1
...,...,...
22291,"#sikh #temple vandalised in in #calgary, #wso ...",1
22292,not even trumpâs transition team wants to de...,1
22310,@user my #santaproject is to not shop at @user...,1
22356,dull british .,1


In [197]:
class TextPreprocessor:
    def __init__(self):
        self.sw = set(get_stop_words("en"))
        self.punctuations = set(punctuation)
        self.tokenization = MorphAnalyzer()

    @staticmethod
    def rm_at_sign(text):
        return re.sub(r'[@]\w+|\w+@[\w.]+', '', text, flags=re.DOTALL)

    @staticmethod
    def rm_hashtags(text):
        return re.sub(r'#\w+', '', text, flags=re.DOTALL)

    @staticmethod
    def rm_num(text):
        return re.sub(r'^\s*\d+\s*|\s*$', '', text)

    @staticmethod
    def rm_brackets(text):
        return re.sub(r'\[|\]', '', text, flags=re.DOTALL)

    @staticmethod
    def rm_links(text):
        return re.sub(r'https?://\S+', '', text, flags=re.DOTALL)


    def preprocess(self, text):
        text = str(text)
        text = text.lower()
        text = self.rm_num(text)
        # text = self.rm_at_sign(text)
        # text = self.rm_hashtags(text)
        # text = self.rm_links(text)
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
        text = ''.join(c for c in text if c not in self.punctuations)
        text = [self.tokenization.parse(word)[0].normal_form for word in text.split() if word not in self.sw]

        return " ".join(text)

preprocessor = TextPreprocessor()

df_train['tweet'] = df_train['tweet'].apply(preprocessor.preprocess)
df_train = df_train[~df_train['tweet'].isin([''])]

df_test['tweet'] = df_test['tweet'].apply(preprocessor.preprocess)

df_train[df_train['label'] == 1].sort_values('tweet').head()

Unnamed: 0,tweet,label
12835,1,1
1025,2016highlights samwu fights various municipali...,1
15976,2017 entering trumpdarkzone trump myoneresolut...,1
11444,299 2016release ebook book summer melted every...,1
15265,2nites church service look user back ur bible ...,1


In [198]:
size_0 = df_train[df_train['label'] == 0].shape[0]
size_1 = df_train[df_train['label'] == 1].shape[0]
print('0:', size_0)
print('1:', size_1)

0: 20811
1: 1558


#### Augmentation

In [199]:
from nlpaug.augmenter.word import SynonymAug

aug = SynonymAug(aug_src='wordnet', aug_p=1.0)

df_train_class1 = df_train[df_train['label'] == 1]

new_aug = []

for text in df_train_class1['tweet']:
    if len(text):
        augmented_text = aug.augment(text)
        new_aug.append(augmented_text[0])

# for text in new_aug:
#     if len(text):
#         augmented_text = aug.augment(text)
#         new_aug.append(augmented_text[0])
#
#         if len(new_aug) >= size_0 - size_1:
#             break

len(new_aug)

1558

#### concatenate df_train + new_tweets

In [200]:
new_tweets = pd.DataFrame({'tweet': new_aug, 'label': [1]*len(new_aug)})
new_tweets.index = range(len(df_train), len(df_train) + len(new_tweets))

df_train = pd.concat([df_train, new_tweets]).reset_index(drop=True)

In [201]:
print(df_train[df_train['label'] == 1].shape[0])
print(df_train[df_train['label'] == 0].shape[0])

3116
20811


#### LSTM

In [202]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import Counter

In [203]:
text_corpus_train = df_train['tweet'].values
text_corpus_test = df_test['tweet'].values

counts = Counter()
for sequence in text_corpus_train:
    counts.update(sequence.split())

print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

num_words before: 36189
num_words after: 14214


In [204]:
class LSTMFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)

        return self.sigmoid(lstm_out)

lstm_init = LSTMFixedLen(len(vocab2index), 30, 20)
optimizer = torch.optim.Adam(lstm_init.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [205]:
from functools import lru_cache

class TwitterDataset(torch.utils.data.Dataset):

    def __init__(self, txts, labels, w2index, used_length):
        self._txts = txts
        self._labels = labels
        self._length = used_length
        self._w2index = w2index

    def __len__(self):
        return len(self._txts)

    @lru_cache(50000)
    def encode_sentence(self, txt):
        encoded = np.zeros(self._length, dtype=int)
        enc1 = np.array([self._w2index.get(word, self._w2index["UNK"]) for word in txt.split()])
        length = min(self._length, len(enc1))
        encoded[:length] = enc1[:length]
        return encoded, length

    def __getitem__(self, index):
        encoded, length = self.encode_sentence(self._txts[index])
        return torch.from_numpy(encoded.astype(np.int32)), self._labels[index], length

In [206]:
y_train = df_train['label'].values
y_test = df_test['label'].values

train_dataset = TwitterDataset(text_corpus_train, y_train, vocab2index, 27)
test_dataset = TwitterDataset(text_corpus_test, y_test, vocab2index, 27)


In [209]:
train_loader = torch.utils.data.DataLoader(train_dataset,
                          batch_size=128,
                          shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset,
                          batch_size=32,
                          shuffle=False)

In [210]:
from tqdm.notebook import tqdm

for epoch in tqdm(range(10)):
    lstm_init.train()
    for i, data in enumerate(train_loader, 0):
        inputs, labels, lengths = data[0], data[1], data[2]
        inputs = inputs.int()
        labels = labels.float().view(-1, 1)

        optimizer.zero_grad()

        outputs = lstm_init(inputs, lengths)

        last_outputs = outputs[range(outputs.size(0)), lengths-1, -1]
        last_outputs = last_outputs.squeeze(-1)
        labels = labels.float().squeeze(-1)

        loss = criterion(last_outputs, labels)
        loss.backward()
        optimizer.step()

    lstm_init.eval()
    losss = 0
    for X, y, lengths in test_loader:
        X = X.long()
        y = y.long().view(-1, 1)
        output = lstm_init(X, lengths)

        last_output = output[range(output.size(0)), lengths-1, -1]
        last_output = last_output.squeeze(-1)
        y = y.float().squeeze(-1)


        losss = criterion(last_output, y)
        losss = losss.item()
        #loss_accumed += loss
    print("Epoch {} valid_loss {}".format(epoch, losss))

print('Training is finished!')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 valid_loss 0.4172949492931366
Epoch 1 valid_loss 0.4072824716567993
Epoch 2 valid_loss 0.39986565709114075
Epoch 3 valid_loss 0.3798901438713074
Epoch 4 valid_loss 0.37539777159690857
Epoch 5 valid_loss 0.36526721715927124
Epoch 6 valid_loss 0.36206525564193726
Epoch 7 valid_loss 0.3614402711391449
Epoch 8 valid_loss 0.36335626244544983
Epoch 9 valid_loss 0.3611094057559967
Training is finished!
