In [None]:
# ! pip install -qq transformers 
# ! pip install -qq fasttext 

import numpy as np
import pandas as pd
import random
import os
import nltk.data 
import gensim
from nltk.corpus import stopwords
import re
from torch.optim import SGD, Adam
from warnings import filterwarnings
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from nltk.tokenize import ToktokTokenizer
import fasttext 
import torch
import torch.nn as nn

filterwarnings('ignore')
nltk.download('stopwords')

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA) and puts
    cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes; string hashing of the already-running
    # interpreter is fixed at start-up and is not affected by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different kernel on every run, so it has
    # to be switched off for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(0)

In [None]:
class cfg():
    """Paths and hyper-parameters shared by the notebook cells."""
    # Directory holding train.tsv / valid.tsv / test.tsv (trailing slash required,
    # the file name is appended by plain string concatenation).
    data_dir = './data/'
    # Width of the word vectors fed to the convolutional layers.
    embedding_dim = 200
    # Number of output channels of every convolution.
    n_filters = 100
    # Backward-compatible alias for the original misspelled attribute name.
    n_filtters = n_filters
    # Heights (in tokens) of the convolution kernels.
    filter_sizes = [2,4,7]
    # Size of the classifier output (a single logit).
    output_dim = 1
    dropout_prob = 0.3
    num_epochs = 5
    batch_size = 32
    # When True the positive class is resampled (with replacement) to the size
    # of the negative class before training.
    downsample = True
    

In [None]:

# Read the three tab-separated splits in the same order as before:
# train, test, valid.
train_df, test_df, valid_df = (
    pd.read_csv(f'{cfg.data_dir}{split}.tsv', sep='\t')
    for split in ('train', 'test', 'valid')
)

### Preprocessing
Credit for the text-cleaning rules goes to: <br>
https://github.com/akutuzov/webvectors/blob/master/preprocessing/modular_processing/unify.py

In [None]:
def list_replace(search, replacement, text):
    '''
    Substitute every symbol of ``search`` that occurs in ``text``
    with the ``replacement`` string and return the new text.

    The symbols to substitute are selected once, against the incoming
    text, and are then replaced one after another in ``search`` order.
    '''
    present = tuple(filter(text.__contains__, search))
    for symbol in present:
        text = text.replace(symbol, replacement)
    return text

# Letters with diacritics that clean_text folds to a plain ASCII letter.
_TRANSLITERATIONS = (
    ('\u00C4', 'A'), ('\u00E4', 'a'),
    ('\u00CB', 'E'), ('\u00EB', 'e'),
    ('\u1E26', 'H'), ('\u1E27', 'h'),
    ('\u00CF', 'I'), ('\u00EF', 'i'),
    ('\u00D6', 'O'), ('\u00F6', 'o'),
    ('\u00DC', 'U'), ('\u00FC', 'u'),
    ('\u0178', 'Y'), ('\u00FF', 'y'),
    ('\u00DF', 's'), ('\u1E9E', 'S'),
)

# Symbols that survive the final whitelist filter of clean_text.
# Built once at definition time instead of on every call.
_ALLOWED_SYMBOLS = frozenset(
    # currency signs
    '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
    # tab, carriage return, space, Cyrillic and Latin letters
    '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
)


def clean_text(text):
    """Normalize a raw tweet and strip everything outside a symbol whitelist.

    Steps, in order: unify quotation marks, dashes, hyphens and exotic
    spaces; collapse doubled em-spaces and tabs; turn bullet-like symbols
    into dots; fold letters with diacritics to ASCII; replace punctuation
    with spaces; mask every digit with ``x``; finally drop every symbol that
    is not a currency sign, whitespace, or a Cyrillic/Latin letter.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text (case is preserved).
    """
    # Quotation marks -> plain double quote.
    text = list_replace(
        '\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019',
        '\u0022',
        text
    )

    # Long dashes and overlines -> double hyphen surrounded by em-spaces.
    text = list_replace(
        '\u2012\u2013\u2014\u2015\u203E\u0305\u00AF',
        '\u2003\u002D\u002D\u2003',
        text
    )

    # Typographic hyphens -> ASCII hyphen-minus.
    text = list_replace(
        '\u2010\u2011',
        '\u002D',
        text
    )

    # Assorted Unicode spaces -> en-space.
    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002',
        text
    )

    # A single pass: only adjacent pairs are merged, longer runs are halved.
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    # Bullets, middle dots and similar marks -> full stop.
    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.',
        text
    )

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    for accented, plain in _TRANSLITERATIONS:
        text = list_replace(accented, plain, text)

    # Removing punctuation. The backslash is written as '\\' because '\|' and
    # '\?' are invalid escape sequences; the set of removed symbols is the
    # same as before. Note that '$' is removed here, so it never reaches the
    # currency whitelist below.
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\\|?\'\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    return ''.join(sym for sym in text if sym in _ALLOWED_SYMBOLS)

In [None]:
# Extracting tweet labels
train_labels = train_df['label'].values
valid_labels = valid_df['label'].values


def process(df):
    """Clean and lower-case every tweet of ``df``.

    Args:
        df: DataFrame with a ``tweet`` column of raw texts.

    Returns:
        A list with one cleaned, lower-cased string per row of ``df``, in row
        order. Returning one string per row (rather than a single joined
        string) is what lets the result be stored as a per-row column.
    """
    return [clean_text(tweet).lower() for tweet in df.tweet.values]


train_df['clean_text'] = process(train_df)
valid_df['clean_text'] = process(valid_df)
test_df['clean_text'] = process(test_df)

In [None]:
# Split the training frame by class label.
train_positive_class_df = train_df.loc[train_df['label'] == 1]
train_negative_class_df = train_df.loc[train_df['label'] == 0]

num_positive_examples = len(train_positive_class_df)
num_negative_examples = len(train_negative_class_df)

if cfg.downsample:
    # Draw (with replacement) as many positive rows as there are negative
    # rows, then shuffle the balanced frame.
    train_positive_class_df = train_positive_class_df.sample(
        n=num_negative_examples,
        replace=True,
    )
    train_df = pd.concat(
        [train_positive_class_df, train_negative_class_df]
    ).sample(frac=1)

In [None]:
tokenizer = ToktokTokenizer()

# Tokens removed by preproc; filled on first use so the stop-word corpus is
# read once instead of once per token.
_EXCLUDED_TOKENS = None


def _excluded_tokens():
    """Return the cached set of Russian stop words plus ' ' and '\\n'."""
    global _EXCLUDED_TOKENS
    if _EXCLUDED_TOKENS is None:
        _EXCLUDED_TOKENS = frozenset(stopwords.words('russian')) | {' ', '\n'}
    return _EXCLUDED_TOKENS


def preproc(text):
    """Tokenize ``text`` and drop Russian stop words and blank tokens.

    Args:
        text: String to tokenize with the module-level ``tokenizer``.

    Returns:
        List of the remaining tokens, in their original order.
    """
    excluded = _excluded_tokens()
    return [word for word in tokenizer.tokenize(text) if word not in excluded]
    
def pad_to_max_len(tweet, max_len):
    """Return ``tweet`` right-padded with ``"<PAD>"`` tokens up to ``max_len``.

    The input sequence is left untouched; a new list is returned. Sequences
    that are already ``max_len`` tokens or longer are returned unpadded and
    are not truncated.

    Args:
        tweet: Sequence of tokens.
        max_len: Target length.

    Returns:
        A new list of at least ``max_len`` tokens.
    """
    tokens = list(tweet)
    # A negative multiplier yields an empty list, so long inputs get no padding.
    return tokens + ["<PAD>"] * (max_len - len(tokens))

def get_padded_data(texts, vocab, max_len=70):
    """Encode token sequences as one zero-padded tensor of word indices.

    Every token found in ``vocab.vocab`` is mapped through
    ``vocab.word_to_idx``; any other token gets the ``'<UNK>'`` index. Each
    sequence is cut to ``max_len`` tokens and all sequences are right-padded
    with 0 to the length of the longest one.

    Args:
        texts: Iterable of token sequences.
        vocab: Object exposing ``vocab`` (container of known words) and
            ``word_to_idx`` (mapping that includes ``'<UNK>'``).
        max_len: Maximum number of tokens kept per sequence.

    Returns:
        ``torch.long`` tensor of shape (number of sequences, longest kept length).
    """
    word2idx = vocab.word_to_idx
    encoded = [
        torch.as_tensor(
            [word2idx[w] if w in vocab.vocab else word2idx['<UNK>'] for w in seq][:max_len],
            # An empty sequence would otherwise become a float tensor and
            # could turn the whole padded batch into floats.
            dtype=torch.long,
        )
        for seq in texts
    ]
    return pad_sequence(encoded, batch_first=True)


In [None]:

class Vocab():
    """Word <-> index vocabulary built from the ``clean_text`` column of a frame.

    Index 0 is reserved for ``'<UNK>'``; real words get indices 1..N in order
    of decreasing frequency (ties keep first-occurrence order), so the mapping
    does not depend on set iteration order.
    """

    def __init__(self, df):
        """Store the texts of ``df.clean_text``; call ``build_vocab`` to fill the maps."""
        self.tweets = df.clean_text.values
        self.vocab = set()
        self.vocab_size = 0
        self.idx_to_word = {}
        self.word_to_idx = {}

    def build_vocab(self, max_size=None, tokenize_fn=None):
        """Tokenize the stored texts and build the lookup tables.

        Args:
            max_size: If a positive number, keep only the ``max_size`` most
                frequent words. ``None`` or 0 keeps every word.
            tokenize_fn: Callable turning one string into a list of tokens.
                Defaults to the module-level ``preproc``.
        """
        if tokenize_fn is None:
            tokenize_fn = preproc
        all_text = tokenize_fn(" ".join(self.tweets))

        counts = {}
        for word in all_text:
            counts[word] = counts.get(word, 0) + 1
        # sorted() is stable, also with reverse=True, so equally frequent
        # words stay in first-occurrence order.
        ranked = sorted(counts, key=counts.get, reverse=True)

        # limit max_size
        if max_size and max_size > 0:
            ranked = ranked[:max_size]

        self.vocab = set(ranked)
        self.vocab_size = len(ranked)

        # '<UNK>' is inserted last so that, when iterating either mapping, the
        # word entries come first in index order 1..N.
        self.idx_to_word = {idx: word for idx, word in enumerate(ranked, 1)}
        self.idx_to_word[0] = '<UNK>'
        self.word_to_idx = {word: idx for idx, word in enumerate(ranked, 1)}
        self.word_to_idx['<UNK>'] = 0
        
# The vocabulary is built from the training split only (no size limit is
# passed); tokens absent from it are mapped to '<UNK>' when data is encoded.
vocabulary = Vocab(train_df)
vocabulary.build_vocab()
        

class TwitterDataset(Dataset):
    """Torch dataset yielding ``(token index tensor, label tensor)`` pairs.

    All tweets are tokenized with ``preproc`` and encoded/padded with
    ``get_padded_data`` once, at construction time.
    """

    def __init__(self, tweets, targets, vocab, tokenizer, max_len):
        """
        Args:
            tweets: pandas Series of cleaned tweet strings.
            targets: Labels indexable by position.
            vocab: Vocabulary used to encode the tokens.
            tokenizer: Stored on the instance; the encoding itself goes
                through ``preproc``.
            max_len: Maximum number of tokens kept per tweet.
        """
        self.vocab = vocab
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.targets = targets
        token_lists = tweets.apply(preproc)
        self.tweets = get_padded_data(token_lists, self.vocab, self.max_len)

    def __len__(self):
        """Number of encoded tweets."""
        return len(self.tweets)

    def __getitem__(self, item):
        """Return the encoded tweet at ``item`` and its label as a long tensor."""
        encoded = self.tweets[item]
        label = torch.tensor(self.targets[item], dtype=torch.long)
        return encoded, label

    
tokenizer = ToktokTokenizer()

# Token counts are measured on the raw tweets; the per-split maximum is later
# used as the truncation length when the cleaned tweets are encoded.
train_tokenized = [tokenizer.tokenize(x) for x in train_df.tweet]
valid_tokenized = [tokenizer.tokenize(x) for x in valid_df.tweet]
test_tokenized = [tokenizer.tokenize(x) for x in test_df.tweet]

train_max_len = max(map(len, train_tokenized))
valid_max_len = max(map(len, valid_tokenized))
# Measured on the test split itself (was computed from the validation tokens).
test_max_len = max(map(len, test_tokenized))

print(train_max_len)
print(valid_max_len)
print(test_max_len)

    
def create_data_loader(df, vocab, tokenizer, batch_size, max_len, target=True):
    """Wrap ``df`` into a ``TwitterDataset`` and return a ``DataLoader`` over it.

    Args:
        df: Frame with a ``clean_text`` column and, optionally, a ``label`` column.
        vocab: Vocabulary used to encode the tweets.
        tokenizer: Passed through to the dataset.
        batch_size: Number of samples per batch.
        max_len: Maximum number of tokens kept per tweet.
        target: Not used by this function.

    Returns:
        A ``DataLoader`` created with the given ``batch_size`` and default
        settings otherwise. Frames without a ``label`` column get an all-zero
        placeholder label list.
    """
    labels = df.label.values if 'label' in df else [0] * len(df)
    dataset = TwitterDataset(
        tweets=df.clean_text,
        targets=labels,
        vocab=vocab,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(dataset, batch_size=batch_size)

In [None]:
fasttext_model = fasttext.load_model('rudrec_fasttext_model.bin')

# One row per index in use: 0 for '<UNK>' / padding plus one per vocabulary
# word. The width comes from the config so that it matches the convolution
# kernels instead of repeating the literal 200.
embedding_matrix = np.zeros(
    (max(vocabulary.word_to_idx.values()) + 1, cfg.embedding_dim)
)

# Fill each row by the word's own index rather than by enumeration position,
# so the matrix stays correct whatever the ordering of word_to_idx.
for word, idx in vocabulary.word_to_idx.items():
    if word == '<UNK>':
        # Index 0 is also the padding value of the encoded batches: keep zeros.
        continue
    embedding_vector = fasttext_model.get_word_vector(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[idx] = embedding_vector

embedding_matrix = torch.Tensor(embedding_matrix)

In [None]:
class CNN_classifier(nn.Module):
    """Convolutional text classifier over frozen pretrained word embeddings.

    Input is a batch of token index sequences of shape (batch, seq_len);
    output is a tensor of shape (batch, output_dim) holding raw logits.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout_prob, pretrained_embeddings=None):
        """
        Args:
            vocab_size: Kept for interface compatibility; the number of
                embedding rows is taken from the pretrained matrix.
            embedding_dim: Width of the word vectors (and of every kernel).
            n_filters: Output channels of each convolution.
            filter_sizes: Kernel heights, in tokens; one convolution per entry.
            output_dim: Number of output logits.
            dropout_prob: Dropout probability applied before the linear layer.
            pretrained_embeddings: 2-D float tensor (rows = indices, columns =
                ``embedding_dim``). When omitted, the module-level
                ``embedding_matrix`` is used, as before.

        Raises:
            ValueError: If the embedding width differs from ``embedding_dim``.
        """
        super(CNN_classifier, self).__init__()

        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix
        if pretrained_embeddings.shape[1] != embedding_dim:
            raise ValueError(
                f'embedding width {pretrained_embeddings.shape[1]} does not match '
                f'embedding_dim={embedding_dim}'
            )

        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for index batch ``x``."""
        embedded = self.embedding(x)
        # Add a channel axis: (batch, 1, seq_len, embedding_dim).
        embedded = embedded.unsqueeze(1)
        # Each kernel spans the full embedding width, so the last axis
        # collapses to size 1 and is squeezed away.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over all positions of every feature map.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [None]:
BATCH_SIZE = cfg.batch_size

# One loader per split; each split is truncated to its own maximum length.
train_dataloader = create_data_loader(
    df=train_df, vocab=vocabulary, tokenizer=tokenizer,
    batch_size=BATCH_SIZE, max_len=train_max_len,
)
valid_dataloader = create_data_loader(
    df=valid_df, vocab=vocabulary, tokenizer=tokenizer,
    batch_size=BATCH_SIZE, max_len=valid_max_len,
)
test_datloader = create_data_loader(
    df=test_df, vocab=vocabulary, tokenizer=tokenizer,
    batch_size=BATCH_SIZE, max_len=test_max_len,
)

In [None]:
# +1 accounts for the '<UNK>' entry stored at index 0.
vocab_size = vocabulary.vocab_size + 1
device = 'cpu'

# Build the classifier from the shared config and place it on the device.
model = CNN_classifier(
    vocab_size=vocab_size,
    embedding_dim=cfg.embedding_dim,
    n_filters=cfg.n_filters,
    filter_sizes=cfg.filter_sizes,
    output_dim=cfg.output_dim,
    dropout_prob=cfg.dropout_prob,
).to(device)

N_EPOCHS = cfg.num_epochs

# Adam with its default settings; the loss expects raw logits.
optimizer = Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [None]:
from tqdm import tqdm
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """Fraction of samples whose rounded sigmoid output equals the label.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of 0/1 labels, comparable element-wise with ``preds``.

    Returns:
        0-dim tensor: number of matches divided by the length of the first axis.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)


def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module mapping a batch of token indices to logits of shape
            (batch, 1).
        dataloader: Iterable of ``(text, label)`` batches that supports ``len``.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, float_labels)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` as 0-dim tensors.
    """
    model.train()

    epoch_loss = 0
    epoch_acc = 0

    for text, label in dataloader:
        label = label.float().to(device)

        optimizer.zero_grad()
        pred = model(text.to(device)).squeeze(1).float()
        loss = criterion(pred, label)
        acc = binary_accuracy(pred, label)

        loss.backward()
        optimizer.step()

        # Accumulate a detached value: summing the graph-attached loss would
        # chain the autograd history of every batch of the epoch together.
        epoch_loss += loss.detach()
        epoch_acc += acc

    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)
        
        
def evaluate(model, eval_dataloader, criterion):
    """Compute mean batch loss and accuracy without updating the model.

    Args:
        model: Module mapping a batch of token indices to logits of shape
            (batch, 1).
        eval_dataloader: Iterable of ``(text, label)`` batches that supports ``len``.
        criterion: Loss called as ``criterion(logits, float_labels)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)``.
    """
    model.eval()

    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for text, label in eval_dataloader:
            logits = model(text.to(device)).squeeze(1).float()
            expected = label.float().to(device)
            total_loss += criterion(logits, expected)
            total_acc += binary_accuracy(logits, expected)

    n_batches = len(eval_dataloader)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
# Train for N_EPOCHS, validating and reporting after every epoch.
for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train_epoch(model, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)

    print(
        f'Epoch: {epoch+1}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%,'
        f'    Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%'
    )

In [None]:
# Inference: dropout must be disabled and no autograd graph is needed.
model.eval()

predictions = []
with torch.no_grad():
    for text, _ in test_datloader:
        predictions.append(model(text.to(device)))

# Logits -> probabilities, flattened from (N, 1) to one value per tweet.
predictions = torch.sigmoid(torch.cat(predictions)).squeeze(1).cpu().numpy()

In [None]:
# Submission file: one row per test tweet with its predicted probability.
df_submit = pd.DataFrame({'tweet_id': test_df['tweet_id'].values})
df_submit['label'] = predictions

df_submit.to_csv('solution.csv', index=False)