### Imports

In [None]:
import re
from warnings import filterwarnings
import numpy as np
import pandas as pd
import os
import random
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.utils.data import Dataset 
from transformers import AutoModel, AdamW, AutoTokenizer, get_linear_schedule_with_warmup

filterwarnings('ignore')

In [None]:
def seed_everything(seed: int) -> None:
    '''
    Seeds every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every
    CUDA device), exports ``PYTHONHASHSEED`` and puts cuDNN into its
    deterministic configuration.

    Args:
        seed: value used to seed all generators.
    '''
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # so it has to stay disabled for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(0)

In [None]:
class cfg:
    '''
    Paths and hyper-parameters shared by the notebook cells.
    '''
    # Directory with train.tsv / valid.tsv / test.tsv. The trailing slash is
    # required because file names are appended by string concatenation.
    data_dir = './data/'
    num_epochs = 2
    learning_rate = 2e-5
    batch_size = 32
    # Checkpoint name passed to AutoModel / AutoTokenizer.from_pretrained.
    model_name = "sberbank-ai/sbert_large_mt_nlu_ru"
    dropout_prob = 0.3
    # Number of warm-up steps of the learning-rate schedule. It is read when
    # the scheduler is created but was previously missing, which raised
    # AttributeError. 0 means "no warm-up".
    warmup_steps = 0
    # When True, the positive class is resampled *with replacement* to the
    # size of the negative class before training.
    # NOTE(review): the name says "downsample", but the resampling cell
    # oversamples whenever positives are the minority -- confirm the intent.
    downsample = True
    


### Loading and balancing data

In [None]:
# Read the three tab-separated splits from the configured data directory.
train_df, test_df, valid_df = (
    pd.read_csv(cfg.data_dir + file_name, sep='\t')
    for file_name in ('train.tsv', 'test.tsv', 'valid.tsv')
)

# Split the training rows by class (label 1 first, then label 0).
train_positive_class_df, train_negative_class_df = (
    train_df[train_df['label'] == class_label] for class_label in (1, 0)
)

# Cell output: (number of positive rows, number of negative rows).
len(train_positive_class_df), len(train_negative_class_df)

In [None]:
# Class sizes before any resampling.
num_positive_examples = len(train_positive_class_df)
num_negative_examples = len(train_negative_class_df)
if cfg.downsample:
    # Draw, with replacement, as many positive rows as there are negative
    # rows, so both classes end up the same size.
    # NOTE(review): this is oversampling whenever positives are the minority,
    # despite the flag being called ``downsample`` -- confirm the intent.
    # No random_state is passed, so the draw follows NumPy's global seed.
    train_positive_class_df = train_positive_class_df.sample(num_negative_examples,
                                                            replace=True)
    # Merge both classes and shuffle the rows. The original index labels are
    # kept, so they are neither sorted nor unique afterwards.
    train_df = pd.concat((train_positive_class_df, train_negative_class_df)).sample(frac=1)

### Preprocessing
The text normalization below is adapted from the WebVectors `unify.py` script: <br>
https://github.com/akutuzov/webvectors/blob/master/preprocessing/modular_processing/unify.py


In [None]:
def list_replace(search, replacement, text):
    '''
    Substitutes ``replacement`` for every element of ``search``
    that occurs in ``text``.

    Presence is checked against the text as it was passed in; the
    elements found there are then replaced one after another, in the
    order in which they appear in ``search``.
    '''
    result = text
    # ``text`` is never rebound, so the (lazy) membership test below always
    # looks at the original input, not at the partially replaced result.
    for symbol in filter(lambda candidate: candidate in text, search):
        result = result.replace(symbol, replacement)
    return result

# Symbols that survive the final filtering step of ``clean_text``. They are
# built once at definition time instead of on every call.
_CURRENCY_SYMBOLS = '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
_LETTERS_AND_WHITESPACE = '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
_ALLOWED_SYMBOLS = frozenset(_CURRENCY_SYMBOLS + _LETTERS_AND_WHITESPACE)

# (accented letter, plain stand-in) pairs, applied in exactly this order.
_DIACRITIC_FOLDS = (
    ('\u00C4', 'A'), ('\u00E4', 'a'),
    ('\u00CB', 'E'), ('\u00EB', 'e'),
    ('\u1E26', 'H'), ('\u1E27', 'h'),
    ('\u00CF', 'I'), ('\u00EF', 'i'),
    ('\u00D6', 'O'), ('\u00F6', 'o'),
    ('\u00DC', 'U'), ('\u00FC', 'u'),
    ('\u0178', 'Y'), ('\u00FF', 'y'),
    ('\u00DF', 's'), ('\u1E9E', 'S'),
)


def clean_text(text):
    '''
    Normalizes a raw text and strips everything except letters,
    basic whitespace and currency signs.

    Steps, in order: unify quotation marks, dashes, hyphens and exotic
    spaces; collapse doubled em-spaces and tabs; map bullet-like symbols
    to '.'; expand the ellipsis; fold a fixed set of accented Latin
    letters; replace punctuation (and newlines) with a space; mask every
    digit with 'x'; finally drop every symbol that is not a Cyrillic or
    Latin letter, a tab, a carriage return, a space or a currency sign.

    Args:
        text: string to clean.

    Returns:
        The cleaned string (case is preserved).

    NOTE(review): the em/en spaces and '"' produced by the unification
    steps are not in the allowed set, so the final filter removes them;
    words separated only by such a space end up glued together. '$' is
    listed as a currency sign but is already turned into a space by the
    punctuation step. Both behaviors are kept unchanged here.
    '''
    # Typographic quotation marks -> ASCII double quote.
    text = list_replace(
        '\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019',
        '\u0022',
        text
    )

    # Dashes and overlines -> em-space, two hyphens, em-space.
    text = list_replace(
        '\u2012\u2013\u2014\u2015\u203E\u0305\u00AF',
        '\u2003\u002D\u002D\u2003',
        text
    )

    # Typographic hyphens -> ASCII hyphen-minus.
    text = list_replace('\u2010\u2011', '\u002D', text)

    # Assorted Unicode spaces -> en-space.
    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002',
        text
    )

    # Collapse doubled em-spaces / tabs (a single non-overlapping pass each).
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    # Bullet- and dot-like symbols -> full stop.
    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.',
        text
    )

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    for accented, plain in _DIACRITIC_FOLDS:
        text = list_replace(accented, plain, text)

    # Removing punctuation. The backslash is now written as an explicit
    # '\\': the former '\|' and '\?' were invalid escape sequences that only
    # happened to produce the same characters (backslash, '|', '?').
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\\|?\'\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    return ''.join(sym for sym in text if sym in _ALLOWED_SYMBOLS)

In [None]:
def preproc(df):
    '''
    Cleans and lower-cases every tweet of a data frame.

    Args:
        df: frame with a ``tweet`` column holding the raw texts.

    Returns:
        A list with one cleaned, lower-cased string per row, in row
        order, suitable for assignment as a new column.
    '''
    # One string per row. Joining them into a single string would make a
    # column assignment broadcast the concatenation of *all* tweets to
    # every row.
    return [clean_text(tweet).lower() for tweet in df.tweet.values]

# Add the preprocessed text to each split as a ``clean_text`` column
# (train first, then validation, then test).
for split_df in (train_df, valid_df, test_df):
    split_df['clean_text'] = preproc(split_df)

### Model and dataset

In [None]:
class TwitterClassifier(nn.Module):
    '''
    Pretrained encoder followed by dropout and a linear output layer.

    The encoder checkpoint and the dropout probability come from ``cfg``.
    '''

    def __init__(self, n_classes):
        '''
        Args:
            n_classes: number of output logits.
        '''
        super().__init__()
        self.bert = AutoModel.from_pretrained(cfg.model_name)
        self.drop = nn.Dropout(p=cfg.dropout_prob)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        '''
        Returns the output-layer logits for a batch.
        '''
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        # Presumably the last hidden state of the leading [CLS] token, as the
        # original variable name suggested -- verify for the chosen model.
        first_token_state = encoder_outputs[0][:, 0, :]
        regularized = self.drop(first_token_state)
        return self.out(regularized)

class TwitterDataset(Dataset):
    '''
    Map-style dataset that tokenizes one tweet per item.

    Args:
        ids: indexable collection of tweet identifiers.
        tweets: indexable collection of tweet texts.
        targets: indexable collection of integer labels.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face
            tokenizer in this notebook).
        max_len: length every sample is padded / truncated to.
    '''

    def __init__(self, ids, tweets, targets, tokenizer, max_len):
        self.ids = ids
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        '''
        Returns a dict with the tweet id, its text, the flattened
        ``input_ids`` / ``attention_mask`` tensors and the target as a
        ``torch.long`` tensor.
        '''
        tweet = str(self.tweets[item])

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # ``padding='max_length'`` replaces the deprecated
            # ``pad_to_max_length=True`` and matches the ``truncation``
            # argument style already used in this call.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'id': self.ids[item],
            'tweet_text': tweet,
            # Flattened to 1-D; presumably the tokenizer returns a leading
            # batch dimension of size 1 with return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[item], dtype=torch.long)
        }

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Token ids (special tokens included) of every raw tweet, per split.
# NOTE(review): lengths are measured on the raw ``tweet`` column while the
# datasets are later built from ``clean_text`` -- confirm this is intended.
train_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in train_df.tweet]
valid_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in valid_df.tweet]
test_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in test_df.tweet]

# Longest tokenized tweet of each split; used as the padding length.
train_max_len = max(map(len, train_tokenized))
valid_max_len = max(map(len, valid_tokenized))
# Fixed copy-paste bug: this used to be computed from ``valid_tokenized``,
# so longer test tweets were silently truncated to the validation maximum.
test_max_len = max(map(len, test_tokenized))

print(train_max_len)
print(valid_max_len)
print(test_max_len)

In [None]:
def create_data_loader(df, tokenizer, batch_size, max_len):
    '''
    Wraps a data frame in a ``TwitterDataset`` and returns a
    ``DataLoader`` over it (default ordering, no shuffling requested).

    Args:
        df: frame with ``tweet_id`` and ``clean_text`` columns and,
            optionally, a ``label`` column.
        tokenizer: tokenizer handed to the dataset.
        batch_size: number of samples per batch.
        max_len: padding / truncation length handed to the dataset.
    '''
    # Frames without labels get a dummy target of 0 for every row.
    targets = df.label.values if 'label' in df else [0] * len(df)
    dataset = TwitterDataset(
        ids=df.tweet_id.values,
        tweets=df.clean_text.values,
        targets=targets,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(dataset, batch_size=batch_size)

BATCH_SIZE = cfg.batch_size

# Re-encode the training labels with scikit-learn's LabelEncoder
# (only the training split is transformed).
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df["label"])

# One loader per split, each padded to that split's own maximum length.
train_data_loader, valid_data_loader, test_data_loader = (
    create_data_loader(split_df, tokenizer, BATCH_SIZE, split_max_len)
    for split_df, split_max_len in (
        (train_df, train_max_len),
        (valid_df, valid_max_len),
        (test_df, test_max_len),
    )
)


### Training

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Classifier with two output logits, placed on the selected device.
model = TwitterClassifier(2).to(device)

In [None]:
EPOCHS = cfg.num_epochs
optimizer = AdamW(model.parameters(), lr=cfg.learning_rate, correct_bias=True)
# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    # Fall back to "no warm-up" when ``cfg`` does not define ``warmup_steps``;
    # reading the attribute directly raised AttributeError in that case.
    num_warmup_steps=getattr(cfg, 'warmup_steps', 0),
    num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    '''
    Runs one optimisation pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm
    clipping to 1.0, optimizer step, scheduler step, gradient reset.

    Args:
        model: module called with ``input_ids`` and ``attention_mask``.
        data_loader: iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable taking (logits, targets).
        optimizer: optimizer over the model parameters.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler, stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple (accuracy, mean_loss): the number of correct predictions
        divided by ``n_examples`` as a float64 tensor, and the mean of
        the per-batch losses.
    '''
    model = model.train()
    batch_losses = []
    n_correct = 0
    for batch in tqdm(data_loader, desc='TRAIN'):
        input_ids, attention_mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'targets')
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = logits.max(dim=1).indices
        loss = loss_fn(logits, targets)

        n_correct += (preds == targets).sum()
        batch_losses.append(loss.item())

        # Order matters: backward -> clip -> optimizer -> scheduler -> reset.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
@torch.no_grad()
def eval_model(model, data_loader, loss_fn, device, n_examples):
    '''
    Evaluates ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: module called with ``input_ids`` and ``attention_mask``.
        data_loader: iterable of dicts with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable taking (logits, targets).
        device: device the batch tensors are moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple (accuracy, mean_loss): the number of correct predictions
        divided by ``n_examples`` as a float64 tensor, and the mean of
        the per-batch losses.
    '''
    model = model.eval()
    losses = []
    correct_predictions = 0
    # The loop variable used to be ``d`` while the body read ``batch``,
    # which raised NameError (or silently reused a stale global batch).
    for batch in tqdm(data_loader, desc='EVALUATION'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
#gc.collect()
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1:2d}/{EPOCHS:2d}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df))
    valid_acc, valid_loss = eval_model(model, valid_data_loader, loss_fn, device, len(valid_df))
    
    print(f'Train loss {train_loss:.4f} accuracy {train_acc:.4f}')
    print(f'Valid loss {valid_loss:.4f} accuracy {valid_acc:.4f}')

### Predicting labels for the validation and test sets

In [None]:
@torch.no_grad()
def get_predictions(model, data_loader):
    '''
    Predicts labels and class probabilities for every sample of
    ``data_loader``.

    Args:
        model: module called with ``input_ids`` and ``attention_mask``
            and returning one row of logits per sample.
        data_loader: iterable of dicts with ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        Tuple (predictions, prediction_probs) of CPU tensors: the index
        of the most probable class per sample, and the soft-max
        probabilities with one row per sample.
    '''
    model.eval()
    # Run on the device the model lives on instead of relying on a
    # notebook-level ``device`` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    label_batches = []
    prob_batches = []
    for batch in data_loader:
        logits = model(
            input_ids=batch['input_ids'].to(model_device),
            attention_mask=batch['attention_mask'].to(model_device),
        )
        # Normalise over the class dimension explicitly; calling softmax
        # without ``dim`` is deprecated and relies on an implicit choice.
        probs = F.softmax(logits, dim=1)
        label_batches.append(probs.max(dim=1).indices)
        prob_batches.append(probs)

    # Concatenate whole batches rather than stacking row by row.
    predictions = torch.cat(label_batches).cpu()
    prediction_probs = torch.cat(prob_batches).cpu()
    return predictions, prediction_probs

In [None]:
# Predicted labels and class probabilities for the validation split ...
predicted_valid_labels, prediction_probs_valid = get_predictions(model, valid_data_loader)
# ... and for the test split (its loader was built with dummy targets when
# the frame has no ``label`` column).
predicted_test_labels, prediction_probs_test = get_predictions(model, test_data_loader)

### Calculating metrics

In [None]:
valid_labels = valid_df['label']

# Threshold metrics are computed from the predicted labels ...
valid_precision, valid_recall, valid_f_measure = (
    metric(valid_labels, predicted_valid_labels)
    for metric in (precision_score, recall_score, f1_score)
)
# ... while ROC AUC is computed from the probability in column 1.
valid_roc_auc = roc_auc_score(valid_labels, prediction_probs_valid[:, 1].numpy())

print('Validation dataset')

print(f'Precision: {valid_precision:.4f}')
print(f'Recall: {valid_recall:.4f}')
print(f'F-measure: {valid_f_measure:.4f}')
print(f'ROC_AUC: {valid_roc_auc:.4f}')

### Creating submission file

In [None]:
# One row per test tweet: its id and the probability in column 1 of the
# predicted class probabilities.
df_submit = pd.DataFrame({
    'tweet_id': test_df['tweet_id'].values,
    'label': prediction_probs_test[:, 1].tolist(),
})

df_submit.to_csv('convbert_up_solution.csv', sep=',', index=False)

In [None]:
# Persist only the model's state dict (not the module object itself).
# To restore it, rebuild the model and pass the loaded dict to
# ``load_state_dict``. The file is written by ``torch.save`` even though
# it carries a ``.pkl`` extension.
torch.save(model.state_dict(),'model.pkl')