### NITRO NLP

##### BOGDAN NLP

### Imports

In [1]:
import json
import torch
import numpy as np
import pickle
# from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import pandas as pd
import random
import os
import re
import emoji

  from .autonotebook import tqdm as notebook_tqdm


### Seeder

In [2]:
seed = 10 # Bogdan NLP
os.environ["CUBLAS_WORKSPACE_CONFIG"]=":4096:8"
np.random.seed(seed)
np.random.RandomState(seed)

random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.use_deterministic_algorithms(True)

### Constants

In [3]:
NUM_CLASSES = 5
MODEL_NAME = 'readerbench/RoBERT-base'

PATH_MODELS = "./Models/"
EPOCHS = 3
LR = 5e-6
BATCH_SIZE = 8

In [4]:
label_to_id = {
    "direct" : 0,
    "descriptive" : 1,
    "reporting" : 2,
    "offensive" : 3,
    "non-offensive" : 4
}


id_to_label = {
    0 : "direct",
    1 : "descriptive",
    2 : "reporting",
    3 : "offensive",
    4 : "non-offensive"
}

### Read data

In [None]:
! pip install spacy
! python -m spacy download ro_core_news_lg
import spacy
nlp = spacy.load('ro_core_news_lg')
stopwords = nlp.Defaults.stop_words

In [5]:
def parse_string(s):
    s = " " + s + " "
    s = s.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")
    s = s.lower()
    s = re.sub(r"http\S+ ", " ", s)
    s = re.sub(r"@\S+ ", " ", s)
    s = re.sub(r"#\S+ ", " ", s)
    # s = re.sub(r"#", " ", s)
    s = re.sub(r"\"", " ", s)
    s = re.sub(r"„", " ", s)
    s = re.sub(r"”", " ", s)
    
    s = re.sub(r"http\S+ ", " ", s)
    s = re.sub(r"@\S+ ", " ", s)
    s = re.sub(r'#[a-zA-Z0-9_]+',' ', s)
    s = re.sub(r'@[a-zA-Z0-9_]+',' ', s)
    emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""
    
    emoticon_re = re.compile(emoticon_string, re.VERBOSE | re.I | re.UNICODE)
    s = re.sub(emoticon_re, ' ', s) 
    s = re.sub(r"\"", " ", s)
    s = re.sub(r"„", " ", s)
    s = re.sub(r"”", " ", s)
    emojis = emoji.emoji_list(s)
    s_no_emoji = s
    for emoji_dict in emojis:
        emoji_str = emoji_dict.get("emoji")
        s_no_emoji = re.sub(emoji_str, ' ', s_no_emoji)
    s = s_no_emoji
    s = re.sub(r"\d+"," ",s)
    s = re.sub(r'[^\w\s]', '', s)
    
    for stopword in stopwords:
        s = s.replace(f" {stopword} ", " ")
    
    
    s = re.sub(r"\s+", " ", s)
    s = s.strip()
    return s

In [6]:
test = "\"Alexandra 👦🌷👱‍♀️ ”” „  ăîțșșȘȚĂÎÂ poartă costumul și #băieţii #feat #liviuvârciu @New Yorker de baie accesorizat manual cu pietre disponibil și pe Boutique. Pentru mai multe modele și comenzi va asteptam în pagina de facebook Atelier Beatrice si… https://www.instagram.com/p/Bv1G35hF6Wp/?utm_source=ig_twitter_share&amp;igshid=1u61amzelbgnl"

print(parse_string(test))

alexandra ăîțșșșțăîâ poartă costumul și băieții feat liviuvârciu yorker de baie accesorizat manual cu pietre disponibil și pe boutique. pentru mai multe modele și comenzi va asteptam în pagina de facebook atelier beatrice si…


In [7]:
df = pd.read_csv('./Nitro_NLP_Dataset/train_data.csv')
nitro_dataset = np.array([[parse_string(str(df['Text'][i])), label_to_id[df['Final Labels'][i]]] for i in range(len(df))], dtype = object)

In [8]:
for key, value in id_to_label.items():
    print(f"Pentru label-ul {value} avem {len([x for x in nitro_dataset if x[1] == key])}")

Pentru label-ul direct avem 2156
Pentru label-ul descriptive avem 1494
Pentru label-ul reporting avem 219
Pentru label-ul offensive avem 4301
Pentru label-ul non-offensive avem 30838


In [9]:
nitro_dataset[25]

array(['nu au pus dansul cu doar interviu cu niste fete de la univ romano-americana că au deschis korean corner',
       4], dtype=object)

In [10]:
train_data, valid_data = nitro_dataset, []

print(len(train_data),len(valid_data))

39008 0


### Dataset class

In [11]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, tokenizer_kwargs={'tokenizers': 'whitespace'})
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, strip_accents=False)
class Dataset(torch.utils.data.Dataset):

    def __init__(self, data):

        self.labels = [int(x[1]) for x in data]
        self.texts = [tokenizer(x[0], 
                                padding='max_length', max_length = 512,
                                truncation=True,
                                return_tensors="pt") for x in data]
        
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

### Classifier class

In [12]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, NUM_CLASSES) # CHANGE TO NUM_CLASSES
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

### TRAIN

In [13]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # '''
    # WEIGHTS
    weights = []
    
    unique, counts = np.unique(np.array([x for x in train_data[:,1]]), return_counts=True)
    
    d = dict(zip(unique, counts))
    
#     for i in range(num_classes):
#         weights.append(1.0/d[i])
        
    sum = len(train_data)
    
    for i in range(NUM_CLASSES):
        weights.append((sum/(d[i]*NUM_CLASSES)))
    
    print(weights)
    
    class_weights = torch.FloatTensor(weights).cuda()
    # END WEIGHTS
    # '''
    
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    # criterion = nn.CrossEntropyLoss()
    
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                #print(f"Avem input {train_input} si label {train_label}")
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
#             print(
#                 f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
#                 | Train Accuracy: {total_acc_train / len(train_data): .3f} \
#                 | Val Loss: {total_loss_val / len(val_data): .3f} \
#                 | Val Accuracy: {total_acc_val / len(val_data): .3f}')
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f}')
        
            with open(f"{PATH_MODELS}{MODEL_NAME}_NO_STOPWORDS_{epochs}_{epoch_num+1}.pickle", "wb") as fp:
                pickle.dump(model.state_dict(), fp)

In [14]:
# model = pickle.load(open('model.pkl', 'rb'))
model = BertClassifier()

# with open("./Models/dumitrescustefan/bert-base-romanian-uncased-v1_WEIGHTED_3_3.pickle", "rb") as fp:
#          model.load_state_dict(pickle.load(fp))

Some weights of the model checkpoint at dumitrescustefan/bert-base-romanian-uncased-v1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
train(model, train_data, valid_data, LR, EPOCHS)

[3.618552875695733, 5.221954484605087, 35.62374429223744, 1.813903743315508, 0.25298657500486416]


100%|██████████████████████████████████████████████████████████████████████████████| 4876/4876 [24:32<00:00,  3.31it/s]


Epochs: 1 | Train Loss:  0.040                 | Train Accuracy:  0.893


100%|██████████████████████████████████████████████████████████████████████████████| 4876/4876 [24:30<00:00,  3.31it/s]


Epochs: 2 | Train Loss:  0.034                 | Train Accuracy:  0.905


100%|██████████████████████████████████████████████████████████████████████████████| 4876/4876 [24:33<00:00,  3.31it/s]


Epochs: 3 | Train Loss:  0.029                 | Train Accuracy:  0.919


### EVALUATE

In [16]:
df = pd.read_csv('./Nitro_NLP_Dataset/test_data_diacritice.csv')
test_data = np.array([[parse_string(str(df['Text'][i])), -1] for i in range(len(df))], dtype = object)
ids = df['Id']

In [17]:
def evaluate(model, test_data):
    test = Dataset(test_data)
    labels = []
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            for x in output.argmax(dim = 1):
                labels.append(x)

    return labels

In [18]:
labels = evaluate(model, test_data)

In [19]:
from datetime import datetime
g = open(f"./Nitro_NLP_Submissions/submisie_23.csv", "w")
g.write("Id,Label\n")
for i in range(len(labels)):
    g.write(f"{ids[i]},{id_to_label[int(labels[i])]}\n")
g.close()
    

In [20]:
print(len(labels), len(test_data))

3130 3130
