# Setup

In [23]:
!pip -q install nltk==3.2.4
!pip -q install spacy_udpipe==1.0.0
!pip -q install tqdm==4.66.1
!pip -q install gensim==4.3.2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [24]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
import random
import gensim
import spacy_udpipe
import re
from tqdm.notebook import tqdm_notebook as tqdm

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Augmentation
## DDA Functions

In [25]:
# RANDOM SWAP ---------------------------------------------------
def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

# RANDOM INSERTION ---------------------------------------------------
def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
            add_word(new_words)
    return new_words

def add_word(new_words):
    synonyms = []
    counter = 0

    while len(synonyms) <1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        #synonyms = self.synonyms_cadidates(random_word, self.df)
        synonyms = list(get_synonyms_vec(random_word))
        counter += 1
        if counter > 10:
            return

    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

# RANDOM DELETION ---------------------------------------------------
def random_deletion(words, p):
    """
    Randomly delete words from a sentence with probability p
    :param words:
    :param p:
    :return:
    """
    if len(words) == 1:
        return words
    new_words = []
    for word in words:
        r = random.uniform(0, 1) # random number between 0.0 and 1.0
        if r > p: #kinda elegant when you think about it
            new_words.append(word)
    #if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

# VECTOR-BASED SYNONYM REPLACEMENT ----------------------------------
stop_words = list(set(nltk.corpus.stopwords.words('english')))
def synonym_replacement_vec(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms_vec(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word.lower() == random_word else word for word in new_words]
            # print("replaced", random_word, "with", synonym)
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            break
    # this is stupid but we need it, trust me
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')
    return new_words

def get_synonyms_vec(word):
    synonyms = set()
    flag = False
    vec = None
    try:
        vec = wv_from_text.similar_by_word(word.lower())
    except KeyError:
        flag = True
        pass
    if flag is False:
        synonyms.add(vec[0][0])
    if word in synonyms:
        synonyms.remove(word)
    return synonyms

In [26]:
def augmentation(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, alpha_rd=0.1, num_aug=9):
    """
    @param sentence
    @param alpha_sr synonym replacement rate, percentage of the total sentence
    @param alpha_ri random insertion rate, percentage of the total sentence
    @param alpha_rs random swap rate, percentage of the total sentence
    @param alpha_rd random deletion rate, percentage of the total sentence
    @param num_aug how many augmented sentences to create

    @return list of augmented sentences
    """
    words_list = sentence.split(' ')  # list of words in the sentence
    words = [word for word in words_list if word != '']  # remove empty words
    num_words = len(words_list)  # number of words in the sentence

    augmented_sentences = []
    num_new_per_technique = int(num_aug / 4) + 1 # number of augmented sentences per technique
 
    #synonmym replacement
    if (alpha_sr > 0):
        n_sr = max(1, int(alpha_sr * num_words)) # number of words to be replaced per technique
        #print("Number of words to be replaced per technique: ", n_sr)
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement_vec(words, n_sr)
            augmented_sentences.append(' '.join(a_words))
    #random insertion
    if (alpha_ri > 0):
        n_ri = max(1,int(alpha_ri * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(' '.join(a_words))
    #Random Deletion
    if (alpha_rd > 0):
        for _ in range(num_new_per_technique):
            a_words = random_deletion(words, alpha_rd)
            augmented_sentences.append(' '.join(a_words))
    #Random Swap
    if (alpha_rs > 0):
        n_rs = max(1, int(alpha_rs * num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))

    return augmented_sentences

In [27]:
def create_encodings(df, name):
    texts = [gensim.utils.simple_preprocess(text) for text in df.text]
    words = [word.encode('utf-8') for words in texts for word in words]
    w2v_model = gensim.models.Word2Vec(texts, min_count=1, vector_size=300) # Train Word2Vec model
    filename = f'word2vec_{name}_300dim.txt'
    w2v_model.wv.save_word2vec_format(filename, binary=False) # Save Model 
    return filename
    
def create_augmented_df(df, val_df, sr, ri, rs, rd, n, name):
    aug_data = {0:[],1:[],2:[],3:[],4:[],5:[],6:[]}
    n_sentences = df.shape[0]
    tqdm.pandas(desc="Augmentation Progress ")
    df['augmented'] = df.text.progress_apply(lambda x: augmentation(x, 
                                                           alpha_sr=0.3, alpha_ri=0.2, 
                                                           alpha_rs=0.4, alpha_rd=0.3, 
                                                           num_aug=4))
    aug_df = df[['augmented','label']].rename(columns={'augmented':'text'})
    aug_df = aug_df.explode('text', ignore_index=True)
    aug_df.to_csv(f"{name}_aug_{sr}_{ri}_{rs}_{rd}_{n}.csv")
    val_df.to_csv(f"{name}_aug_{sr}_{ri}_{rs}_{rd}_{n}_test.csv")
    return aug_df

def sample_df(df, n_rows=None):
    n_rows = df.label.value_counts().min() if n_rows == None else n_rows
    def sample_rows(group, x):
        if group.shape[0] > x:
            return group.sample(x)
        else:
            return group
    sampled_df = df.groupby('label')[['label','text']].apply(lambda group: sample_rows(group, n_rows)).reset_index(drop=True)
    sampled_df = sampled_df.sample(frac=1).reset_index(drop=True)
    return sampled_df

# Preprocess Data

In [37]:
# DATA_PATH = "/kaggle/input/goemotions-7-emotions/goemotions.csv"
DATA_PATH = "/kaggle/input/emotions/text.csv"

def get_filename(path):
    return path.split('/')[-1].split('.')[0]

In [40]:
import pandas as pd

data = pd.read_csv(DATA_PATH)
print(f"Original dataset: {data.shape}")

sampled_data = sample_df(data, 1000)
sampled_data.to_csv(f'{get_filename(DATA_PATH)}_sampled_{sampled_data.shape[0]}.csv')
print(f"Sampled dataset: {sampled_data.shape}")
enc_filename = create_encodings(data, get_filename(DATA_PATH)) # Create Word2Vec embeddings
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format(enc_filename, 
                                                               binary=False)
print("Word2Vec loaded from:", enc_filename)

Original dataset: (416809, 3)
Sampled dataset: (6000, 2)
Word2Vec loaded from: word2vec_text_300dim.txt


In [41]:
from sklearn.model_selection import train_test_split
y = list(sampled_data.label)
train_X, val_X, train_y, val_y = train_test_split(sampled_data.text, 
                                                  sampled_data.label, 
                                                  test_size=0.2, random_state=42, stratify=y)

train_data = pd.DataFrame({'text':train_X, 'label':train_y})
val_data = pd.DataFrame({'text':val_X, 'label':val_y}).reset_index()

print(f"Training data: {train_data.shape}")

train_data_augmented = create_augmented_df(train_data, val_data,
                                        .3, .2, .4, .3, , 
                                        name=get_filename(DATA_PATH))
print(f"Augmented Training data: {train_data_augmented.shape}")

Original data: (4800, 2)


Augmentation Progress :   0%|          | 0/4800 [00:00<?, ?it/s]

Augmented data: (38400, 2)


# Fine-tuning BERT

In [50]:
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import torch
from torch.nn.parallel import DataParallel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm_notebook as tqdm

def split_data(df):
    y = list(df.label)
    train_X, val_X, train_y, val_y = train_test_split(df.text, df.label, 
                                                      test_size=0.2, random_state=42, 
                                                      stratify=y)
    return train_X, val_X, train_y, val_y

def create_dataloader(tokenizer, 
                      train_X=None, val_X=None, 
                      train_y=None, val_y=None):
    train_X = train_X.tolist()
    val_X = val_X.tolist()
    train_y = np.array(train_y, dtype=np.float64)
    val_y = np.array(val_y, dtype=np.float64)
    train_encodings = tokenizer(train_X, truncation=True, 
                                padding=True, max_length=128, 
                                return_tensors='pt')
    val_encodings = tokenizer(val_X, truncation=True, 
                              padding=True, max_length=128, 
                              return_tensors='pt')
    train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                                  train_encodings['attention_mask'],
                                                  torch.tensor(train_y, dtype=torch.float64))
    val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'],
                                                val_encodings['attention_mask'],
                                                torch.tensor(val_y, dtype=torch.float64))
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32)
    return train_dataloader, val_dataloader

class Classifier(torch.nn.Module):
    def __init__(self, transformer_model, num_classes):
        super(Classifier, self).__init__()
        self.transformer = transformer_model
        self.fc = torch.nn.Linear(768, num_classes)  
        
    def forward(self, input_ids, attention_mask):
        output = self.transformer(input_ids=input_ids, 
                                  attention_mask=attention_mask)
        pooled_output = output.pooler_output 
        logits = self.fc(pooled_output)
        return logits
    
def download_models():
    model = AutoModel.from_pretrained("bert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    return model, tokenizer
model, tokenizer = download_models()
    
def train_model(train_X, val_X, train_y, val_y, model=model, tokenizer=tokenizer):
    train_dataloader, val_dataloader = create_dataloader(tokenizer, train_X, val_X, train_y, val_y)
    num_classes = 6
    num_epochs = 3
    learning_rate = 1e-5

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Classifier(model, num_classes)
    model = DataParallel(model)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()
    
    for epoch in tqdm(range(num_epochs), desc="Epochs Progress "):
        model.train()
        total_loss = 0.0
        total_correct = 0

        # Training loop 
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1} "):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)

            labels = labels.to(device).long()
            outputs = outputs.float()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()

#             if i  % 100 == 0:
#                 print(f"Loss: {loss:.7f}  |  Batch: [{i:>5d}/{len(train_dataloader):>5d}]")

        train_loss = total_loss / len(train_dataloader)
        train_accuracy = total_correct / len(train_dataloader.dataset)

    # Validation loop
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    val_predicted = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation "):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask)
            labels = labels.to(device).long()

            outputs = outputs.float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

#                 if i % 50 == 0:
#                     print(f"Batch {i:4d}  |  Loss {loss.item():.4f}")
                    
    val_predicted = np.array(val_predicted)
    val_labels = np.array(val_labels)

    print(classification_report(val_labels, val_predicted, target_names = [f'Class {i}' for i in range(num_classes)]))

In [43]:
data_ge_augmented = pd.read_csv("goemotions_aug_0.3_0.2_0.4_0.3_4.csv")
data_ge_augmented_test = pd.read_csv("goemotions_aug_0.3_0.2_0.4_0.3_4_test.csv")
data_ge_sampled = pd.read_csv("goemotions_sampled_5166.csv")
data_ge_original = pd.read_csv("/kaggle/input/goemotions-7-emotions/goemotions.csv")
data_e_sampled = pd.read_csv("text_sampled_6000.csv")
data_e_augmented = pd.read_csv("text_aug_0.3_0.2_0.4_0.3_4.csv")
data_e_augmented_test = pd.read_csv("text_aug_0.3_0.2_0.4_0.3_4_test.csv")

print("GOEMOTIONS -------------------------------",
      f"Augmented Train Data : {data_ge_augmented.shape}",
      f"Sampled Test Data : {data_ge_augmented_test.shape}",
      f"Sampled Original Data : {data_ge_sampled.shape}",
      f"Original Data : {data_ge_original.shape}", 
      "EMOTIONS -------------------------------",
      f"Augmented Train Data : {data_e_augmented.shape}",
      f"Sampled Test Data : {data_e_augmented_test.shape}",
      f"Sampled Original Data : {data_e_sampled.shape}",
      sep='\n')

GOEMOTIONS -------------------------------
Augmented Train Data : (33056, 3)
Sampled Test Data : (1034, 4)
Sampled Original Data : (5166, 3)
Original Data : (54263, 3)
EMOTIONS -------------------------------
Augmented Train Data : (38400, 3)
Sampled Test Data : (1200, 4)
Sampled Original Data : (6000, 3)


In [51]:
%%time
# splitted_data = split_data(data_ge_sampled)
splitted_data = split_data(data_e_sampled)
train_model(*splitted_data)

Epochs Progress :   0%|          | 0/3 [00:00<?, ?it/s]

Training Epoch 1 :   0%|          | 0/150 [00:00<?, ?it/s]

Training Epoch 2 :   0%|          | 0/150 [00:00<?, ?it/s]

Training Epoch 3 :   0%|          | 0/150 [00:00<?, ?it/s]

Validation :   0%|          | 0/38 [00:00<?, ?it/s]

              precision    recall  f1-score   support

     Class 0       0.85      0.91      0.88       200
     Class 1       0.90      0.82      0.86       200
     Class 2       0.90      0.90      0.90       200
     Class 3       0.90      0.94      0.92       200
     Class 4       0.91      0.85      0.88       200
     Class 5       0.91      0.96      0.94       200

    accuracy                           0.90      1200
   macro avg       0.90      0.90      0.90      1200
weighted avg       0.90      0.90      0.90      1200



In [52]:
%%time

#### GoEmomtions Dataset ####
# train_model(data_ge_augmented['text'], data_ge_augmented_test['text'],
#            data_ge_augmented['label'], data_ge_augmented_test['label'])

#### Emotions Dataset ####
train_model(data_e_augmented['text'], data_e_augmented_test['text'],
           data_e_augmented['label'], data_e_augmented_test['label'])

Epochs Progress :   0%|          | 0/3 [00:00<?, ?it/s]

Training Epoch 1 :   0%|          | 0/1200 [00:00<?, ?it/s]

Training Epoch 2 :   0%|          | 0/1200 [00:00<?, ?it/s]

Training Epoch 3 :   0%|          | 0/1200 [00:00<?, ?it/s]

Validation :   0%|          | 0/38 [00:00<?, ?it/s]

              precision    recall  f1-score   support

     Class 0       0.83      0.82      0.83       200
     Class 1       0.92      0.80      0.86       200
     Class 2       0.84      0.95      0.89       200
     Class 3       0.87      0.86      0.87       200
     Class 4       0.84      0.81      0.83       200
     Class 5       0.88      0.93      0.91       200

    accuracy                           0.86      1200
   macro avg       0.86      0.86      0.86      1200
weighted avg       0.86      0.86      0.86      1200

