# Import required libraries

In [1]:
import nlpaug.augmenter.word as aug_word
import pandas as pd
import regex as re
from collections import Counter
from transformers import pipeline
from transformers import pipeline
import os
import sys
import numpy as np

In [2]:
# Use NLPAUG SynonymAug function to generate similar words (Below will replace atleast 2 words in a sentence with a max of 4 words replaced in a sentence passed.)
similar_word = aug_word.SynonymAug(aug_min=2, aug_max=4)


In [3]:
# Read Train, Test  data from files
train_file = pd.read_csv('../data/train.csv')
final_test_file = pd.read_csv('../data/test.csv')

In [4]:
# Print % of data available for each class label
data_per_class = Counter(train_file['label'])
print("Number of rows grouped by label are:")
for i, label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / train_file.shape[0]*100:.3f})% of total train data")


Number of rows grouped by label are:
unrelated class rows: 175598 (68.475)% of total train data
agreed class rows: 74238 (28.949)% of total train data
disagreed class rows: 6606 (2.576)% of total train data


In [5]:
# Data preprocessing step. Converts labels to numbers, converts titles to lowercase sentences without special chars
label_encoding = {"unrelated":0,"agreed":1,"disagreed":2}
# preprocessing to remove special chars and convert text to lowercase.
def preprocessing(txt):
    txt = re.sub('[^a-zA-Z0-9 ]', '', txt)
    txt = txt.lower()
    return txt

# convert text labels to numbers
def convert_labels(txt):
    return label_encoding[txt]


train_file['title1_en'] = train_file['title1_en'].apply(preprocessing)
train_file['title2_en'] = train_file['title2_en'].apply(preprocessing)
train_file['label'] = train_file['label'].apply(convert_labels)

final_test_file['title1_en'] = final_test_file['title1_en'].apply(preprocessing)
final_test_file['title2_en']= final_test_file['title2_en'].apply(preprocessing)

In [6]:
samp='test\nis bad'
print(samp)


test
is bad


In [7]:
samp=preprocessing(samp)
print(samp)


testis bad


In [8]:
# get the data for agreed and disagreed labels
disagreed_df = train_file.loc[train_file['label'] ==2]
agreed_df=train_file.loc[train_file['label'] ==1]

In [9]:
# Translation can be used as a data augmentation technique, commented as this takes long time especially with data this size.
# translator_en_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
# translator_fr_de = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-de")
# translator_de_es = pipeline("translation", model="Helsinki-NLP/opus-mt-de-es")
# translator_es_en = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

# def translate(ls):
#     en_fr = translator_en_fr(ls)
#     trans_en_fr = []
#     [trans_en_fr.append(x['translation_text']) for x in en_fr]

#     fr_de = translator_fr_de(trans_en_fr)
#     trans_fr_de = []
#     [trans_fr_de.append(x['translation_text']) for x in fr_de]

#     de_es = translator_de_es(trans_fr_de)
#     trans_de_es = []
#     [trans_de_es.append(x['translation_text']) for x in de_es]

#     es_en = translator_es_en(trans_de_es)
#     trans_es_en = []
#     [trans_es_en.append(x['translation_text'].lower()) for x in es_en]

#     return trans_es_en
# translated_disagreed_title1=disagreed_df['title1_en'].apply(translate)
# translated_disagreed_title2=disagreed_df['title2_en'].apply(translate)


In [10]:
# Data Augmentation step for disagreed class. Loops through the diagreed class rows and geenrates similar sentences.
def augmented_text(txt):
    generated_text=similar_word.augment(txt)
    return generated_text

for i in range(4):
    disagreed_data_title1_en=(disagreed_df['title1_en'].apply(augmented_text))
    disagreed_data_title2_en=(disagreed_df['title2_en'].apply(augmented_text))
    disagreed_data_label=pd.Series([2]*len(disagreed_data_title1_en))
    disagreed_data_title1_en.reset_index(drop=True,inplace=True)
    disagreed_data_title2_en.reset_index(drop=True, inplace=True)
    augmented_disagreed_data_df = pd.DataFrame({"title1_en": disagreed_data_title1_en, "title2_en": disagreed_data_title2_en,
                                            "label": disagreed_data_label})
    
    augmented_disagreed_data_df['title1_en'] = augmented_disagreed_data_df['title1_en'].apply(preprocessing)
    augmented_disagreed_data_df['title2_en'] = augmented_disagreed_data_df['title2_en'].apply(preprocessing)
    disagreed_df=pd.concat([disagreed_df, augmented_disagreed_data_df], ignore_index=True)



In [11]:
# Data Augmentation step for agreed class. Loops through the diagreed class rows and geenrates similar sentences.
for i in range(1):
    agreed_data_title1_en = (agreed_df['title1_en'].apply(augmented_text))
    agreed_data_title2_en = (agreed_df['title2_en'].apply(augmented_text))
    agreed_data_label = pd.Series([1]*len(agreed_data_title1_en))
    agreed_data_title1_en.reset_index(drop=True, inplace=True)
    agreed_data_title2_en.reset_index(drop=True, inplace=True)
    augmented_agreed_data_df = pd.DataFrame(
        {"title1_en": agreed_data_title1_en, "title2_en": agreed_data_title2_en, "label": agreed_data_label})

    augmented_agreed_data_df['title1_en'] = augmented_agreed_data_df['title1_en'].apply(preprocessing)
    augmented_agreed_data_df['title2_en'] = augmented_agreed_data_df['title2_en'].apply(preprocessing)
    agreed_df = pd.concat([agreed_df, augmented_agreed_data_df], ignore_index=True)


In [12]:
unrelated_df=train_file.loc[train_file['label'] ==0]
disagreed_df_orig=train_file.loc[train_file['label'] ==2]
disagreed_df=pd.concat([disagreed_df_orig,disagreed_df],ignore_index=True) # add original data to the augmented data

In [13]:
# Concat final training files.
final_train_df = pd.concat([unrelated_df, disagreed_df, agreed_df], ignore_index=True)

In [14]:
# Convert words to vocab and encode data
import spacy
tokenizer = spacy.load('en_core_web_sm')
def tokenize(text):
    return [str(token.text) for token in tokenizer.tokenizer(text)]

counts = Counter()
for index, row in train_file[['title1_en', 'title2_en']].iterrows():
    counts.update(tokenize(str(row['title1_en'])+" "+str(row['title2_en'])))

# Less frequent words can be deleted to decrease vocab size. Not necessary for this dataset
# deleted_words={}
# print(len(counts.keys()))
# for keys in list(counts):
#     if counts[keys]<5:
#         deleted_words[keys] = counts[keys]
#         del counts[keys]
# print(len(counts.keys()))
# print(deleted_words)

vocab2index = {"": 0, "UNK": 1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)


def encode_sentence(text, vocab2index=vocab2index, max_length=30):
    tokenized = tokenize(text)
    encoded = np.zeros(max_length, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(max_length, len(enc1))
    encoded[:length] = enc1[:length]
    
    return encoded, length
    
def join_encode_text(text):
    # print(type(text))
   combibed_text = str(text['title1_en'])+" "+str(text['title2_en'])
   return encode_sentence(combibed_text, vocab2index)[0]


final_train_df['encoded_titles_combined'] = final_train_df[['title1_en', 'title2_en']].apply(join_encode_text, axis=1)
final_train_df['title1_en_encoded']=final_train_df['title1_en'].apply(encode_sentence)
final_train_df['title2_en_encoded']=final_train_df['title2_en'].apply(encode_sentence)

In [17]:
#Checking data distribution by class after data augmentation
data_per_class = Counter(final_train_df['label'])
print("Number of rows grouped by label after data augmentation is:")
for i, label in enumerate(data_per_class):
    print(
        f"{label} class rows: {data_per_class[label]} ({data_per_class[label] / final_train_df.shape[0]*100:.3f})% of total train data")


Number of rows grouped by label after data augmentation is:
0 class rows: 175598 (40.240)% of total train data
2 class rows: 112302 (25.735)% of total train data
1 class rows: 148476 (34.025)% of total train data


In [18]:
# Save train file after data augmentation
if (os.path.exists('../data/augmented_training_file/')):
    final_train_df.to_csv('../data/augmented_training_file/final_train_file2.csv', index=False, columns=[
                          'title1_en', 'title2_en', 'title1_en_encoded', 'title2_en_encoded', 'encoded_titles_combined', 'label'])


In [19]:
# encode test file 
final_test_file['encoded_titles_combined'] = final_test_file[['title1_en', 'title2_en']].apply(join_encode_text, axis=1)
final_test_file['title1_en_encoded']=final_test_file['title1_en'].apply(encode_sentence)
final_test_file['title2_en_encoded']=final_test_file['title2_en'].apply(encode_sentence)

In [20]:
#Save test file
final_test_file.to_csv('../data/final_test_file.csv', index=False, columns=[
    'title1_en', 'title2_en', 'title1_en_encoded', 'title2_en_encoded', 'encoded_titles_combined'])


In [21]:
print(f"Length of vocab from train file is {len(vocab2index)}")

Length of vocab from train file is 49491
