In [18]:
import nlpaug.augmenter.word as aug_word
import pandas as pd
import regex as re
from collections import Counter
from transformers import pipeline
from transformers import pipeline
import os
import sys

In [2]:
# Shared synonym-replacement augmenter used by `augmented_text`.
# Per the nlpaug docs, `aug_min` / `aug_max` bound how many words are
# substituted in each augmented text.
similar_word = aug_word.SynonymAug(
    aug_min=2,
    aug_max=4,
)


In [3]:
# Load the training split and the final test split from the shared data folder.
train_csv_path = '../data//train.csv'
test_csv_path = '../data/test.csv'

train_file = pd.read_csv(train_csv_path)
final_test_file = pd.read_csv(test_csv_path)

In [4]:
# Class balance of the raw training data: row count per label and its
# percentage of all training rows.
data_per_class = Counter(train_file['label'])
total_rows = train_file.shape[0]

print("Number of rows grouped by label are:")
for label, row_count in data_per_class.items():
    share = row_count / total_rows*100
    print(f"{label} class rows: {row_count} ({share:.3f})% of total train data")


Number of rows grouped by label are:
unrelated class rows: 175598 (68.475)% of total train data
agreed class rows: 74238 (28.949)% of total train data
disagreed class rows: 6606 (2.576)% of total train data


In [5]:
# Integer id assigned to each textual class label.
label_encoding = {
    "unrelated": 0,
    "agreed": 1,
    "disagreed": 2,
}
def preprocessing(txt):
    """Normalise a title: drop special characters, then lowercase.

    Every character that is not an ASCII letter, a digit or a space is
    deleted (not replaced by a space), so e.g. "well-known" becomes
    "wellknown".

    Args:
        txt: The text to clean; must be a string.

    Returns:
        The cleaned, lower-cased string.
    """
    return re.sub('[^a-zA-Z0-9 ]', '', txt).lower()

def convert_labels(txt):
    """Translate a textual class label into its integer id.

    Args:
        txt: One of the keys of ``label_encoding``.

    Returns:
        The integer stored for that label in ``label_encoding``.

    Raises:
        KeyError: If ``txt`` is not a key of ``label_encoding``.
    """
    encoded_label = label_encoding[txt]
    return encoded_label


# Clean both English title columns of the train and the test frame.
for frame in (train_file, final_test_file):
    for title_column in ('title1_en', 'title2_en'):
        frame[title_column] = frame[title_column].apply(preprocessing)

# Replace the textual training labels with their integer ids.
# NOTE: this overwrites the column, so running the cell a second time raises
# KeyError (the integers are not keys of `label_encoding`).
train_file['label'] = train_file['label'].apply(convert_labels)



In [6]:
# Same class-balance report as before, now keyed by the encoded integer labels.
data_per_class = Counter(train_file['label'])
total_rows = train_file.shape[0]

print("Number of rows grouped by label are:")
for label, row_count in data_per_class.items():
    share = row_count / total_rows*100
    print(f"{label} class rows: {row_count} ({share:.3f})% of total train data")


Number of rows grouped by label are:
0 class rows: 175598 (68.475)% of total train data
1 class rows: 74238 (28.949)% of total train data
2 class rows: 6606 (2.576)% of total train data


In [7]:
# Pull out the rows of the two smaller classes by encoded label
# (2 = disagreed, 1 = agreed).
is_disagreed = train_file['label'] == 2
is_agreed = train_file['label'] == 1

disagreed_df = train_file[is_disagreed]
agreed_df = train_file[is_agreed]

In [8]:

# translator_en_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
# translator_fr_de = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-de")
# translator_de_es = pipeline("translation", model="Helsinki-NLP/opus-mt-de-es")
# translator_es_en = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

# def translate(ls):
#     en_fr = translator_en_fr(ls)
#     trans_en_fr = []
#     [trans_en_fr.append(x['translation_text']) for x in en_fr]

#     fr_de = translator_fr_de(trans_en_fr)
#     trans_fr_de = []
#     [trans_fr_de.append(x['translation_text']) for x in fr_de]

#     de_es = translator_de_es(trans_fr_de)
#     trans_de_es = []
#     [trans_de_es.append(x['translation_text']) for x in de_es]

#     es_en = translator_es_en(trans_de_es)
#     trans_es_en = []
#     [trans_es_en.append(x['translation_text'].lower()) for x in es_en]

#     return trans_es_en


In [9]:
# translated_disagreed_title1=disagreed_df['title1_en'].apply(translate)
# translated_disagreed_title2=disagreed_df['title2_en'].apply(translate)


In [10]:
def augmented_text(txt, augmenter=None):
    """Return one synonym-augmented variant of ``txt`` as a plain string.

    Older nlpaug releases return a ``str`` from ``augment`` for a single
    input, while recent releases (reportedly 1.1.11+) return a list of
    strings. The callers in this notebook feed the result to
    ``preprocessing``, which needs a string, so the result is unwrapped here.

    Args:
        txt: The text to augment.
        augmenter: Object exposing ``augment(text)``. Defaults to the
            notebook-level ``similar_word`` augmenter.

    Returns:
        The augmented text. If the augmenter yields an empty list (nothing
        to augment), ``txt`` is returned unchanged.
    """
    if augmenter is None:
        augmenter = similar_word

    generated_text = augmenter.augment(txt)

    # Unwrap list-style results so callers always receive a string.
    if isinstance(generated_text, (list, tuple)):
        return generated_text[0] if generated_text else txt
    return generated_text



In [11]:
# Oversample the "disagreed" class with three rounds of synonym augmentation.
# Every round augments the whole accumulated frame (earlier augmented rows
# included) and appends the result, so the frame doubles each round
# (x8 after three rounds).
# NOTE(review): re-running this cell keeps doubling `disagreed_df`, and no
# random seed is set anywhere in this notebook, so results are presumably not
# reproducible between runs — confirm.
for _round in range(3):
    augmented_columns = {}
    for title_column in ('title1_en', 'title2_en'):
        augmented_columns[title_column] = (
            disagreed_df[title_column]
            .apply(augmented_text)
            .reset_index(drop=True)
            .apply(preprocessing)  # augmented text is cleaned again
        )
    # All generated rows keep the encoded "disagreed" label.
    augmented_columns['label'] = pd.Series([2] * len(disagreed_df))

    augmented_disagreed_data_df = pd.DataFrame(augmented_columns)
    disagreed_df = pd.concat(
        [disagreed_df, augmented_disagreed_data_df],
        ignore_index=True,
    )



In [12]:
# # Augmentation for agreed class
# for i in range(1):
#     agreed_data_title1_en = (agreed_df['title1_en'].apply(augmented_text))
#     agreed_data_title2_en = (agreed_df['title2_en'].apply(augmented_text))
#     agreed_data_label = pd.Series([1]*len(agreed_data_title1_en))
#     agreed_data_title1_en.reset_index(drop=True, inplace=True)
#     agreed_data_title2_en.reset_index(drop=True, inplace=True)
#     augmented_agreed_data_df = pd.DataFrame(
#         {"title1_en": agreed_data_title1_en, "title2_en": agreed_data_title2_en, "label": agreed_data_label})

#     augmented_agreed_data_df['title1_en'] = augmented_agreed_data_df['title1_en'].apply(
#         preprocessing)
#     augmented_agreed_data_df['title2_en'] = augmented_agreed_data_df['title2_en'].apply(
#         preprocessing)
#     agreed_df = pd.concat(
#         [agreed_df, augmented_agreed_data_df], ignore_index=True)


In [13]:
# Split the (un-augmented) training frame into one frame per encoded label.
encoded_labels = train_file['label']

unrelated_df = train_file[encoded_labels == 0]
agreed_df = train_file[encoded_labels == 1]
disagreed_df_orig = train_file[encoded_labels == 2]

In [14]:
# Prepend the original disagreed rows to the augmented frame.
# NOTE(review): `disagreed_df` was seeded from the original disagreed rows and
# the augmentation loop only appends to it, so the originals appear to end up
# in the result twice (the recorded count 59454 = 9 x 6606 is consistent with
# that) — confirm this is intended. Re-running this cell appends them again.
disagreed_parts = [disagreed_df_orig, disagreed_df]
disagreed_df = pd.concat(disagreed_parts, ignore_index=True)

In [15]:
# Stack the three per-class frames (unrelated, disagreed, agreed) into the
# final training set with a fresh 0..n-1 index.
class_frames = [unrelated_df, disagreed_df, agreed_df]
final_train_df = pd.concat(class_frames, ignore_index=True)


In [16]:
# Class balance of the final training set, to check the effect of augmentation.
data_per_class = Counter(final_train_df['label'])
total_rows = final_train_df.shape[0]

print("Number of rows grouped by label after data augmentation is:")
for label, row_count in data_per_class.items():
    share = row_count / total_rows*100
    print(f"{label} class rows: {row_count} ({share:.3f})% of total train data")


Number of rows grouped by label after data augmentation is:
0 class rows: 175598 (56.775)% of total train data
2 class rows: 59454 (19.223)% of total train data
1 class rows: 74238 (24.003)% of total train data


In [21]:
# Persist the augmented training set (titles and label only).
# The output directory is created when missing: previously the save was
# silently skipped in that case, losing the augmentation result without any
# message.
output_dir = '../data/augmented_training_file/'
os.makedirs(output_dir, exist_ok=True)

final_train_df.to_csv(
    os.path.join(output_dir, 'final_train_file.csv'),
    index=False,
    columns=['title1_en', 'title2_en', 'label'],
)
