# Data Augmentation for NT-Grief
> This notebook focuses on utilizing the Back Translation technique to augment the minority class in sentiment datasets, specifically tailored for the NTgrief dataset. It creates an augmented file in English by first translating the minority class to German and then back to English. With appropriate adjustments, this technique can be similarly applied to the Spanish dataset (by translating to German and then back to Spanish).

In [None]:
# Import necessary libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Read the tab-separated dataset, keeping only the columns used downstream
file_path = 'path_to_file_here'
df = pd.read_csv(
    file_path,
    usecols=["id", "tweet", "label"],
    sep='\t',
)

In [None]:
# Report the number of sentences
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display the class distribution.
# Each label is counted explicitly: value_counts() orders its result by
# frequency, so unpacking it positionally would attach the counts to the wrong
# names whenever class 1 is the larger class, and would raise unless the column
# holds exactly two distinct labels.
count_class_0 = int((df['label'] == 0).sum())
count_class_1 = int((df['label'] == 1).sum())
print(count_class_0, count_class_1)

In [None]:
# Split the full dataset into one frame per label value (0 first, then 1)
df_class_0, df_class_1 = (
    df[df['label'] == label_value] for label_value in (0, 1)
)


In [None]:
# Filter for class 1 and convert text to lowercase.
# NOTE: from here on `df` holds only the class-1 rows (lower-cased); the
# untouched per-class frames are `df_class_0` / `df_class_1`.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original data (avoids SettingWithCopyWarning).
df = df[df['label'] == 1].copy()
df['tweet'] = df['tweet'].str.lower()

In [None]:
# Load the Helsinki-NLP checkpoints for both translation directions:
# English -> German first, German -> English second.
# Tokenizers are requested with use_fast=False, as in the original setup.
tokenizer_en_de, tokenizer_de_en = (
    AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
    for checkpoint in ("Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en")
)
model_en_de, model_de_en = (
    AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    for checkpoint in ("Helsinki-NLP/opus-mt-en-de", "Helsinki-NLP/opus-mt-de-en")
)

In [None]:
# Scaffolding for the augmented rows: a plain list that collects the
# back-translated tweets, and an empty frame with the dataset's column layout
aux = list()
df_aux = pd.DataFrame(columns=['id', 'tweet', 'label'])

In [None]:
# Perform back-translation (English -> German -> English) on every tweet in `df`.
# The accumulator is reset here so that re-running this cell does not append a
# second copy of every translation (which would make len(aux) != len(df) and
# break the later column assignment).
aux = []
for original_text in df['tweet']:
    print("Original: ", original_text)

    # Translate from English to German.
    # The tokenizer is called directly: `prepare_seq2seq_batch` is deprecated in
    # transformers in favour of `__call__`. truncation=True keeps the previous
    # behaviour of cutting inputs that exceed the model's maximum length.
    tokenized_text_en_de = tokenizer_en_de(original_text, return_tensors='pt', truncation=True)
    translation_en_de = model_en_de.generate(**tokenized_text_en_de)
    translated_text_en_de = tokenizer_en_de.batch_decode(translation_en_de, skip_special_tokens=True)[0]

    # Translate back from German to English
    tokenized_text_de_en = tokenizer_de_en(translated_text_en_de, return_tensors='pt', truncation=True)
    translation_de_en = model_de_en.generate(**tokenized_text_de_en)
    translated_text_de_en = tokenizer_de_en.batch_decode(translation_de_en, skip_special_tokens=True)[0]

    print("Conversion: ", translated_text_de_en)

    # Collect the back-translated text; it becomes the 'tweet' column of the
    # auxiliary dataframe once the loop has finished
    aux.append(translated_text_de_en)

In [None]:
# Assemble the auxiliary dataframe: ids and labels come from the class-1 rows
# in `df`, and the back-translated texts are slotted in between them so the
# column order stays id, tweet, label
df_aux = df[['id', 'label']].copy()
df_aux.insert(1, 'tweet', aux)

In [None]:
# Stack the augmented rows on top of the original class-0 and class-1 rows
# to obtain the final augmented dataset
df_final = pd.concat([df_aux, df_class_0, df_class_1], axis=0)

In [None]:
# Write the augmented dataset as a tab-separated file with a header row and
# without the dataframe index
output_file_path = 'path_to_train_output_file_here'
df_final.to_csv(
    output_file_path,
    sep='\t',
    header=True,
    index=False,
)


In [None]:
# Display class distribution in the augmented dataset.
# Each label is counted explicitly: value_counts() orders its result by
# frequency, and after augmentation class 1 may well be the larger class, so
# unpacking it positionally could swap the two counts.
count_class_0_aug = int((df_final['label'] == 0).sum())
count_class_1_aug = int((df_final['label'] == 1).sum())
print(count_class_0_aug, count_class_1_aug)