In [2]:
# Downloads wordnet from the NLTK for synonym replacement.
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet as wn

### Importing pandas to organise the data for analysis
import pandas as pd

import random

### Importing MarianMTModel and tokenizer for backtranslation
from transformers import MarianMTModel, MarianTokenizer

### Names of the pretrained MarianMT checkpoints used for back-translation:
###   r_model_name  - translates from English into the Romance languages (French, Portuguese, etc.)
###   en_model_name - translates from the Romance languages back into English
r_model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
en_model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Function List

### Cache of already-loaded (tokenizer, model) pairs, keyed by model name, so that the
### checkpoints are read once instead of on every call to back_translate
_marian_cache = {}

def _load_marian(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it only on first use."""
    if model_name not in _marian_cache:
        _marian_cache[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _marian_cache[model_name]

### Function to translate the texts into French (by default) and back to English
def back_translate(text, r_model_name, en_model_name, target_lang = 'fr', max_length = 60, num_beams = 3):
    """Paraphrase `text` by translating it to a pivot language and back to English.

    Parameters:
        text          - the English text to back-translate.
        r_model_name  - name of the English -> pivot-language MarianMT checkpoint.
        en_model_name - name of the pivot-language -> English MarianMT checkpoint.
        target_lang   - pivot language code added to the text as a '>>code<<' prefix.
                        Multi-target checkpoints such as opus-mt-en-ROMANCE need this
                        prefix to know which language to produce. Pass None for
                        single-target checkpoints that take no prefix.
        max_length    - maximum number of tokens generated for each translation.
        num_beams     - beam-search width (candidate sequences kept per decoding step).

    Returns:
        The back-translated English text as a string.
    """
    r_tokenizer, r_model = _load_marian(r_model_name)
    en_tokenizer, en_model = _load_marian(en_model_name)

    ### Tell the multi-target model which language to translate into (unless already specified)
    if target_lang and not text.lstrip().startswith('>>'):
        text = '>>' + target_lang + '<< ' + text

    ### Translating from English to the pivot language
    encoded_text = r_tokenizer(text, return_tensors = "pt", padding = True, truncation = True)
    fr_translation = r_model.generate(**encoded_text, max_length = max_length, num_beams = num_beams)

    ### Decoding the pivot-language translation
    fr_decoded_text = r_tokenizer.decode(fr_translation[0], skip_special_tokens = True)

    ### Back-translating from the pivot language to English
    fr_encoded_text = en_tokenizer(fr_decoded_text, return_tensors = "pt", padding = True, truncation = True)
    en_translation = en_model.generate(**fr_encoded_text, max_length = max_length, num_beams = num_beams)

    back_translation = en_tokenizer.decode(en_translation[0], skip_special_tokens = True)

    return back_translation

def get_synonyms(word):
    """Return the WordNet synonyms of `word` as a sorted list of lowercase strings.

    Lemma names are normalised before being collected: underscores and hyphens
    become spaces, the text is lowercased, every character other than a-z and
    space is dropped, and surrounding / repeated spaces are collapsed. Names that
    end up empty are skipped. The word itself (normalised the same way) is never
    returned as its own synonym.

    The result is sorted so that a seeded `random.choice` over it is reproducible.
    """
    allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')

    def _normalise(name):
        ### Same clean-up for lemma names and for the query word, so they compare equal
        name = name.replace("_", " ").replace("-", " ").lower()
        name = "".join(char for char in name if char in allowed_chars)
        return " ".join(name.split())

    synonyms = set()

    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonym = _normalise(lemma.name())
            ### Names made only of digits / symbols normalise to an empty string
            if synonym:
                synonyms.add(synonym)

    ### Compare in normalised form: the collected synonyms are all lowercase,
    ### so a capitalised word would otherwise never be removed
    synonyms.discard(_normalise(word))

    return sorted(synonyms)

def synonym_replacement(words, n):
    """Replace up to `n` distinct words of a sentence with a random synonym.

    Parameters:
        words - the sentence, as a single whitespace-separated string.
        n     - maximum number of distinct words to replace. Every occurrence of
                a chosen word is replaced by the same synonym. A value of 0 or
                less leaves the words unchanged.

    Returns:
        The resulting sentence, with its words joined by single spaces.
    """
    tokens = words.split()

    ### Nothing to replace: return before any synonym lookup is made
    if n <= 0:
        return ' '.join(tokens)

    new_words = tokens.copy()

    ### De-duplicate while keeping first-occurrence order (unlike a set, whose order
    ### varies between runs), so the shuffle below is reproducible under a random seed
    random_word_list = list(dict.fromkeys(word for word in tokens if word.isalnum()))
    random.shuffle(random_word_list)
    num_replaced = 0

    for random_word in random_word_list:
        if num_replaced >= n:
            break

        synonyms = get_synonyms(random_word)

        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1

    return ' '.join(new_words)

def add_word(new_words, max_attempts = 10):
    """Insert a synonym of a randomly chosen word at a random position, in place.

    Parameters:
        new_words    - list of words; it is modified in place.
        max_attempts - how many random words are tried before giving up when the
                       chosen words have no synonyms.

    Returns:
        None. The list is left untouched when it is empty or when no synonym was
        found within `max_attempts` tries.
    """
    ### random.choice raises IndexError on an empty list
    if not new_words:
        return

    synonyms = []

    for _ in range(max_attempts):
        synonyms = get_synonyms(random.choice(new_words))
        if synonyms:
            break
    else:
        ### Every attempt picked a word without synonyms
        return

    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

def random_insertion(words, n):
    """Call add_word `n` times on the words of a sentence and return the result.

    Parameters:
        words - the sentence, as a single whitespace-separated string.
        n     - number of times add_word is applied to the word list.

    Returns:
        The resulting sentence, with its words joined by single spaces.
    """
    ### split() already produces a fresh list, so it can be modified in place
    tokens = words.split()

    for _ in range(n):
        add_word(tokens)

    return ' '.join(tokens)

In [None]:
### Read the cleaned Reddit data from its .CSV file into a DataFrame
file = 'data/cleanedRedditData5000.csv'
df = pd.read_csv(filepath_or_buffer = file)

In [None]:
### Empty DataFrame, with the same columns as the source data, created for the augmented data
augmented_df = pd.DataFrame(columns=df.columns)

### Specifying the number of augmentations for each row in the dataset
augmentations_qty = 3

### List that collects the augmented rows (one entry per augmentation of each source row)
augmented_rows = []

# Random Insertion and Synonym Replacement

In [None]:
### Start from an empty list so that re-running this cell does not append duplicate rows
augmented_rows = []

for _idx, row in df.iterrows():
    original_text = row['cleaned_text']

    ### Skip rows whose text is missing (e.g. NaN) or blank: there are no words to augment
    if not isinstance(original_text, str) or not original_text.strip():
        continue

    for _ in range(augmentations_qty):
        ### Synonym Replacement (n being the maximum number of words replaced)
        augmented_text = synonym_replacement(original_text, n = 4)

        ### Random Insertion (n being the number of insertion attempts)
        augmented_text = random_insertion(augmented_text, n = 6)

        ### Create a new row for the augmented text without affecting other columns
        new_row = row.copy()
        new_row['cleaned_text'] = augmented_text

        augmented_rows.append(new_row)

print("SR and RI Complete")

# Back Translation

In [None]:
### Back-translation is only applied to a fixed random sample of 50 rows
### (random_state makes the sample the same on every run)
df_subset = df.sample(n = 50, random_state = 42)

backtranslated_rows = []

for _, source_row in df_subset.iterrows():
    ### Copy the row so that only 'cleaned_text' differs from the original
    translated_row = source_row.copy()
    translated_row['cleaned_text'] = back_translate(source_row['cleaned_text'], r_model_name, en_model_name)
    backtranslated_rows.append(translated_row)

    ### Progress indicator: number of rows translated so far
    print(len(backtranslated_rows))

# Undersampling Majority Positive Labels 


In [10]:
### Load the labelled, augmented dataset under its own names so that the `file` / `df`
### variables holding the cleaned source data are not overwritten; other cells build
### their DataFrames from `df.columns` and concatenate onto `df`
labelled_file = 'data/augmentedredditdata_labelled.csv'
labelled_df = pd.read_csv(labelled_file, index_col=0)
labelled_df.head()

Unnamed: 0_level_0,text,score,comments,submission_date,cleaned_title,cleaned_text,text_sentiment_label,text_compound_score,title_sentiment_label,title_compound_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Is it worth getting the iPhone 15?,I've seen a ton of negative reviews: \n\-Easi...,14,42,2023-09-30 15:23:12,worth get iPhone,see ton negative review \n \-easily crack tre...,Positive,0.25,Positive,0.2263
Why is the demand for the iPhone 15 series so high this year?,I thought iPhone 13 Pro and 14 Pro series alre...,458,689,2023-09-23 11:49:19,demand iPhone series high year,think iPhone Pro Pro series pretty high de...,Positive,0.9153,Negative,-0.128
"Has anyone got the base model of the iPhone 15/plus, what are your thoughts?","Also, how's the camera and battery life?\n\nfe...",30,92,2023-10-09 06:46:41,get base model iPhone /plus thought,camera battery life \n\n feel free share pic c...,Positive,0.7845,Neutral,0.0
15 Plus thoughts - back to iPhone after 3 years,After three long years and handful of android ...,776,401,2023-09-30 07:39:32,plus thought iPhone year,long year handful android phone finally iPhone...,Negative,-0.1953,Neutral,0.0
Anyone bought iphone 15 pro,Anyone here with iphone 15 pro facing absolute...,97,298,2023-10-17 03:44:46,buy iphone pro,iphone pro face absolutely issue \n\n pro ...,Positive,0.128,Neutral,0.0


# Preparation to move to .csv file

In [None]:
### Turn the collected back-translated rows into a DataFrame with the source columns
back_translated_df = pd.DataFrame(backtranslated_rows, columns = df.columns)

### All augmented rows: synonym-replacement / random-insertion rows first, then the back-translated rows
augmented_df = pd.concat(
    [pd.DataFrame(augmented_rows, columns = df.columns), back_translated_df],
    ignore_index = True,
)

### Final dataset: the original rows followed by every augmented row, with a fresh index
final_df = pd.concat([df, augmented_df], ignore_index = True)

In [None]:
### Write the combined original + augmented rows to disk, leaving out the DataFrame index
output_file = 'data/cleanedRedditData5000_AUGMENTED.csv'
final_df.to_csv(output_file, index = False)