## AUGMENTATION FOR SIMPLE TRAINING

In [7]:
import random
from nltk.corpus import wordnet
import pandas as pd

# Ensure we have the necessary NLTK data
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
import pandas as pd
import random
from nltk.corpus import wordnet

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if len(synonyms) >= 1:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n: 
            break

    sentence = ' '.join(new_words)
    return sentence

def random_insertion(sentence, n):
    words = sentence.split()
    for _ in range(n):
        add_word = random.choice(words)
        synonyms = wordnet.synsets(add_word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            random_idx = random.randint(0, len(words))
            words.insert(random_idx, synonym)
    return ' '.join(words)

def random_swap(sentence, n):
    words = sentence.split()
    if len(words) < 2:
        return sentence

    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def random_deletion(sentence, p):
    words = sentence.split()
    if len(words) == 1:
        return sentence

    new_words = []
    for word in words:
        if random.uniform(0, 1) > p:
            new_words.append(word)
    if len(new_words) == 0:
        return random.choice(words)
    else:
        return ' '.join(new_words)

# Assume 'data' is already loaded as a DataFrame
data = pd.read_csv('./dataset/processed dataset/TRAINING_INDOBERT.csv')

augmented_data = []

for index, row in data.iterrows():
    original_sentence = row['preprocessed_formal']
    label = row['label']
    
    augmented_data.append([original_sentence, label])
    
    if label == "Politik":
        continue
    
    # Create 3 augmented versions of each sentence
    for _ in range(3):
        aug_sent = synonym_replacement(original_sentence, n=1)
        augmented_data.append([aug_sent, label])
        
        aug_sent = random_insertion(original_sentence, n=1)
        augmented_data.append([aug_sent, label])
        
        aug_sent = random_swap(original_sentence, n=1)
        augmented_data.append([aug_sent, label])
        
        aug_sent = random_deletion(original_sentence, p=0.1)
        augmented_data.append([aug_sent, label])

# Convert to DataFrame
augmented_df = pd.DataFrame(augmented_data, columns=['preprocessed_formal', 'label'])


augmented_df.drop_duplicates(inplace=True)
# Save the augmented data to a CSV file
augmented_df.to_csv('augmented_data.csv', index=False)


In [17]:
augmented_df['label'].value_counts()

label
Sosial Budaya              3442
Politik                    2893
Ideologi                   2561
Ekonomi                    2513
Pertahanan dan Keamanan    2495
Sumber Daya Alam           1211
Demografi                   494
Geografi                    158
Name: count, dtype: int64