In [None]:
import pandas as pd
import numpy as np
import time


import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, MarianMTModel, MarianTokenizer

In [None]:
# Read the dataset CSV into a DataFrame
english_df = pd.read_csv(filepath_or_buffer="../data/english_dataset_dan_terjemahan.csv")

# Inspect the contents: show the first five rows
print(english_df.head(n=5))

     label                                         final_text  \
0  sadness                            i didnt feel humiliated   
1  sadness  i can go from feeling so hopeless to so damned...   
2    anger   im grabbing a minute to post i feel greedy wrong   
3     love  i am ever feeling nostalgic about the fireplac...   
4    anger                               i am feeling grouchy   

                                                teks  
0                      aku tidak merasa dipermalukan  
1  Saya bisa berubah dari tadinya perasaan putus ...  
2  gua meluangkan waktu sebentar untuk memposting...  
3  Saya pernah merasa nostalgia tentang perapian,...  
4                            aku sedang merasa kesal  


In [None]:
# Check how the rows are distributed across labels
label_counts = english_df['label'].value_counts()
print(label_counts)

label
joy         6740
sadness     5793
anger       2703
fear        2369
love        1630
surprise     713
Name: count, dtype: int64


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the translation checkpoints for both directions.
# Indonesian -> English
id_en_tokenizer, id_en_model = (
    MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en"),
    MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-id-en"),
)

# English -> Indonesian
en_id_tokenizer, en_id_model = (
    MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-id"),
    MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-id"),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def back_translate(text):
    """Paraphrase a sentence by translating it to English and back again.

    The text is first translated with the module-level ``id_en_tokenizer`` /
    ``id_en_model`` pair (Indonesian -> English); the English result is then
    translated back with ``en_id_tokenizer`` / ``en_id_model``.

    Args:
        text: The sentence to back-translate.

    Returns:
        The back-translated sentence. ``text`` itself is returned unchanged
        when it is not a non-blank string, or when any translation step
        raises (the error is printed, not propagated).
    """
    # Missing values (None/NaN) and blank strings carry nothing to translate.
    # Return them as-is instead of pushing them through the tokenizer and
    # relying on the blanket ``except`` below.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Indonesian -> English
        en_tokens = id_en_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        en_translation = id_en_model.generate(**en_tokens)
        en_text = id_en_tokenizer.decode(en_translation[0], skip_special_tokens=True)

        # English -> Indonesian
        id_tokens = en_id_tokenizer(en_text, return_tensors="pt", padding=True, truncation=True)
        back_translation = en_id_model.generate(**id_tokens)
        id_text = en_id_tokenizer.decode(back_translation[0], skip_special_tokens=True)

        return id_text
    except Exception as e:
        # Best-effort: one failed sentence must not abort a long augmentation
        # run, so report the problem and fall back to the original text.
        print(f"[ERROR] {e}")
        return text


In [None]:
# Target: augment minority labels (those with fewer than `threshold` rows)
# by back-translating their existing sentences.
augmented_rows = []
threshold = 3000
for label in english_df['label'].unique():
    label_df = english_df[english_df['label'] == label]
    if len(label_df) < threshold:
        needed = threshold - len(label_df)
        print(f"üîÅ Augmenting '{label}' with {needed} samples...")

        # Each source row is back-translated at most once, so a label cannot
        # gain more rows than it already has. Say so when the request is capped.
        n_sources = min(needed, len(label_df))
        if n_sources < needed:
            print(f"[WARN] '{label}' has only {n_sources} source rows; "
                  f"generating {n_sources} of the {needed} requested samples.")

        # The translation models run locally, so no delay is needed between calls.
        for source_text in label_df['teks'].iloc[:n_sources]:
            augmented_rows.append({'teks': back_translate(source_text), 'label': label})

üîÅ Augmenting 'anger' with 297 samples...
üîÅ Augmenting 'love' with 1370 samples...
üîÅ Augmenting 'surprise' with 2287 samples...
üîÅ Augmenting 'fear' with 631 samples...


In [None]:
# Build the augmented frame. Passing the columns explicitly keeps this working
# when `augmented_rows` is empty (otherwise drop_duplicates raises KeyError).
aug_df = pd.DataFrame(augmented_rows, columns=['teks', 'label']).drop_duplicates(subset='teks')

# Discard augmented sentences that already exist in the dataset: a
# back-translation identical to an existing sentence (or the fallback that
# returns the input on error) would only add an exact duplicate. This also
# makes the cell safe to re-run, since rows appended earlier are filtered out.
aug_df = aug_df[~aug_df['teks'].isin(english_df['teks'])]

# Augmented rows carry only 'teks' and 'label'; any other column of
# `english_df` is filled with NaN for them by the concatenation.
if not aug_df.empty:
    english_df = pd.concat([english_df, aug_df], ignore_index=True)


In [None]:
# Check the label distribution again after augmentation
label_counts = english_df['label'].value_counts()
print(label_counts)

label
joy         6740
sadness     5793
love        2998
anger       2997
fear        2995
surprise    1419
Name: count, dtype: int64


In [None]:
# Save the augmented dataset to CSV, leaving out the DataFrame index
english_df.to_csv(path_or_buf='translated_augmented.csv', index=False)