In [None]:
!pip install nlpaug
!pip install googletrans==4.0.0-rc1

In [None]:
import nlpaug.augmenter.char as nac
from googletrans import Translator
import string
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from nlpaug.util import Action

In [None]:
# Device that hosts the paraphrase model.
device = "cuda"

# The tokenizer and the seq2seq model are loaded from the same checkpoint name.
paraphraser_checkpoint = "humarin/chatgpt_paraphraser_on_T5_base"

tokenizer = AutoTokenizer.from_pretrained(paraphraser_checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_checkpoint)
model = model.to(device)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only the characters listed in ``string.punctuation`` are deleted; every
    other character is kept as-is.
    """
    return text.translate(str.maketrans('', '', string.punctuation))

def paraphrase_thai(text):
    """Paraphrase ``text`` by going through English and back to Thai.

    Steps:
      1. Translate ``text`` to English (``dest='en'``) and strip punctuation.
      2. Produce English paraphrases with ``paraphrase_function``.
      3. Translate every paraphrase to Thai (``dest='th'``) and strip
         punctuation again.

    Returns:
        A list with one Thai string per paraphrase returned by
        ``paraphrase_function``.
    """
    translator = Translator()

    # Step 1: source text -> English, punctuation removed.
    english = remove_punctuation(translator.translate(text, dest='en').text)

    # Steps 2 and 3: paraphrase in English, then translate each candidate to Thai.
    return [
        remove_punctuation(translator.translate(candidate, dest='th').text)
        for candidate in paraphrase_function(english)
    ]

def paraphrase_function(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level ``model``.

    The input is prefixed with ``'paraphrase: '``, tokenized with the
    module-level ``tokenizer`` and passed to ``model.generate``.

    Args:
        question: Text to paraphrase.
        num_beams, num_beam_groups, num_return_sequences, repetition_penalty,
        diversity_penalty, no_repeat_ngram_size: Forwarded unchanged to
            ``model.generate``.
        max_length: Used both as the tokenizer truncation length and as
            ``max_length`` for ``model.generate``.

    Returns:
        The generated sequences decoded by ``tokenizer.batch_decode`` with
        special tokens skipped.
    """
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    # Put the inputs on the device the model actually lives on rather than a
    # hard-coded 'cuda:0', so inputs and weights cannot end up on different devices.
    input_ids = encoded.input_ids.to(model.device)

    outputs = model.generate(
        input_ids, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def random_insert(text, aug_char_p=0.05):
    """Insert random Thai characters into ``text`` using nlpaug.

    Args:
        text: String to augment.
        aug_char_p: Forwarded to ``nac.RandomCharAug`` as ``aug_char_p``.

    Returns:
        The first augmented string with every non-breaking space (U+00A0)
        and every regular space removed. Falsy input (e.g. an empty string)
        is returned unchanged.
    """
    # Nothing to augment. This also avoids indexing ``[0]`` into the empty
    # result the augmenter appears to return for empty input.
    if not text:
        return text

    thai_letters = [chr(code) for code in range(0xE01, 0xE3A)]  # U+0E01 to U+0E39
    augmenter = nac.RandomCharAug(action='insert', candidates=thai_letters, aug_char_p=aug_char_p)
    augmented = augmenter.augment(text)
    # Remove both kinds of space from the augmented output.
    return augmented[0].replace('\xa0', '').replace(' ', '')

def random_deletion(text, aug_char_p=0.05):
    """Delete random characters from ``text`` using nlpaug.

    Args:
        text: String to augment.
        aug_char_p: Forwarded to ``nac.RandomCharAug`` as ``aug_char_p``.

    Returns:
        The first augmented string with every non-breaking space (U+00A0)
        and every regular space removed. Falsy input (e.g. an empty string)
        is returned unchanged.
    """
    # Nothing to augment. This also avoids indexing ``[0]`` into the empty
    # result the augmenter appears to return for empty input.
    if not text:
        return text

    augmenter = nac.RandomCharAug(action='delete', aug_char_p=aug_char_p)
    augmented = augmenter.augment(text)
    # Remove both kinds of space from the augmented output.
    return augmented[0].replace('\xa0', '').replace(' ', '')

def random_swap(text, aug_char_p=0.05):
    """Swap random characters of ``text`` using nlpaug.

    Args:
        text: String to augment.
        aug_char_p: Forwarded to ``nac.RandomCharAug`` as ``aug_char_p``.

    Returns:
        The first augmented string with every non-breaking space (U+00A0)
        and every regular space removed. Falsy input (e.g. an empty string)
        is returned unchanged.
    """
    # Nothing to augment. This also avoids indexing ``[0]`` into the empty
    # result the augmenter appears to return for empty input.
    if not text:
        return text

    augmenter = nac.RandomCharAug(action='swap', aug_char_p=aug_char_p)
    augmented = augmenter.augment(text)
    # Dropping U+00A0 directly gives the same result as first turning it into
    # a regular space and then dropping all regular spaces.
    return augmented[0].replace('\xa0', '').replace(' ', '')

def back_translation(text, destination_language='en', source_language=None):
    """Translate ``text`` to a pivot language and, optionally, back again.

    Args:
        text: Text to translate.
        destination_language: Language code of the pivot translation.
        source_language: Language code to translate the pivot text back
            into. When ``None`` (the default) only the forward translation
            is performed, which is the function's historical behavior, so
            the returned text is in ``destination_language``.

    Returns:
        The translated text: the pivot translation when ``source_language``
        is ``None``, otherwise the pivot translation translated to
        ``source_language``.
    """
    translator = Translator()
    pivot_text = translator.translate(text, dest=destination_language).text

    # Keep the one-way behavior unless the caller explicitly asks for the
    # round trip by naming the language to translate back into.
    if source_language is None:
        return pivot_text

    return translator.translate(pivot_text, dest=source_language).text

In [None]:
# Demo cell: run every augmentation helper once on a single sample question
# and print the results.

# Sample Thai question
thai_question = "วิทยาศาสตร์ควอนตัมคืออะไร"

# Call the functions and store the augmented texts.
# The three random_* helpers each return one string; back_translation is
# called with its default arguments; paraphrase_thai returns a list of strings.
insert_augmented = random_insert(thai_question)
delete_augmented = random_deletion(thai_question)
swap_augmented = random_swap(thai_question)
back_translation_result = back_translation(thai_question)
paraphrased_texts = paraphrase_thai(thai_question)

# Display the augmented texts, one line per result
print(f"Random Insertion: {insert_augmented}")
print(f"Random Deletion: {delete_augmented}")
print(f"Random Swap: {swap_augmented}")
print(f"Back Translation: {back_translation_result}")
# paraphrase_thai yields several candidates, so print each on its own line
for paraphrased_text in paraphrased_texts:
  print(f"Paraphrased text: {paraphrased_text}")


In [None]:
import pandas as pd

# Augment every question in the input CSV and write the original rows plus
# the augmented rows to a new CSV file.
# The input file must provide the columns 'question' and 'class'.
input_csv_path = 'qalist.csv'
output_csv_path = 'augmented_output_file.csv'

# Progress counters used only for the log lines printed below.
Question_Counter = 0
augment_counter = 0

# Read the CSV file
df = pd.read_csv(input_csv_path)

# Augmented rows are collected as plain records and turned into a DataFrame
# once at the end, instead of concatenating DataFrames inside the loop.
augmented_records = []

for thai_question, index_class in zip(df['question'], df['class']):
    Question_Counter += 1
    print(f"Quesiton:{Question_Counter}")

    # Two samples from each random character-level augmenter, followed by
    # every paraphrase returned for the question.
    aug_questions = [
        random_insert(thai_question),
        random_insert(thai_question),
        random_deletion(thai_question),
        random_deletion(thai_question),
        random_swap(thai_question),
        random_swap(thai_question),
    ]
    aug_questions.extend(paraphrase_thai(thai_question))

    # Exactly one output row per augmented question, labelled with the class
    # of the question it was derived from. (Previously the whole list was
    # appended once per element, so each augmentation was written many times.)
    for aug_question in aug_questions:
        augment_counter += 1
        print(f"Augment:{augment_counter}")
        augmented_records.append({'question': aug_question, 'class': index_class})

augmented_df = pd.DataFrame(augmented_records, columns=['question', 'class'])

# Concatenate the original DataFrame with the augmented DataFrame
result_df = pd.concat([df, augmented_df], ignore_index=True)

# Save the result DataFrame to a new CSV file
result_df.to_csv(output_csv_path, index=False)