In [None]:
import os
# Project root that every relative path in this notebook ('./data/...') is
# resolved against. Set the MINOR_PROJECT_ROOT environment variable to run the
# notebook from another checkout; otherwise the original local path is used.
ROOT = os.environ.get('MINOR_PROJECT_ROOT', '/home/mav204/Documents/minor-project')
os.chdir(ROOT)  # raises FileNotFoundError if ROOT does not exist
print(ROOT)

In [None]:
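# Google Colab only: uncomment the lines below to mount Drive and switch to the
# project folder instead of using the local ROOT cell above.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd drive/MyDrive/minor-project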

In [None]:
from math import ceil
from misc.dataset_modifier import separate, save_json, get_json
from augment.back_translation import Translator
from augment.synonym_replacement import SynonymAugmenter

In [None]:
# Shared augmentation helpers, created once and reused by every cell below.
# TL performs the translations used for back-translation.
TL = Translator()
# SYN produces synonym-replaced variants and also supplies the similarity and
# deviation scoring used later. max_attempts=60 is presumably the cap on its
# internal retries -- TODO confirm in augment/synonym_replacement.
SYN = SynonymAugmenter(max_attempts=60)

In [None]:
train_data = get_json('./data/curated/train.json')
questions = get_json('./data/metadata/acceptable.json')

# Target minimum number of answers per question for each feedback class.
correct_baseline = 50
partially_correct_baseline = 50
incorrect_baseline = 25


def _augments_per_record(baseline, existing):
    """Return how many augmented copies to generate per existing answer.

    The value is ceil((baseline - existing) / existing), floored at 0, so that
    `existing` answers each augmented that many times reach at least
    `baseline` in total.

    When `existing` is 0 there is no answer to augment, so 0 is returned
    instead of dividing by zero.
    """
    if existing <= 0:
        return 0
    return ceil(max(0, baseline - existing) / existing)


# lookup_table[question_id][feedback_class] -> augmentations per record
lookup_table = {}

for q in questions:
    qc = questions[q]['count']  # per-class answer counts for this question

    lookup_table[q] = {
        'correct': _augments_per_record(correct_baseline, qc['correct']),
        'partially correct': _augments_per_record(
            partially_correct_baseline, qc['partially correct']
        ),
        'incorrect': _augments_per_record(incorrect_baseline, qc['incorrect']),
    }



In [None]:
# Similarity bounds. sim_ub is the threshold above which back_translate()
# treats a paraphrase as too close to its source; sim_lb is only defined here.
sim_ub, sim_lb = .85, .7

# Run state shared by the cells below.
qid = ''
augmentations = []
no_of_augmentations = total_generation = 0

In [None]:
def back_translate(text):
    """Paraphrase `text` by back-translation and return the best candidate.

    Up to three pivot paths are tried, each one only if the previous result is
    still more similar to the input than the module-level `sim_ub`:

    1. EN -> ES -> EN (always)
    2. EN -> FR -> EN
    3. EN -> ES -> FR -> EN

    Every candidate produced is scored with `SYN.deviation_score` on its
    semantic similarity to `text`; the candidate with the lowest score wins,
    and ties go to the earliest path.

    Args:
        text: English source text to paraphrase.

    Returns:
        A `(best_text, best_sim)` tuple: the chosen paraphrase and its
        semantic similarity to `text`.
    """
    # --- P1: EN -> ES -> EN ---
    es = TL.translate(text, 'en', 'es')
    bt_es = TL.translate(es, 'es', 'en')
    sim_es = SYN.semantic_sim(text, bt_es)

    # (tag, back-translated text, similarity to the input)
    candidates = [("es", bt_es, sim_es)]

    # --- Only try FR if the ES version is too close to the input ---
    if sim_es > sim_ub:
        # P2: EN -> FR -> EN
        fr = TL.translate(text, 'en', 'fr')
        bt_fr = TL.translate(fr, 'fr', 'en')
        sim_fr = SYN.semantic_sim(text, bt_fr)
        candidates.append(("fr", bt_fr, sim_fr))

        # If FR is also too close, try the triple pivot
        if sim_fr > sim_ub:
            # P3: EN -> ES -> FR -> EN. Reuses the Spanish text from P1
            # rather than translating EN -> ES a second time.
            es_fr = TL.translate(es, 'es', 'fr')
            bt_es_fr = TL.translate(es_fr, 'fr', 'en')
            sim_es_fr = SYN.semantic_sim(text, bt_es_fr)
            candidates.append(("es_fr", bt_es_fr, sim_es_fr))

    # --- Score each candidate; the lowest deviation score is the best ---
    scored = [
        (SYN.deviation_score(sim), tag, text_out, sim)
        for tag, text_out, sim in candidates
    ]
    _, best_tag, best_text, best_sim = min(scored, key=lambda item: item[0])

    print(f"[BT] Best path: {best_tag} | sim={best_sim:.4f}")

    return best_text, best_sim

In [None]:
# Rebuild the output from scratch on every run. `count` was already reset
# here; resetting `augmentations` as well stops a re-run of this cell from
# appending a second copy of every record.
augmentations = []
count = 0  # running total of augmentations generated so far

for record in train_data:
    record_id = record['id']
    feedback = record['verification_feedback']
    answer = record['provided_answer']

    # The last four characters of the record id are the question key into
    # lookup_table, which gives augmentations per record for that class.
    qid = record_id[-4:]
    num_augments = lookup_table[qid][feedback]

    for aug_idx in range(num_augments):
        # Back-translation (best of ES / FR / ES->FR)
        variant_bt, sim_bt = back_translate(answer)

        # Synonyms (single attempt, but SYN already does retries internally)
        variant_syn = SYN.augment(answer)
        sim_syn = SYN.semantic_sim(answer, variant_syn)

        print(f"Sim | BT: {sim_bt:.4f} | SYN: {sim_syn:.4f}")

        # Score variants
        score_bt = SYN.deviation_score(sim_bt)
        score_syn = SYN.deviation_score(sim_syn)

        print(f"Score | BT: {score_bt:.4f} | SYN: {score_syn:.4f}")

        # Pick best: lower score wins, and synonym replacement wins ties
        best_variant = variant_bt if score_bt < score_syn else variant_syn

        # Add to dataset: a copy of the source record with the answer text
        # replaced and an 'augNN' prefix on the id.
        augmentations.append({
            'id': f'aug{aug_idx:02}{record_id}',
            'question': record['question'],
            'reference_answer': record['reference_answer'],
            'provided_answer': best_variant,
            'answer_feedback': record['answer_feedback'],
            'verification_feedback': feedback,
            'max_score': record['max_score'],
            'normalized_score': record['normalized_score']
        })
        count += 1

    # logging
    print(
        f"Number of augmentations: {count}\n"
        f"Question: {record_id}\n"
        f"Class: {feedback}\n"
    )

    print(f"Augmentation length: {len(augmentations)}")



In [None]:
# Write every generated augmentation record to the augmented training set.
# ask=False presumably skips a confirmation prompt -- TODO confirm in
# misc/dataset_modifier.
save_json(augmentations, './data/augmented/train.json', ask=False)