In [None]:
!pip install nlpaug nltk
!pip install -q -U bitsandbytes

# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import torch
import gc
import os
import numpy as np
import random
import nlpaug.augmenter.word as naw
import nltk
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive (requires a Google Colab runtime)
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then stores the seed as a string in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Data
# DATA_DIR: folder the CSV datasets are read from.
# MODELS_DIR: folder the fine-tuned models (and the evaluation report) are written to.
# NOTE(review): MODELS_DIR is assigned a different value again in the
# "Data Augmentation" section, so its value depends on which cells have run.
DATA_DIR = "../Dataset"
MODELS_DIR = "../Modelli_BERT-with_data_aug/train-augmented-bt-llm"

# Model
# Pretrained checkpoint name and the fine-tuning hyperparameters
# (BATCH_SIZE is used as the per-device training batch size).
MODEL = 'bert-large-uncased'
EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Reproducibility
# Seed all random number generators as soon as the seed is defined.
SEED = 42
set_seed(SEED)

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Device selected: {device}")

In [None]:
# Load dataset
# Training uses an augmented CSV, evaluation uses the untouched validation split.
train_csv_path = f"{DATA_DIR}/Augmented/train_augmented_BT-LLM.csv"  # or train_augmented_Swap.csv
valid_csv_path = f"{DATA_DIR}/valid.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(valid_csv_path)

In [None]:
# Every column listed here is needed downstream: 'text' and 'label' for
# training/evaluation, 'variety'/'source'/'task' for grouping the runs.
required_columns = ['text', 'label', 'variety', 'source', 'task']

# pandas parses an integer column that contains missing values as float.
# Once the incomplete rows are dropped, cast the labels back to integers so
# that they are treated as class ids by the classifier and by the metrics.
df_train = df_train.dropna(subset=required_columns).astype({'label': 'int64'})
df_test = df_test.dropna(subset=required_columns).astype({'label': 'int64'})

## Fine-tuning the models

In [None]:
# Tokenizer of the pretrained checkpoint; tokenize_function looks it up as a global.
tokenizer = BertTokenizer.from_pretrained(MODEL)

# No model is instantiated here: the training loop builds a fresh
# BertForSequenceClassification (with the proper num_labels) for every run and
# the evaluation loop reloads each saved model, so a model loaded at this point
# would never be used and would only cost download time and memory.

In [None]:
# Accumulator for the rows (dicts) of the performance report.
# NOTE(review): the same name is initialised again right before the evaluation
# loop, so this first initialisation looks redundant.
report_data = []

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the global ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 128.

    Args:
        examples: Mapping that holds the texts under the ``"text"`` key
            (this function is passed to ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [None]:
# TRAINING CYCLE
# One independent classifier is fine-tuned for every (variety, source, task)
# combination found in the training set.
grouped_train = df_train.groupby(['variety', 'source', 'task'])

# The number of classes is the same for every run: compute it once instead of
# once per iteration.
num_labels = df_train['label'].nunique()

print(f"Start training on {len(grouped_train)} combinations...")

for (variety, source, task), df_group in grouped_train:
    run_id = f"{variety}_{source}_{task}".replace(" ", "_")

    # Combinations whose id starts with "en-AU" are not trained.
    if run_id.startswith("en-AU"):
        continue

    save_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTraining combination: {run_id} (Samples: {len(df_group)})")

    # Dataset setup
    train_ds = Dataset.from_pandas(df_group.reset_index(drop=True))
    tokenized_train = train_ds.map(tokenize_function, batched=True)

    # Model setup: every run starts again from the pretrained checkpoint
    model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

    # Setup trainer (no intermediate checkpoints, no evaluation, no external logging)
    training_args = TrainingArguments(
        output_dir=f"./checkpoints_temp/{run_id}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        save_strategy="no",
        eval_strategy="no",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train
    )

    trainer.train()

    # Final save: model and tokenizer go to the same folder so the run can be
    # reloaded from that path alone
    print(f"Salvataggio in: {save_path}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Memory cleanup: drop the references, collect garbage first so that objects
    # kept alive by reference cycles are really destroyed, and only then ask
    # PyTorch to release its cached CUDA memory.
    del model, trainer, tokenized_train, train_ds
    gc.collect()
    torch.cuda.empty_cache()

print("\nTraining completed")

### Test the models

In [None]:
# Rows of the evaluation report: one dict per (variety, source, task) combination
report_data = []

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from a ``(logits, labels)`` pair.

    The predicted class is the arg-max of the logits along the last axis.
    Precision, recall and F1 are binary scores with class ``1`` as the
    positive label.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    prf_scores = precision_recall_fscore_support(
        labels, predicted_classes, average="binary", pos_label=1
    )
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }

In [None]:
# EVALUATION CYCLE
# Every saved model is scored on the validation rows of its own
# (variety, source, task) combination.
grouped_val = df_test.groupby(['variety', 'source', 'task'])

# Start from an empty report so that re-running this cell does not append
# duplicate rows to the results of a previous execution.
report_data = []

print(f"Start validation of {len(grouped_val)} combinations...")

for (variety, source, task), df_group in grouped_val:
    run_id = f"{variety}_{source}_{task}".replace(" ", "_")
    model_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTesting combination: {run_id} (Samples: {len(df_group)})")

    # Loading the right model: combinations without a saved model are recorded
    # in the report and skipped
    if not os.path.exists(model_path):
        print(f"Model not found {model_path}")
        report_data.append({
            "variety": variety, "source": source, "task": task,
            "status": "Model Missing"
        })
        continue

    val_ds = Dataset.from_pandas(df_group.reset_index(drop=True))

    # The global tokenizer is rebound on purpose: tokenize_function reads it
    tokenizer = BertTokenizer.from_pretrained(model_path)
    tokenized_val = val_ds.map(tokenize_function, batched=True)

    model = BertForSequenceClassification.from_pretrained(model_path)

    # The Trainer is only used for prediction here, so logging and reporting are off
    args = TrainingArguments(
        output_dir="tmp",
        report_to="none",
        logging_strategy="no"
    )

    trainer = Trainer(
        model=model,
        args=args,
        compute_metrics=compute_metrics
    )

    results = trainer.predict(tokenized_val)
    metrics = results.metrics

    print(f"Accuracy: {metrics['test_accuracy']:.4%} | F1: {metrics['test_f1']:.4f}")

    # Saving results report
    report_data.append({
        "variety": variety,
        "source": source,
        "task": task,
        "status": "Success",
        "accuracy": metrics['test_accuracy'],
        "f1": metrics['test_f1'],
        "precision": metrics['test_precision'],
        "recall": metrics['test_recall'],
        "num_samples": len(df_group)
    })

    # Cleaning: collect garbage before emptying the CUDA cache so that memory
    # held by no-longer-referenced objects can actually be released
    del model, trainer, tokenized_val
    gc.collect()
    torch.cuda.empty_cache()

# Export final report (make sure the target folder exists, e.g. when no model
# was found and nothing has been written there yet)
os.makedirs(MODELS_DIR, exist_ok=True)
report_path = os.path.join(MODELS_DIR, "report_performance_SWAP.csv")
df_report = pd.DataFrame(report_data)
df_report.to_csv(report_path, index=False)
print("\nTesting completed")

## Data Augmentation

In [None]:
# CSV that the augmentation cells read the original training rows from.
input_file = ('../Dataset/train.csv')
# Folder the augmented CSV files are written to.
# NOTE(review): this rebinds MODELS_DIR, which the configuration cell set to the
# "train-augmented-bt-llm" folder; any cell executed after this one (including a
# re-run of training or evaluation) uses the "train-augmented-swap" folder
# instead. Confirm this is intended.
MODELS_DIR = "../Modelli_BERT-with_data_aug/train-augmented-swap"

### Synonym Replacement

In [None]:
# NLTK resources fetched (quietly) before running the augmenters
nltk_resources = ('wordnet', 'omw-1.4', 'averaged_perceptron_tagger_eng')
for resource_name in nltk_resources:
    nltk.download(resource_name, quiet=True)

#### Using BERT (contextual word substitution)

In [None]:
# Augmenter that substitutes words using bert-base-uncased contextual embeddings
aug_synonym = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device=device
)

df = pd.read_csv(input_file)
df = df.dropna(subset=['text'])

new_rows = []


print(f"Starting augmentation on {len(df)} rows...")

for index, row in tqdm(df.iterrows(), total=len(df)):
    original_text = row['text']

    # Skip values that are not strings or contain no words
    if not isinstance(original_text, str) or len(original_text.split()) < 1:
        continue

    try:
        augmented = aug_synonym.augment(original_text)
        # augment() may return a list of texts or a single string depending on
        # the nlpaug release; indexing a plain string with [0] would keep only
        # its first character.
        text_syn = augmented if isinstance(augmented, str) else augmented[0]
        row_syn = row.copy()
        row_syn['text'] = text_syn
        new_rows.append(row_syn)
    except Exception as e:
        # Best effort: a row that cannot be augmented is reported and skipped
        print(f"Error: {e}")

# Original rows followed by the augmented ones, then shuffled reproducibly
df_new = pd.DataFrame(new_rows)
df_final = pd.concat([df, df_new]).reset_index(drop=True)

df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nResults:")
print(f"Original rows: {len(df)}")
print(f"Generated rows: {len(df_new)}")
print(f"Total rows: {len(df_final)}")

# Create the output folder first so the result of the long augmentation run is
# not lost on a missing directory
os.makedirs(MODELS_DIR, exist_ok=True)
report_path = os.path.join(MODELS_DIR, "train_augmented_Swap.csv")
df_final.to_csv(report_path, index=False)
print(f"File saved in: {report_path}")

### Back Translation


#### With an LLM (English dialect -> Polish -> English dialect)

In [None]:
# Report the PyTorch build and whether a CUDA device is visible
runtime_info = (
    ("Torch version", torch.__version__),
    ("CUDA available", torch.cuda.is_available()),
)
for info_label, info_value in runtime_info:
    print(f"{info_label}: {info_value}")

In [None]:
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

In [None]:
# Report CUDA visibility and the installed bitsandbytes version
cuda_loaded = torch.cuda.is_available()
bnb_version = bnb.__version__
print(f"CUDA Loaded: {cuda_loaded}")
print(f"Bnb Version: {bnb_version}")

In [None]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Rows without a text or a variety are dropped before sampling
df_full = pd.read_csv(input_file).dropna(subset=['text', 'variety'])

# Shuffle reproducibly and keep the first 2000 rows as the augmentation pool
df_shuffled = df_full.sample(frac=1, random_state=42).reset_index(drop=True)
df_tot = df_shuffled.iloc[:2000]
df = df_tot.iloc[:1000]  # augmenting first 1000 rows

In [None]:
# The LLM objects get their own names: rebinding the globals `tokenizer` and
# `model` would replace the BERT tokenizer that tokenize_function reads, so any
# BERT tokenization executed after this cell would silently use the wrong one.
llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the end-of-sequence token as padding token
llm_tokenizer.pad_token_id = llm_tokenizer.eos_token_id

# 4-bit NF4 quantization with double quantization; computation in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    offload_folder="offload",
)

# Text-generation pipeline used by the back-translation cells; the sampling
# options set here are defaults that individual calls override
pipe = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    max_new_tokens=128,
    max_length=None,
    do_sample=True,
    dtype=torch.float16
)

In [None]:
def get_translation_prompt(source_text, direction, variety=None):
    """Build the instruction prompt for one leg of the back-translation.

    Args:
        source_text: Text to be translated.
        direction: ``"en_to_pl"`` (English -> Polish) or
            ``"pl_to_en_dialect"`` (Polish -> English variety).
        variety: Target English variety, only used for
            ``"pl_to_en_dialect"``. Both plain names (``"Australian"``,
            ``"Indian"``, ``"UK"``, ``"US"``) and locale-style codes
            (``"en-AU"``, ``"en-IN"``, ``"en-UK"``/``"en-GB"``, ``"en-US"``)
            are accepted, case-insensitively. Anything else, including
            ``None``, falls back to "Standard English".

    Returns:
        The prompt string wrapped in ``[INST] ... [/INST]`` markers.

    Raises:
        ValueError: If ``direction`` is not one of the two supported values.
    """
    # NOTE(review): the prompts hard-code the "<s>" marker; if the tokenizer
    # also prepends its own beginning-of-sequence token the input would start
    # with two of them -- confirm against the pipeline's tokenization.
    if direction == "en_to_pl":
        # English Dialect -> Polish
        return f"""<s>[INST] Translate the following English text into Polish.
                    Output ONLY the Polish translation, no explanations.

                    English: "{source_text}"
                    Polish: [/INST]"""

    if direction == "pl_to_en_dialect":
        # Polish -> English Dialect
        australian = "Australian English, using typical slang (e.g., mate, ute, arvo)"
        indian = "Indian English, using local nuances"
        british = "British English"
        american = "American English"
        # The run ids elsewhere in this notebook show variety values such as
        # "en-AU", so locale codes are mapped alongside the plain names;
        # otherwise every lookup would fall back to "Standard English".
        dialect_desc = {
            "australian": australian, "en-au": australian,
            "indian": indian, "en-in": indian,
            "uk": british, "en-uk": british, "en-gb": british,
            "us": american, "en-us": american,
        }
        variety_key = str(variety).strip().lower()
        style = dialect_desc.get(variety_key, "Standard English")

        return f"""<s>[INST] You are a professional translator.
                    Translate the following Polish text back into **{style}**.
                    Do not explain. Output ONLY the English text.

                    Polish: "{source_text}"
                    English: [/INST]"""

    # Fail loudly instead of returning None and crashing later inside the pipeline
    raise ValueError(f"Unknown translation direction: {direction!r}")

In [None]:
# Back-translate every row currently in `df`; the generated rows are collected
# in `new_rows`
new_rows = []

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    original_text = row['text']
    variety = row['variety']

    try:
        # Step 1: English -> Polish, without sampling
        prompt_1 = get_translation_prompt(original_text, "en_to_pl")

        out_1 = pipe(prompt_1, do_sample=False, return_full_text=False)
        text_polish = out_1[0]['generated_text'].strip().replace('"', '')

        if not text_polish:
            continue

        # Step 2: Polish -> English variety of the row, with sampling
        prompt_2 = get_translation_prompt(text_polish, "pl_to_en_dialect", variety)

        out_2 = pipe(prompt_2, do_sample=True, temperature=0.7, top_p=0.9, return_full_text=False)
        final_text = out_2[0]['generated_text'].strip().replace('"', '')

        # An empty back-translation would end up as a blank sample in the
        # augmented set, so it is skipped like an empty Polish translation
        if not final_text:
            continue

        # Saving: copy of the source row with the text replaced
        new_row = row.copy()
        new_row['text'] = final_text

        new_rows.append(new_row)

    except Exception as e:
        # Best effort: report the failing row and move on to the next one
        print(f"Errore riga {index}: {e}")


In [None]:
# export
# The first batch of generated rows is appended to the full original set and
# the result is shuffled reproducibly
df_aug1 = pd.DataFrame(new_rows)
df_final1 = pd.concat([df_full, df_aug1])
df_final1 = df_final1.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nResults:")
# df_full (not the slice that was augmented) is what gets concatenated, so
# report its size: original + augmented then adds up to the total
print(f"Original rows: {len(df_full)}")
print(f"Augmented rows: {len(df_aug1)}")
print(f"Total rows: {len(df_final1)}")

# Create the output folder first so the result of the long generation run is
# not lost on a missing directory
os.makedirs(MODELS_DIR, exist_ok=True)
report_path = os.path.join(MODELS_DIR, "train_augmented_BT-LLM1.csv")
df_final1.to_csv(report_path, index=False)

In [None]:
# Rebind the working frame to the rows of df_tot from position 1000 onward
df = df_tot[1000:] #augmenting the other 1000 rows

In [None]:
# Back-translate every row currently in `df`; the generated rows are collected
# in a fresh `new_rows` list
new_rows = []

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    original_text = row['text']
    variety = row['variety']

    try:
        # Step 1: English -> Polish
        prompt_1 = get_translation_prompt(original_text, "en_to_pl")

        out_1 = pipe(prompt_1, do_sample=False, return_full_text=False) # Greedy decoding
        text_polish = out_1[0]['generated_text'].strip().replace('"', '')

        if not text_polish:
            continue

        # Step 2: Polish -> English variety of the row, with sampling
        prompt_2 = get_translation_prompt(text_polish, "pl_to_en_dialect", variety)

        out_2 = pipe(prompt_2, do_sample=True, temperature=0.7, top_p=0.9, return_full_text=False)
        final_text = out_2[0]['generated_text'].strip().replace('"', '')

        # An empty back-translation would end up as a blank sample in the
        # augmented set, so it is skipped like an empty Polish translation
        if not final_text:
            continue

        # Saving: copy of the source row with the text replaced
        new_row = row.copy()
        new_row['text'] = final_text

        new_rows.append(new_row)

    except Exception as e:
        # Best effort: report the failing row and move on to the next one
        print(f"Errore riga {index}: {e}")


In [None]:
# export
# The second batch of generated rows is appended to the output of the first
# export and the result is shuffled reproducibly
df_aug = pd.DataFrame(new_rows)
df_final = pd.concat([df_final1, df_aug])
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Report counts that add up: df_final holds the full original set (df_full)
# plus the generated rows of both batches
n_original = len(df_full)
n_augmented = len(df_final) - n_original

print("\nResults")
print(f"Original rows: {n_original}")
print(f"Augmented rows: {n_augmented}")
print(f"Total rows: {len(df_final)}")

# Create the output folder first so the result of the long generation run is
# not lost on a missing directory
os.makedirs(MODELS_DIR, exist_ok=True)
report_path = os.path.join(MODELS_DIR, "train_augmented_BT-LLM.csv")
df_final.to_csv(report_path, index=False)