In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_scheduler
from torch.optim import AdamW
import pandas as pdf
from datasets import Dataset
import pandas as pd
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import nltk
from tqdm.auto import tqdm
import torch.nn.functional as F

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/mamdouh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def load_data(en_file, tr_file):
    en_file = "data_directories/final_data/" + en_file
    tr_file = "data_directories/final_data/" + tr_file
    # en_file = "archive/" + en_file
    # tr_file = "archive/" + tr_file
    with open(en_file, 'r', encoding='utf-8') as file:
        en_texts = file.read().strip().split('\n')
    with open(tr_file, 'r', encoding='utf-8') as file:
        tr_texts = file.read().strip().split('\n')
    return pd.DataFrame({'en': en_texts, 'tr': tr_texts})

# train_data = load_data('new_combined_data_english.txt', "new_combined_data_akkadian.txt")
train_data = load_data("english_train.txt", "akkadian_train.txt")


# Train paraphraser

In [1]:
from datasets import load_dataset
import pandas as pd

# Download PAWS English
df_orig = pd.read_csv("/mnt/c/Users/user/Downloads/parabank-2.0/parabank2.tsv",sep='\t', header=None, names=['score','input', 'target']
                ,usecols=[0,1,2])




In [4]:
df = df_orig.copy()

In [5]:
df = df[df['score'] >= 0.45].dropna()[['input', 'target']]


In [5]:
from transformers import MarianTokenizer, MarianMTModel

model_checkpoint = "Helsinki-NLP/opus-mt-ROMANCE-en"
tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
model = MarianMTModel.from_pretrained(model_checkpoint)


In [6]:
from datasets import Dataset

def tokenize_paraphrase_batch(batch, tokenizer=tokenizer, max_length=200):
    model_inputs = tokenizer(batch['input'], max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(batch['target'], max_length=max_length, truncation=True, padding="max_length")['input_ids']
    model_inputs['labels'] = labels
    return model_inputs

# Use debug_df or full_df as needed
hf_dataset = Dataset.from_pandas(df)  # or Dataset.from_pandas(full_df)

tokenized_dataset = hf_dataset.map(lambda batch: tokenize_paraphrase_batch(batch, tokenizer), batched=True)


Map:   0%|          | 0/625408 [00:00<?, ? examples/s]

In [7]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

In [8]:
def save_model(model, tokenizer, optimizer, dataset_id):
    import os
    import json
    import torch

    save_path = "models/" + dataset_id
    os.makedirs(save_path, exist_ok=True)

    # Save model and tokenizer
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Save optimizer state only
    torch.save(optimizer.state_dict(), os.path.join(save_path, "optimizer.pt"))


In [11]:
from transformers import Seq2SeqTrainingArguments
from transformers import TrainerCallback


training_args = Seq2SeqTrainingArguments(
    output_dir="./paraphraser",
    eval_strategy="steps",
    eval_steps=10000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,  # Use 2 for debug, increase as needed
    predict_with_generate=True,
    fp16=True,  # Set to True if using a GPU that supports it
    logging_steps=10000,
    report_to='none'
)

class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, tokenizer, n_epochs=2):
        self.n_epochs = n_epochs
        self.tok = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        model, optim = kwargs["model"], kwargs["optimizer"]
        if int(state.epoch) % self.n_epochs == 0:
            save_model(model, self.tok, optim, "paraphraser")
        return control

In [12]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[SaveEveryNEpochsCallback(tokenizer, n_epochs=2)]
)


  trainer = Seq2SeqTrainer(


In [13]:
trainer.train()

Step,Training Loss,Validation Loss
10000,0.1696,0.125112
20000,0.1287,0.110452
30000,0.1172,0.103811
40000,0.1096,0.099544
50000,0.1032,0.097077
60000,0.101,0.094837
70000,0.0995,0.092848
80000,0.0948,0.092061
90000,0.093,0.090958
100000,0.0922,0.08999




TrainOutput(global_step=185670, training_loss=0.0995174711828067, metrics={'train_runtime': 18715.6379, 'train_samples_per_second': 158.727, 'train_steps_per_second': 9.921, 'total_flos': 1.57345881587712e+17, 'train_loss': 0.0995174711828067, 'epoch': 5.0})

In [26]:
paraphrase_model = trainer.model

In [14]:
import torch
def paraphrase_sentences(sentences, model, tokenizer, max_length=200):
    model.eval()
    device = ("cuda" if torch.cuda.is_available() else "cpu")
    enc = tokenizer(sentences, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model.generate(**enc, num_beams=5, max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(paraphrase_sentences(['The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .'
], model, tokenizer))


["1975's NBA season -- 76 was the 30th season of the National Basketball Association ."]


In [15]:
for s in [
    "The quick brown fox jumps over the lazy dog.",
    "He went to the market.",
    "This is an interesting book.",
    "I am very happy today.",
    "The weather is great!"
]:
    print(paraphrase_sentences([s], model, tokenizer))


['Fast brown fox jumps over a lazy dog.']
['He went on the market.']
["That's an intriguing book."]
["Today I'm very happy."]
["Weather's great!"]


# Augment data

In [15]:
p_paraphrase       = 0.5      # 50 % chance we call the paraphraser for a row
k_paraphrases      = 2        # up to 2 alternatives per sentence
DEVICE = 'cuda'
PARA_MODEL = 'models/paraphraser'
# ──────────────────────────────────────────────────────────────────────────────
#  1 · Load the paraphraser
# ──────────────────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch, pandas as pd, numpy as np
from tqdm.auto import tqdm

tok_para = AutoTokenizer.from_pretrained(PARA_MODEL)
mdl_para = AutoModelForSeq2SeqLM.from_pretrained(PARA_MODEL).to(DEVICE).eval()

def paraphrase(text, num_return=k_paraphrases):
    """Return ≤k unique paraphrases for *text* (may be fewer if duplicates)."""
    enc    = tok_para(text, return_tensors="pt", truncation=True,
                      max_length=200).to(DEVICE)
    with torch.no_grad():
        outs = mdl_para.generate(
            **enc,
            num_beams=10,
            num_return_sequences=num_return,
            temperature=1.0,
            diversity_penalty=0.0,
            no_repeat_ngram_size=3,
            max_length=200,
        )
    paras = {tok_para.decode(o, skip_special_tokens=True) for o in outs}
    paras.discard(text)                    # don’t keep identical copy
    return list(paras)

# ──────────────────────────────────────────────────────────────────────────────
#  2 · Augment the training DataFrame
# ──────────────────────────────────────────────────────────────────────────────
aug_rows = []           # list of dicts → pd.DataFrame later

rng = np.random.default_rng(42)

for _, row in tqdm(train_data.iterrows(), total=len(train_data),
                   desc="Paraphrasing"):
    tr_sent = row["tr"]
    en_sent = row["en"]

    # always include the original pair
    aug_rows.append({"tr": tr_sent, "en": en_sent})

    # flip a biased coin
    if rng.random() < p_paraphrase:
        paras = paraphrase(en_sent, num_return=k_paraphrases)
        for p in paras:
            aug_rows.append({"tr": tr_sent, "en": p})

# build the new DataFrame
train_aug = pd.DataFrame(aug_rows).reset_index(drop=True)
print("New train size:", len(train_aug))

Paraphrasing:   0%|          | 0/38040 [00:00<?, ?it/s]

New train size: 74449


In [19]:
train_aug['en'].to_csv("data_directories/final_data/new_combined_paraphrased_english.txt", index=False)
train_aug['tr'].to_csv("data_directories/final_data/new_combined_paraphrased_akkadian.txt", index=False)