In [None]:
!pip install transformers datasets evaluate sacrebleu nltk --quiet

import pandas as pd
from datasets import Dataset
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below, then build the Turkish stop-word set.
# NOTE(review): `stop_words` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)
stop_words = set(stopwords.words("turkish"))

In [None]:
from google.colab import files
# Open the Colab file-upload widget; the result of files.upload() is kept in `uploaded`.
# The next cell reads "cleaned_paraphrasing_dataset.csv", so that file is presumably
# the one expected here — confirm.
uploaded = files.upload()

In [None]:
import pandas as pd
from datasets import Dataset

# Load the paraphrase pairs. read_csv uses a comma separator by default; if the
# file was exported with semicolons (common for Excel), pass sep=";" here.
df = pd.read_csv("cleaned_paraphrasing_dataset.csv")

# Rename columns to standard names
df = df.rename(columns={"Asil": "input", "Parafraz": "target"})

# Fail early with an actionable message instead of an opaque KeyError below:
# a wrong separator typically collapses the file into a single column.
missing_columns = {"input", "target"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Missing expected column(s) {sorted(missing_columns)}; "
        f"found {list(df.columns)}. Check the column names and the CSV separator."
    )

# Strip extra quotes if needed
for column in ("input", "target"):
    df[column] = df[column].str.strip('"')

In [None]:
# Convert to Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Train/test split — seeded so that re-running the notebook yields the same split
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
# Summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Show the first rows of the DataFrame
df.head()

In [None]:
# Show the last rows of the DataFrame
df.tail()

In [None]:
# Print the (rows, columns) shape; the label is Turkish for "Size of the dataset:"
print("Veri setinin boyutu:", df.shape)

In [None]:
# Drop rows where input or target is missing or blank, in a single pass over the data
def _has_text(example):
    """Return True when both `input` and `target` are present and not blank."""
    return all(
        example[column] is not None and example[column].strip() != ""
        for column in ("input", "target")
    )


dataset = dataset.filter(_has_text)


In [None]:
from datasets import Dataset

# Keep only complete pairs and only the two text columns
df = df.dropna()
df = df[['input', 'target']]

# preserve_index=False: once dropna has removed rows the index has gaps, and
# from_pandas would otherwise store it as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Seeded so that re-running the notebook yields the same split
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
# Copy of df with rows containing missing values removed.
# NOTE(review): `df_clean` is not referenced by any other cell visible in this
# notebook (later cells use `df`) — confirm whether this cell is still needed.
df_clean = df.dropna()

In [None]:
from datasets import Dataset

# Build the dataset from the cleaned DataFrame.
# preserve_index=False: a gapped index (e.g. after dropna) would otherwise be
# stored as an extra `__index_level_0__` column and carried through tokenization.
dataset = Dataset.from_pandas(df[["input", "target"]], preserve_index=False)

# Train/validation split (80/20, fixed seed for reproducibility)
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Base checkpoint shared by the tokenizer and the model
MT5_CHECKPOINT = "google/mt5-small"

tokenizer = MT5Tokenizer.from_pretrained(MT5_CHECKPOINT)
model = MT5ForConditionalGeneration.from_pretrained(MT5_CHECKPOINT)

# Doğrudan raw veriyi kullanıyoruz
def tokenize_function(examples):
    """Tokenize a batch of paraphrase pairs for seq2seq training.

    Args:
        examples: batch mapping with an "input" list (source sentences) and a
            "target" list (reference paraphrases) of equal length.

    Returns:
        The tokenizer output for the prompted inputs, with an added "labels"
        entry holding the target token ids (padding replaced by -100).

    Uses the module-level `tokenizer`.
    """
    # The task prefix must be the same string at training and inference time.
    prompts = ["parafraze et: " + text for text in examples["input"]]
    model_inputs = tokenizer(prompts, padding="max_length", truncation=True, max_length=128)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Replace padding token ids in the labels with -100 so padded positions
    # do not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# ✅ Tokenları görselleştir
def visualize_tokenization(example_input):
    """Print the prompted text, its tokens and its token ids for one sentence.

    Args:
        example_input: raw source sentence (without the task prefix).

    Uses the module-level `tokenizer`. The prefix is the same one that
    `tokenize_function` prepends, so the printout shows what the model is
    actually trained on.
    """
    text = "parafraze et: " + example_input
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    print("\n🧾 Input Text:", text)
    print("🔤 Tokens:", tokens)
    print("🔢 Token IDs:", token_ids)

# Try the visualization on the first training example
visualize_tokenization(train_dataset[0]["input"])

# Tokenize both splits in batches with tokenize_function
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

In [None]:
# Flag rows whose paraphrase is identical to the source sentence
same_text_mask = df["input"] == df["target"]

# Split the frame into the identical pairs and everything else
duplicate_rows = df[same_text_mask]
df_cleaned = df[~same_text_mask]

# NOTE(review): `df_cleaned` is computed after the datasets were built and
# tokenized from `df`, and no visible cell uses it — confirm whether these
# rows were meant to be excluded from training.

# Report how many identical pairs were found and how many rows remain
print("🔁 Aynı input ve target'a sahip satır sayısı:", len(duplicate_rows))
print("✅ Temizlenmiş veri setindeki toplam satır sayısı:", len(df_cleaned))


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, TrainingArguments, Trainer

# Load a fresh multilingual mT5 model and its tokenizer for training
BASE_CHECKPOINT = "google/mt5-small"

model = MT5ForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)
tokenizer = MT5Tokenizer.from_pretrained(BASE_CHECKPOINT)

# Optimisation hyperparameters, grouped so they are easy to tune in one place
hyperparameters = dict(
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Output locations plus evaluation / checkpointing / precision settings
training_args = TrainingArguments(
    output_dir="./mt5_paraphraser_tr",
    logging_dir="./logs",
    eval_strategy="epoch",
    save_total_limit=1,
    fp16=False,
    **hyperparameters,
)

# Wire the model, the tokenized splits and the tokenizer into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)


In [None]:
# Move the model to CPU, then run fine-tuning.
# NOTE(review): Trainer normally handles device placement itself, so this manual
# move may be undone when training starts — confirm. If CPU training is really
# intended, the TrainingArguments `use_cpu` option is presumably the right switch.
trainer.model.to("cpu")
trainer.train()

In [None]:
# 🔄 Switch model to CPU to prevent CUDA OutOfMemory errors
import torch
trainer.model.to("cpu")

# ✅ Take a small sample from the validation set (adjust the cap if needed).
# The cap is clamped to the split size so select() cannot index out of range
# when the validation split has fewer rows than the cap.
eval_sample_size = min(1000, len(tokenized_val))
small_eval_set = tokenized_val.select(range(eval_sample_size))

# ✅ Perform safe inference without Trainer.evaluate()
from tqdm import tqdm
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Generated paraphrases and their references, in the same order
predictions = []
references = []

model = trainer.model
tokenizer = trainer.tokenizer
model.eval()

for sample in tqdm(small_eval_set):
    # Use exactly the task prefix the model was trained with in tokenize_function;
    # a different prefix would evaluate the model on a prompt it never saw.
    input_text = "parafraze et: " + sample["input"]

    # Same truncation length as in training; keep the tensors on the model's device
    encoded = tokenizer(
        input_text, return_tensors="pt", truncation=True, max_length=128
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=128,
            num_beams=5,
        )

    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    predictions.append(pred.strip())
    references.append(sample["target"].strip())


In [None]:
!pip install rouge_score
import evaluate
import torch, gc

# ✅ Compute BLEU and ROUGE scores
import evaluate
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

# sacrebleu expects a list of reference lists (one list per prediction)
bleu_score = bleu.compute(predictions=predictions, references=[[r] for r in references])

# The default ROUGE tokenizer keeps only [a-z0-9], which splits Turkish words at
# ç/ğ/ı/ö/ş/ü and distorts the score. Use lowercase whitespace tokenization instead.
rouge_score = rouge.compute(
    predictions=predictions,
    references=references,
    tokenizer=lambda text: text.lower().split(),
)

print("✅ Evaluation Complete")
print("BLEU Score:", bleu_score["score"])
print("ROUGE-L Score:", rouge_score["rougeL"])


In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub first (only once per session).
# Paste the token when prompted rather than hardcoding it in the notebook.
login()  # paste token from huggingface.co/settings/tokens


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import os

from transformers import MT5ForConditionalGeneration

# Fine-tuned checkpoint written by the Trainer; the step number depends on the run.
model_path = "/content/mt5_paraphraser_tr/checkpoint-2649"

# Fail with a clear message if the checkpoint is missing, instead of letting
# from_pretrained try to resolve the path as a Hub repository id.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"Checkpoint directory not found: {model_path}. "
        "Run training first or update model_path to an existing checkpoint."
    )

tokenizer = T5Tokenizer.from_pretrained(model_path)
# Load with the same model class that was fine-tuned (mT5), so the checkpoint's
# config type matches the class instantiating it.
model = MT5ForConditionalGeneration.from_pretrained(model_path)

def paraphrase(text, num_return_sequences=1):
    """Generate sampled paraphrases of a sentence.

    Args:
        text: source sentence (without the task prefix).
        num_return_sequences: how many paraphrases to sample.

    Returns:
        A list of decoded strings, one per sampled sequence.

    Uses the module-level `tokenizer` and `model`. Decoding is stochastic
    (top-k / nucleus sampling), so repeated calls may return different text.
    """
    input_text = f"parafraze et: {text}"

    # A single sentence needs no padding; the attention mask is passed
    # explicitly instead of relying on generate() to infer it from pad tokens.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=128)

    outputs = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        max_length=128,
        num_return_sequences=num_return_sequences,
    )
    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]




In [None]:
# Test: paraphrase a sample sentence ("The film was really very nice.")
print(paraphrase("Film gerçekten çok güzeldi."))

In [None]:
# Sample sentence: "I had a nice day today."
print(paraphrase("bugün güzel bir gün geçirdim."))

In [None]:
# Sample sentence: "I love travelling by plane."
print(paraphrase("Uçakla yolculuk yapmayı seviyorum."))

In [None]:
# Sample sentence: "Reading books increases one's knowledge."
print(paraphrase("kitap okumak insanın bilgisini arttırır."))

In [None]:
import os

# Print True/False depending on whether the checkpoint path exists on disk.
# This is the same path the inference cell passes to from_pretrained.
print(os.path.exists("/content/mt5_paraphraser_tr/checkpoint-2649"))
