In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install transformers datasets accelerate peft trl bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to load. A previously tried alternative was
# "unsloth/mistral-7b-instruct-v0.3-bnb-4bit".
MODEL_NAME = "unsloth/gemma-7b-it-bnb-4bit"
# Sequence length handed to the loader as `max_seq_length`.
MAX_SEQ_LEN = 2048
# Handed to the loader as `load_in_4bit`.
LOAD_IN_4BIT = True

# Collect the loader arguments in one place, then load model and tokenizer together.
_load_options = {
    "model_name": MODEL_NAME,
    "max_seq_length": MAX_SEQ_LEN,
    "dtype": None,
    "load_in_4bit": LOAD_IN_4BIT,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)

# Load dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the spreadsheet read later in this notebook
# lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Location of the news spreadsheet on the mounted Drive.
NEWS_XLSX_PATH = '/content/drive/MyDrive/datasets/News.xlsx'

dataset = pd.read_excel(NEWS_XLSX_PATH)
# Last expression of the cell: show the first rows.
dataset.head()


In [None]:
from sklearn.model_selection import train_test_split

# Drop the 'Unnamed: 0' column (presumably an index column exported with the spreadsheet).
# errors="ignore" keeps the cell re-runnable: `dataset` is rebound here, so on a second run
# the column is already gone and a plain drop would raise KeyError.
dataset = dataset.drop(columns=['Unnamed: 0'], errors="ignore")
# 80/20 split; the fixed random_state makes the split identical on every run.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Zero-shot

In [None]:
from transformers import pipeline
# Text-generation pipeline built around the model and tokenizer loaded above.
_pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
)
pipe = pipeline("text-generation", **_pipeline_options)

In [None]:
# Total token budget (prompt + generated tokens) and the share reserved for generation.
max_model_tokens = 1998
max_new_tokens = 50

def truncate_prompt(tokenizer, text, max_model_tokens=max_model_tokens, max_new_tokens=max_new_tokens):
    """Trim `text` from the front so that prompt plus generation fits the token budget.

    The prompt may use at most ``max_model_tokens - max_new_tokens`` tokens, as
    counted by ``tokenizer(text, return_tensors="pt")``. When it is longer, only
    the last tokens are kept (the end of the prompt survives, the beginning is
    cut) and decoded back to text with special tokens skipped.

    Args:
        tokenizer: Callable returning a mapping with an ``"input_ids"`` 2-D array
            and offering ``decode(ids, skip_special_tokens=...)``.
        text: Prompt text to fit.
        max_model_tokens: Total budget for prompt and generated tokens.
        max_new_tokens: Tokens reserved for generation.

    Returns:
        ``text`` unchanged when it fits, otherwise the decoded tail.

    Raises:
        ValueError: If no room is left for the prompt
            (``max_new_tokens >= max_model_tokens``).
    """
    prompt_budget = max_model_tokens - max_new_tokens
    if prompt_budget <= 0:
        # A zero budget would slice with `-0:` (keeps everything) and a negative one
        # with a positive start index, so neither would truncate correctly.
        raise ValueError(
            f"max_new_tokens ({max_new_tokens}) must be smaller than "
            f"max_model_tokens ({max_model_tokens})"
        )

    # Convert the text to tokens.
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    # If the input is too long, keep only the end of the text.
    if input_ids.shape[-1] > prompt_budget:
        tail_ids = input_ids[:, -prompt_budget:]
        text = tokenizer.decode(tail_ids[0], skip_special_tokens=True)

    return text


In [None]:
import re

def clean_output_title(text):
    """Return the text that follows the last ``title:`` marker, stripped.

    The zero-shot and few-shot prompts in this notebook end with a bare
    ``title:`` marker, so both ``title:`` and ``### title:`` are accepted
    (case-insensitively). When no marker is present the whole text is
    returned stripped.

    Args:
        text: Generated text, possibly still prefixed by the prompt.

    Returns:
        The stripped text after the last marker, or the stripped input.
    """
    # An optional "###" prefix keeps the older "### title:" form working.
    parts = re.split(r"(?:###\s*)?title:", text, flags=re.IGNORECASE)
    # With no marker there is a single part (the whole text), so [-1] is right either way.
    return parts[-1].strip()


In [None]:
# Zero-shot prompting for the i-th row of the test split.

# One instruction per prompt variant; the article and the "title:" cue are appended
# through ZERO_SHOT_LAYOUT so every variant shares exactly the same layout.
ZERO_SHOT_INSTRUCTIONS = [
    (
        "The text below is a news content. Your task is to write a short, clear, and relevant "
        "Persian title that summarizes the main idea of the article.\n"
        "Generate only one Persian title and do not include any translations or extra text."
    ),
    (
        "Write a title in Persian for the news article below.\n"
        "Do not include any English words, translations, or extra explanations."
    ),
    (
        "You are a journalist. Your task is to read the news content below and write a single, "
        "clear, and concise title **in Persian** that best summarizes the main idea.\n"
        "Do not include any English words, translations, or extra text."
    ),
]
ZERO_SHOT_LAYOUT = "{instruction} News content is:\n\n{content}\n\ntitle:"


def zero_shot(i):
    """Print the reference title and one generated title per zero-shot prompt variant.

    Args:
        i: Positional index of the row in ``test_data``.
    """
    content = test_data['content'].iloc[i]
    # f-string formatting also copes with a title that is not a str.
    print(f"real title: {test_data['title'].iloc[i]}")

    for number, instruction in enumerate(ZERO_SHOT_INSTRUCTIONS, start=1):
        prompt = ZERO_SHOT_LAYOUT.format(instruction=instruction, content=content)
        # return_full_text=False: only the continuation is returned, so the printed
        # result is the title rather than the prompt followed by the title.
        result = pipe(
            prompt,
            max_new_tokens=50,
            temperature=0.6,
            do_sample=True,
            return_full_text=False,
        )
        print(f"Result for prompt {number}: " + clean_output_title(result[0]["generated_text"]))


zero_shot(0)
zero_shot(1)


# Few-shot

In [None]:

# Two solved examples taken from the training split, shown to the model before the target article.
example_1 = f"News content:\n{train_data['content'].iloc[0]}\nTitle:\n{train_data['title'].iloc[0]}"
example_2 = f"News content:\n{train_data['content'].iloc[1]}\nTitle:\n{train_data['title'].iloc[1]}"

examples_text = f"{example_1}\n\n{example_2}\n\nNow write the title for the next news:\n\n"

# One instruction per prompt variant; examples, article and the "title:" cue are appended
# through FEW_SHOT_LAYOUT so every variant shares exactly the same layout.
FEW_SHOT_INSTRUCTIONS = [
    (
        "The text below is a news content. Your task is to write a short, clear, and relevant "
        "Persian title that summarizes the main idea of the article.\n"
        "Generate only one Persian title and do not include any translations or extra text."
    ),
    (
        "Write a title in Persian for the news article below.\n"
        "Do not include any English words, translations, or extra explanations."
    ),
    (
        "You are a journalist. Your task is to read the news content below and write a single, "
        "clear, and concise title **in Persian** that best summarizes the main idea.\n"
        "Do not include any English words, translations, or extra text."
    ),
]
FEW_SHOT_LAYOUT = (
    "{instruction} Two prepared examples are provided below. "
    "Study them carefully and write your output in the same style.\n"
    "Examples:\n\n"
    "{examples}\n\n\n"
    "News content:\n\n{content}\n\ntitle:"
)


def few_shot(i):
    """Print the reference title and one generated title per few-shot prompt variant.

    Args:
        i: Positional index of the row in ``test_data``.
    """
    content = test_data['content'].iloc[i]
    real_title = test_data['title'].iloc[i]
    print(f"real title: {real_title}")

    for number, instruction in enumerate(FEW_SHOT_INSTRUCTIONS, start=1):
        prompt = FEW_SHOT_LAYOUT.format(
            instruction=instruction,
            examples=examples_text,
            content=content,
        )
        # NOTE(review): truncate_prompt keeps the end of the prompt, so for long inputs the
        # instruction and the start of the examples are what gets cut.
        prompt = truncate_prompt(tokenizer, prompt)
        # return_full_text=False: only the continuation is returned, so the printed
        # result is the title rather than the prompt followed by the title.
        result = pipe(
            prompt,
            max_new_tokens=50,
            temperature=0.6,
            do_sample=True,
            return_full_text=False,
        )
        print(f"Result for prompt {number}: " + clean_output_title(result[0]["generated_text"]))


few_shot(0)
few_shot(1)



# Fine-tuning

In [None]:
# Projection layers that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, gathered so the wrapping call stays a one-liner.
_lora_options = dict(
    r=16,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    target_modules=LORA_TARGET_MODULES,
    random_state=3407,
)

# Rebinds `model` to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(model, **_lora_options)

In [None]:
from datasets import Dataset

def prepare_data(df):
    """Build the prompt/response pairs used for fine-tuning and evaluation.

    The input frame is left untouched: the result is a new, independent
    DataFrame, so columns can be added to it later without writing back into
    (or warning about) the frame it was derived from.

    Args:
        df: DataFrame with string columns ``content`` (article text) and
            ``title`` (reference title).

    Returns:
        A new DataFrame with the same index and two columns: ``prompt`` (the
        instruction template ending in ``### output:``) and ``response`` (the
        stripped title prefixed with one space).

    Raises:
        AttributeError: If a title is not a string (``strip`` is called on it).
    """
    def build_prompt(content):
        # The template text, including its embedded line breaks and indentation,
        # is kept exactly as it was so prompts stay identical.
        return f"""### Instruction:\nGenerate a short and suitable Persian title for the following news content.
         Examples for train include input and
        what you should output are here: \n\n### input:\n{content}\n\n### output:"""

    # assign() returns a new frame instead of adding columns to the caller's `df`.
    prepared = df.assign(
        prompt=df["content"].apply(build_prompt),
        response=df["title"].apply(lambda title: f" {title.strip()}"),
    )
    return prepared[["prompt", "response"]].copy()


# Prompt/response frames for both splits.
train_df = prepare_data(train_data)
test_df = prepare_data(test_data)

# The splits come from a shuffled train_test_split, so their index is not a plain range.
# preserve_index=False keeps that index from being stored as an extra
# `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from trl import SFTTrainer
from transformers import TrainingArguments
from torch.cuda import is_bf16_supported

# Sequence-length cap passed to the trainer further down.
MAX_SEQ_LEN = 2048

# End-of-sequence marker appended to every training text.
EOS_TOKEN = tokenizer.eos_token

def formatting_func(examples):
    """Join each prompt/response pair of a batch into one training string.

    Args:
        examples: Mapping with parallel ``"prompt"`` and ``"response"`` sequences.

    Returns:
        A list with one string per pair: stripped prompt, a single space,
        stripped response, then ``EOS_TOKEN``.
    """
    pairs = zip(examples["prompt"], examples["response"])
    return [
        f"{prompt.strip()} {response.strip()}{EOS_TOKEN}"
        for prompt, response in pairs
    ]



# Use bf16 when the GPU supports it and fall back to fp16 otherwise, so that
# exactly one of the two mixed-precision flags is enabled.
_use_bf16 = is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    # 113 optimizer steps; each step accumulates 8 batches of 2 samples per device.
    max_steps=113,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=5,
    optim="paged_adamw_8bit",
    bf16=_use_bf16,
    fp16=not _use_bf16,
    logging_steps=1,
    save_total_limit=2,
)

# Supervised fine-tuning on the prompt/response pairs; formatting_func turns each
# batch into the final training strings.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    formatting_func=formatting_func,
    max_seq_length=MAX_SEQ_LEN,
)

trainer.train()


In [None]:

def generate_title_finetuned(model, tokenizer, news_text, max_len=50, temperature=0.7):
    """Sample a continuation for `news_text` and return the decoded sequences.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``batch_decode``.
        news_text: Prompt text to continue.
        max_len: Upper bound on newly generated tokens.
        temperature: Sampling temperature.

    Returns:
        The list produced by ``tokenizer.batch_decode`` over the generated
        sequences, with special tokens skipped (presumably each entry still
        starts with the prompt text - verify against the model's ``generate``).
    """
    # Tokenize and move the tensors to the device the model lives on.
    encoded = tokenizer(news_text, return_tensors="pt").to(model.device)

    # Sampling with both nucleus (top_p) and top-k filtering, stopping at EOS.
    sampling_options = {
        "max_new_tokens": max_len,
        "do_sample": True,
        "temperature": temperature,
        "top_p": 0.9,
        "top_k": 50,
        "eos_token_id": tokenizer.eos_token_id,
    }
    generated = model.generate(**encoded, **sampling_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)



In [None]:
import re

def clean_output(text):
    """Return what follows the last ``### output:`` marker, stripped.

    The marker is matched case-insensitively. Text without a marker is
    returned stripped and otherwise unchanged.
    """
    segments = re.split(r"### output:", text, flags=re.IGNORECASE)
    # Without a marker the only segment is the whole text, so taking the last
    # segment covers both cases.
    return segments[-1].strip()


In [None]:


from tqdm import tqdm

def _title_for(prompt):
    """Generate one cleaned title for a prepared prompt."""
    fitted_prompt = truncate_prompt(tokenizer, prompt)
    decoded = generate_title_finetuned(model, tokenizer, fitted_prompt)
    # The generator returns a list of decoded sequences; use the first one.
    first = decoded[0] if isinstance(decoded, list) else decoded
    return clean_output(first)


# One generated title per test prompt, in row order.
generated_titles = [
    _title_for(prompt)
    for prompt in tqdm(test_df["prompt"], desc="Generating titles")
]

test_df["generated_title"] = generated_titles

print(test_df[["prompt", "response", "generated_title"]].head())


In [None]:

!pip install evaluate
!pip install rouge_score


In [None]:
import evaluate

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Reference titles and the titles generated by the fine-tuned model, row-aligned.
references = test_df["response"].tolist()
predictions = test_df["generated_title"].tolist()

bleu_result = bleu.compute(predictions=predictions, references=references)

# The default ROUGE tokenizer keeps only ASCII letters and digits, which discards
# Persian text entirely and makes the scores meaningless. Splitting on whitespace
# keeps the Persian words as tokens.
rouge_result = rouge.compute(
    predictions=predictions,
    references=references,
    tokenizer=lambda text: text.split(),
)

meteor_result = meteor.compute(predictions=predictions, references=references)

print("BLEU:", bleu_result["bleu"])
print("ROUGE-L:", rouge_result["rougeL"])
print("METEOR:", meteor_result["meteor"])


In [None]:
# Save prompts, reference titles and generated titles to CSV without the index column.
# The utf-8-sig encoding writes a BOM (presumably so spreadsheet tools detect the
# Persian text as UTF-8 - confirm).
test_df.to_csv("/content/gemma_results.csv", index=False, encoding="utf-8-sig")