STEP 1: INSTALL LIBRARIES, SETUP ENVIRONMENT & CONNECT DRIVE


In [1]:
# Install the required libraries for model training and evaluation
!pip install transformers datasets evaluate rouge_score sacrebleu accelerate -U

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00

In [None]:
# Import Standard Libraries and Connect Drive

# Import Standard Data Science Libraries
# Note: AI libraries (transformers, datasets, torch) will be imported later in the training section
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Configure Visualization Settings
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)

# Connect to Google Drive
from google.colab import drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

print("Environment setup and Drive connection complete.")

Mounting Google Drive...
Mounted at /content/drive
Environment setup and Drive connection complete.


STEP 2: LOAD DATASET & PERFORM COLUMN CHECKS

In [None]:
# 2.1 Define the File Path
# UPDATE THIS PATH to point to your specific file in Google Drive
file_path = '/content/drive/My Drive/arXiv_scientific dataset.csv'

# 2.2 Load the Data
try:
    df = pd.read_csv(file_path)
    print(f"Data loaded successfully from: {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")

# ---------------------------------------------------------
# 2.3 DETAILED COLUMN & MISSING VALUE CHECK
# ---------------------------------------------------------

print("\n--- Column Information and Data Types (df.info()) ---")
df.info()

print("\n--- Column Names and Unique Values Count ---")
# Create a summary of columns, data types, and unique value counts
column_summary = pd.DataFrame({
    'Column Name': df.columns,
    'Data Type': df.dtypes,
    'Unique Values': [df[col].nunique() for col in df.columns]
})
display(column_summary)

print("\n--- Missing Value Summary (isnull().sum()) ---")
# Calculate the number of missing values per column
missing_values = df.isnull().sum()

# Filter and display only columns that have missing values
missing_values_to_show = missing_values[missing_values > 0].sort_values(ascending=False)

if missing_values_to_show.empty:
    print("No missing values found in the dataset.")
else:
    print("Columns with Missing Values:")
    display(missing_values_to_show.to_frame(name='Missing Count'))
    print(f"\nTotal rows in dataset: {len(df)}")

Data loaded successfully from: /content/drive/My Drive/arXiv_scientific dataset.csv

--- Column Information and Data Types (df.info()) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136238 entries, 0 to 136237
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  136238 non-null  object
 1   title               136238 non-null  object
 2   category            136238 non-null  object
 3   category_code       136238 non-null  object
 4   published_date      136238 non-null  object
 5   updated_date        136238 non-null  object
 6   authors             136238 non-null  object
 7   first_author        136238 non-null  object
 8   summary             136238 non-null  object
 9   summary_word_count  136238 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 10.4+ MB

--- Column Names and Unique Values Count ---


Unnamed: 0,Column Name,Data Type,Unique Values
id,id,object,136238
title,title,object,136154
category,category,object,138
category_code,category_code,object,139
published_date,published_date,object,7259
updated_date,updated_date,object,7196
authors,authors,object,125548
first_author,first_author,object,77742
summary,summary,object,136193
summary_word_count,summary_word_count,int64,346



--- Missing Value Summary (isnull().sum()) ---
No missing values found in the dataset.


STEP 3: DATA PREPROCESSING FOR TITLE GENERATION

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# ---------------------------------------------------------
# Configuration
# ---------------------------------------------------------
# Prefix required by T5 model to understand the summarization task
T5_PREFIX = "summarize: "
TEST_SIZE = 0.1
VAL_SIZE = 0.1

# ---------------------------------------------------------
# 1. Data Selection and Cleaning
# ---------------------------------------------------------
if 'df' not in locals():
    print("Error: DataFrame 'df' not found. Please run Step 2 (Load Data) first.")
else:
    print(f"Original Data Shape: {df.shape}")

    # Select only the input (summary) and target (title) columns
    df_gen = df[['summary', 'title']].copy()

    # Simple data cleaning: Drop rows with missing values
    rows_before_drop = len(df_gen)
    df_gen.dropna(inplace=True)

    # Remove rows where abstracts or titles are too short (likely noise/invalid data)
    df_gen = df_gen[df_gen['summary'].str.len() > 50]
    df_gen = df_gen[df_gen['title'].str.len() > 5]

    rows_after_drop = len(df_gen)
    print(f"Removed {rows_before_drop - rows_after_drop} rows due to missing or short content.")
    print(f"Remaining Data Shape: {df_gen.shape}")

    # ---------------------------------------------------------
    # 2. Add T5 Prefix (Crucial for T5 Model)
    # ---------------------------------------------------------
    # The T5 model expects a task prompt, here it's "summarize: "
    df_gen['summary'] = T5_PREFIX + df_gen['summary']

    print(f"Sample Input (with prefix): {df_gen.iloc[0]['summary'][:100]}...")
    print(f"Sample Target: {df_gen.iloc[0]['title']}")

    # ---------------------------------------------------------
    # 3. Convert to Hugging Face Dataset & Split
    # ---------------------------------------------------------
    raw_datasets = Dataset.from_pandas(df_gen)

    # Remove index column if exists
    if '__index_level_0__' in raw_datasets.column_names:
        raw_datasets = raw_datasets.remove_columns('__index_level_0__')

    # Split: Train (80%), Validation (10%), Test (10%)
    # Split 1: Extract Test set (10%)
    train_test = raw_datasets.train_test_split(test_size=TEST_SIZE, seed=42)

    # Split 2: Extract Validation set (10% of total ~= 11% of remaining)
    adjusted_val_size = VAL_SIZE / (1 - TEST_SIZE)
    test_valid = train_test['train'].train_test_split(test_size=adjusted_val_size, seed=42)

    # Combine into final structure
    dataset = DatasetDict({
        'train': test_valid['train'],
        'validation': test_valid['test'],
        'test': train_test['test']
    })

    print("\n DATA PREPROCESSING COMPLETE")
    print("--- Final Dataset Split Structure ---")
    print(dataset)

Original Data Shape: (136238, 10)
Removed 14 rows due to missing or short content.
Remaining Data Shape: (136224, 2)
Sample Input (with prefix): summarize: Because of their occasional need to return to shallow points in a search
tree, existing b...
Sample Target: Dynamic Backtracking

 DATA PREPROCESSING COMPLETE
--- Final Dataset Split Structure ---
DatasetDict({
    train: Dataset({
        features: ['summary', 'title'],
        num_rows: 108978
    })
    validation: Dataset({
        features: ['summary', 'title'],
        num_rows: 13623
    })
    test: Dataset({
        features: ['summary', 'title'],
        num_rows: 13623
    })
})


STEP 4: TOKENIZATION FOR T5

In [None]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

# ---------------------------------------------------------
# Configuration
# ---------------------------------------------------------
# We use "t5-small" as per Step 3 config
MODEL_CHECKPOINT = "t5-small"

# Context length settings
MAX_INPUT_LENGTH = 512  # Max length for Abstracts (Inputs)
MAX_TARGET_LENGTH = 128 # Max length for Titles (Targets)

# ---------------------------------------------------------
# 1. Load Tokenizer
# ---------------------------------------------------------
print(f"Loading tokenizer from {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# ---------------------------------------------------------
# 2. Define Preprocessing Function
# ---------------------------------------------------------
def preprocess_function(examples):
    # 1. Tokenize Inputs (Summaries)
    # We strip spaces just in case
    inputs = [doc.strip() for doc in examples["summary"]]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True
    )

    # 2. Tokenize Targets (Titles)
    # The 'labels' are the tokenized titles
    targets = [doc.strip() for doc in examples["title"]]
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# ---------------------------------------------------------
# 3. Apply Tokenization to Dataset
# ---------------------------------------------------------
print("Tokenizing dataset (This process handles both Input and Output)...")

# batched=True processes multiple texts at once for speed
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names # Remove raw text columns ('summary', 'title') to save RAM
)

print("\n TOKENIZATION COMPLETE")
print("--- Final Tokenized Structure ---")
print(tokenized_datasets)

# ---------------------------------------------------------
# 4. Create Data Collator (Dynamic Padding)
# ---------------------------------------------------------
# This is crucial for Seq2Seq training. It handles padding for both inputs and labels dynamically.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=MODEL_CHECKPOINT)

print("\n--- Sample Processed Data ---")
sample = tokenized_datasets['train'][0]
print(f"Input IDs shape: {len(sample['input_ids'])}")
print(f"Labels shape: {len(sample['labels'])}")
print(f"Decoded Input: {tokenizer.decode(sample['input_ids'])}")
print(f"Decoded Label: {tokenizer.decode(sample['labels'])}")

Loading tokenizer from t5-small...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizing dataset (This process handles both Input and Output)...


Map:   0%|          | 0/108978 [00:00<?, ? examples/s]

Map:   0%|          | 0/13623 [00:00<?, ? examples/s]

Map:   0%|          | 0/13623 [00:00<?, ? examples/s]


 TOKENIZATION COMPLETE
--- Final Tokenized Structure ---
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 108978
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13623
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13623
    })
})

--- Sample Processed Data ---
Input IDs shape: 189
Labels shape: 22
Decoded Input: summarize: We propose a dense object detector with an instance-wise sampling strategy, named IQDet. Instead of using human prior sampling strategies, we first extract the regional feature of each ground-truth to estimate the instance-wise quality distribution. According to a mixture model in spatial dimensions, the distribution is more noise-robust and adapted to the semantic pattern of each instance. Based on the distribution, we propose a quality sampling strategy, which automatically selec

STEP 5: MODEL SETUP & TRAINING

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import nltk
import math # Import math to check for NaNs

# Ensure nltk resources are downloaded
nltk.download("punkt")
nltk.download("punkt_tab") # Added this line to download the missing resource

# ---------------------------------------------------------
# 1. Load Pre-trained Model
# ---------------------------------------------------------
print(f"Loading model: {MODEL_CHECKPOINT}...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# ---------------------------------------------------------
# 2. Define Metrics (With Error Handling)
# ---------------------------------------------------------
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # --- FIX START: Ensure predictions are valid token IDs ---
    # Convert predictions to a numpy array (if not already)
    predictions = np.array(predictions)
    # Get the vocabulary size of the tokenizer
    vocab_size = tokenizer.vocab_size
    # Clip predictions to be within the valid token ID range [0, vocab_size - 1]
    # and ensure they are of integer type to prevent OverflowError during decoding.
    predictions = np.clip(predictions, 0, vocab_size - 1).astype(int)
    # --- FIX END ---

    # Decode generated summaries
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE preparation
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Convert to percentage
    result = {key: value * 100 for key, value in result.items()}

    # Add generation length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # --- SAFETY FIX FOR OVERFLOW ERROR ---
    final_result = {}
    for k, v in result.items():
        # Check if value is Infinity or NaN
        if math.isnan(v) or math.isinf(v):
            final_result[k] = 0.0 # Set to 0 if error
        else:
            final_result[k] = round(v, 4)

    return final_result

# ---------------------------------------------------------
# 3. Training Arguments (FP32 - Stable Mode)
# ---------------------------------------------------------
batch_size = 8
num_epochs = 3

args = Seq2SeqTrainingArguments(
    output_dir="./t5-title-generator",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,

    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=num_epochs,
    predict_with_generate=True,

    # --- KEY FIX HERE: DISABLE FP16 ---
    fp16=False,                     # Tắt chế độ 16-bit để tránh lỗi Overflow
    # ----------------------------------

    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none"
)

# ---------------------------------------------------------
# 4. Initialize Trainer
# ---------------------------------------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ---------------------------------------------------------
# 5. START TRAINING
# ---------------------------------------------------------
print("Starting training (FP32 Stable Mode)...")
trainer.train()

print("\nTraining Complete! Saving model...")
trainer.save_model("./final_t5_model")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading model: t5-small...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Starting training (FP32 Stable Mode)...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.2205,2.02842,43.3278,24.2933,38.776,38.7943,15.0414


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.2205,2.02842,43.3278,24.2933,38.776,38.7943,15.0414
2,2.1464,1.969573,44.0788,24.9075,39.4,39.4128,15.0754
3,2.1189,1.95986,44.1555,24.9388,39.4882,39.4899,15.1098


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Training Complete! Saving model...


Save model

In [None]:
import os

# Define the directory path on Google Drive where you want to save the model
# '/content/drive/MyDrive/' is the root folder of your Drive
model_save_path_drive = '/content/drive/MyDrive/mymodel'

# Create the directory if it doesn't exist
if not os.path.exists(model_save_path_drive):
    os.makedirs(model_save_path_drive)
    print(f"Created directory: {model_save_path_drive}")

try:
    # Save the model and tokenizer using save_pretrained()
    # This saves the model's weights and configuration, and the tokenizer's vocabulary.
    model.save_pretrained(model_save_path_drive)
    tokenizer.save_pretrained(model_save_path_drive)
    print(f"Model and tokenizer successfully saved to: {model_save_path_drive}")
except Exception as e:
    print(f"Error saving model and tokenizer to Drive: {e}")


Created directory: /content/drive/MyDrive/mymodel
Model and tokenizer successfully saved to: /content/drive/MyDrive/mymodel


STEP 6: EVALUATION (From Saved Model)

Load model was saved from drive

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
from google.colab import drive

# STEP 1: Mount Google Drive
# This is required to access the saved file in your Drive
drive.mount('/content/drive')

# STEP 2: Define the file path where the model was saved on Drive
# Ensure this path matches exactly where you saved the model (from step rFG4R7t1ApFK)
model_save_path_drive = '/content/drive/MyDrive/mymodel'

print(f"Loading model from: {model_save_path_drive}")

try:
    # Load the model and tokenizer using the transformers library
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_save_path_drive)
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path_drive)
    print("Model and Tokenizer loaded successfully.")

    # Optionally, move the model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_model.to(device)
    print(f"Model moved to: {device.upper()}")

    # Now you can use loaded_model and loaded_tokenizer for inference/evaluation
    # For example, to print model config:
    # print(loaded_model.config)

except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check if the directory exists and contains a valid Hugging Face model.")

# STEP 3: The model is now ready to use (loaded into loaded_model and loaded_tokenizer)


Mounted at /content/drive
Loading model from: /content/drive/MyDrive/mymodel
Model and Tokenizer loaded successfully.
Model moved to: CUDA


EVALUATION the model

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset, load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
import random
import nltk
from google.colab import drive

# Ensure NLTK resources are available
nltk.download("punkt")
nltk.download("punkt_tab")

# ---------------------------------------------------------
# STEP 1: Connect to Google Drive & Define Paths
# ---------------------------------------------------------
drive.mount('/content/drive')

# Path where you saved the model in Step 15 of your notebook
MODEL_PATH = '/content/drive/MyDrive/mymodel'
# Path to your original CSV file
CSV_FILE_PATH = '/content/drive/My Drive/arXiv_scientific dataset.csv'

print(f"Loading model from: {MODEL_PATH}")
print(f"Loading data from: {CSV_FILE_PATH}")

# ---------------------------------------------------------
# STEP 2: Load Model and Tokenizer
# ---------------------------------------------------------
try:
    model_eval = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
    tokenizer_eval = AutoTokenizer.from_pretrained(MODEL_PATH)

    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_eval.to(device)
    print(f"Model loaded successfully! Running on: {device.upper()}")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# ---------------------------------------------------------
# STEP 3: Re-create Test Dataset
# ---------------------------------------------------------
# We need to recreate the test split exactly as done in training to avoid data leakage
print("Reloading and processing dataset...")

# Load CSV
df = pd.read_csv(CSV_FILE_PATH)

# Basic cleaning (Same as training step)
df = df[['summary', 'title']].dropna()
df = df[df['summary'].str.len() > 50]
df = df[df['title'].str.len() > 5]

# Add T5 Prefix
T5_PREFIX = "summarize: "
df['summary'] = T5_PREFIX + df['summary']

# Convert to Hugging Face Dataset
raw_datasets = Dataset.from_pandas(df)
if '__index_level_0__' in raw_datasets.column_names:
    raw_datasets = raw_datasets.remove_columns('__index_level_0__')

# Perform the EXACT same split as training to get the correct Test set
# Split: Train (90%) / Test (10%) - seed must be 42
train_test = raw_datasets.train_test_split(test_size=0.1, seed=42)
test_dataset = train_test['test']

print(f"Test dataset prepared. Number of samples: {len(test_dataset)}")

# ---------------------------------------------------------
# STEP 4: Define Evaluation Metrics
# ---------------------------------------------------------
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# ---------------------------------------------------------
# STEP 5: Define Generation Function
# ---------------------------------------------------------
def generate_title(abstract, model, tokenizer):
    """
    Generates a title from an abstract using the loaded model.
    """
    inputs = tokenizer(
        abstract,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=64,
            min_length=5,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ---------------------------------------------------------
# STEP 6: Run Evaluation Loop
# ---------------------------------------------------------
print("\n--- STARTING EVALUATION ---")

# Evaluate on a subset (e.g., 100 samples) to save time, or use len(test_dataset) for full
EVAL_SAMPLES = 100
subset_test = test_dataset.shuffle(seed=42).select(range(EVAL_SAMPLES))

predictions = []
references = []

for sample in tqdm(subset_test, desc="Generating Titles"):
    # Input text (Abstract)
    abstract_text = sample['summary']
    # Ground truth (Real Title)
    real_title = sample['title']

    # Generate prediction
    pred_title = generate_title(abstract_text, model_eval, tokenizer_eval)

    predictions.append(pred_title)
    references.append(real_title)

# ---------------------------------------------------------
# STEP 7: Calculate and Display Metrics
# ---------------------------------------------------------
rouge_score = rouge_metric.compute(predictions=predictions, references=references)
bleu_score = bleu_metric.compute(predictions=predictions, references=[[r] for r in references])

print("\n=============================================")
print("           FINAL EVALUATION RESULTS          ")
print("=============================================")
print(f"ROUGE-1: {rouge_score['rouge1']*100:.2f}%")
print(f"ROUGE-2: {rouge_score['rouge2']*100:.2f}%")
print(f"ROUGE-L: {rouge_score['rougeL']*100:.2f}%")
print(f"BLEU:    {bleu_score['bleu']*100:.2f}%")
print("=============================================")

# ---------------------------------------------------------
# STEP 8: Show Examples
# ---------------------------------------------------------
print("\n--- QUALITATIVE EXAMPLES ---")
for i in range(5):
    idx = random.randint(0, len(predictions)-1)
    print(f"\nExample {i+1}:")
    print(f"Abstract (Input): {subset_test[idx]['summary'].replace('summarize: ', '')[:200]}...")
    print(f"Real Title:       {references[idx]}")
    print(f"Generated Title:  {predictions[idx]}")
    print("-" * 50)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model from: /content/drive/MyDrive/mymodel
Loading data from: /content/drive/My Drive/arXiv_scientific dataset.csv
Model loaded successfully! Running on: CUDA
Reloading and processing dataset...
Test dataset prepared. Number of samples: 13623


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


--- STARTING EVALUATION ---


Generating Titles: 100%|██████████| 100/100 [00:48<00:00,  2.07it/s]



           FINAL EVALUATION RESULTS          
ROUGE-1: 45.07%
ROUGE-2: 26.45%
ROUGE-L: 40.95%
BLEU:    8.41%

--- QUALITATIVE EXAMPLES ---

Example 1:
Abstract (Input): Semitic morphologically-rich languages (MRLs) are characterized by extreme
word ambiguity. Because most vowels are omitted in standard texts, many of the
words are homographs with multiple possible an...
Real Title:       Do Pretrained Contextual Language Models Distinguish between Hebrew
  Homograph Analyses?
Generated Title:  Hebrew Contextualized Embeddings for Semitic morphologically-rich Languages
--------------------------------------------------

Example 2:
Abstract (Input): Geophysical inversion attempts to estimate the distribution of physical
properties in the Earth's interior from observations collected at or above the
surface. Inverse problems are commonly posed as l...
Real Title:       Deep learning electromagnetic inversion with convolutional neural
  networks
Generated Title:  Deep Learning for Efficien