In [1]:
# Mount Google Drive for persistent storage
from google.colab import drive
drive.mount('/content/drive')

# Install required packages
!pip install pandas numpy datasets transformers rouge_score

Mounted at /content/drive
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━

In [2]:
import pandas as pd
import re
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the CompScholar dataset
comp_scholar = pd.read_csv("/content/compscholar.csv")

# Drop rows with missing "Document" or "Summary"
comp_scholar = comp_scholar.dropna(subset=["Document", "Summary"])

# For free-mode testing, sample a small subset (e.g., 50 samples)
comp_scholar = comp_scholar.sample(n=min(50, len(comp_scholar)), random_state=42).reset_index(drop=True)

# Simple cleaning: remove extra whitespace
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

comp_scholar["Document"] = comp_scholar["Document"].apply(clean_text)
comp_scholar["Summary"] = comp_scholar["Summary"].apply(clean_text)

# Rename columns for clarity
data = comp_scholar[["Document", "Summary"]].rename(columns={"Document": "text", "Summary": "summary"})

# Split into training and validation sets (we use a small subset for training)
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)
print("Train samples:", len(train_df))
print("Validation samples:", len(val_df))

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

Train samples: 40
Validation samples: 10


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Choose a lightweight summarization model; you can try "sshleifer/distilbart-cnn-12-6"
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [4]:
def tokenize_function(example):
    model_inputs = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["summary"], truncation=True, padding="max_length", max_length=150)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both training and validation datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text", "summary"])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text", "summary"])
print("Tokenization complete.")

Map:   0%|          | 0/40 [00:00<?, ? examples/s]



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenization complete.


In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",       # Note: 'evaluation_strategy' is deprecated; you may also use 'eval_strategy'
    eval_steps=200,
    save_steps=500,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=50,
    fp16=False,                  # Disable fp16 since you're on TPU
    bf16=True,                   # Optional: Use bf16 on TPU for mixed precision
    save_total_limit=1,
    report_to=[]                 # Disable external logging (e.g., wandb)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

print("Trainer is set up. Starting training...")
trainer.train()


  trainer = Trainer(


Trainer is set up. Starting training...


Step,Training Loss,Validation Loss




TrainOutput(global_step=20, training_loss=2.6767452239990233, metrics={'train_runtime': 44.2706, 'train_samples_per_second': 0.904, 'train_steps_per_second': 0.452, 'total_flos': 30958241710080.0, 'train_loss': 2.6767452239990233, 'epoch': 1.0})

In [7]:
import torch
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

nltk.download('punkt')

# Define your summary generation function (adjust parameters as needed)
def generate_summary(text, tokenizer, model, max_input_length=1024, max_output_length=300):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_length)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_output_length,
        num_beams=8,            # Increased beams for potentially better outputs
        length_penalty=1.2,     # Adjust length penalty to encourage more detailed summaries
        no_repeat_ngram_size=3,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Prepare lists for predictions and references
decoded_preds = []
decoded_refs = []

# Switch model to evaluation mode
model.eval()

# Iterate over your tokenized validation dataset
for i, sample in enumerate(tokenized_val):
    # Decode original text and reference summary (optional: you can store these during tokenization)
    text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    ref = tokenizer.decode(sample["labels"], skip_special_tokens=True)

    # Generate prediction summary
    pred = generate_summary(text, tokenizer, model, max_input_length=1024, max_output_length=300)
    decoded_preds.append(pred)
    decoded_refs.append(ref)

    if (i + 1) % 10 == 0:
        print(f"Processed {i+1}/{len(tokenized_val)} samples.")

# Compute ROUGE Scores using rouge_score package
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def compute_rouge(predictions, references):
    scores = {"rouge1": [], "rouge2": [], "rougeL": []}
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores["rouge1"].append(score["rouge1"].fmeasure)
        scores["rouge2"].append(score["rouge2"].fmeasure)
        scores["rougeL"].append(score["rougeL"].fmeasure)
    avg_scores = {key: (sum(scores[key]) / len(scores[key])) * 100 for key in scores}
    return avg_scores

rouge_results = compute_rouge(decoded_preds, decoded_refs)
print("ROUGE Evaluation Results:")
print(rouge_results)

# Compute BLEU Score using NLTK
def compute_bleu(predictions, references):
    smooth_fn = SmoothingFunction().method4
    tokenized_preds = [pred.split() for pred in predictions]
    tokenized_refs = [[ref.split()] for ref in references]  # Each reference wrapped in a list
    bleu_score = corpus_bleu(tokenized_refs, tokenized_preds, smoothing_function=smooth_fn)
    return bleu_score * 100

bleu_score_value = compute_bleu(decoded_preds, decoded_refs)
print("BLEU Score: {:.2f}".format(bleu_score_value))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processed 10/10 samples.
ROUGE Evaluation Results:
{'rouge1': 51.14644720273398, 'rouge2': 28.515618559590273, 'rougeL': 37.22519857483738}
BLEU Score: 14.68


In [9]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define the save path (adjust folder name/path as desired)
save_path = "/content/drive/MyDrive/fine-tuned-model"

# Save your model and tokenizer (using your Trainer)
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print("Model and tokenizer saved to", save_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model and tokenizer saved to /content/drive/MyDrive/fine-tuned-model


In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load the saved model and tokenizer from Google Drive
save_path = "/content/drive/MyDrive/fine-tuned-model"
model = AutoModelForSeq2SeqLM.from_pretrained(save_path).to("cpu")  # Use "cuda" if GPU available
tokenizer = AutoTokenizer.from_pretrained(save_path)

# For demonstration, load a research article from your dataset.
# Assume comp_scholar DataFrame is available; if not, reload it.
import pandas as pd
import re

# Load the dataset
comp_scholar = pd.read_csv("/content/compscholar.csv")
comp_scholar = comp_scholar.dropna(subset=["Document", "Summary"])
# Optionally, work with a small subset (e.g., 50 samples)
comp_scholar = comp_scholar.sample(n=min(50, len(comp_scholar)), random_state=42).reset_index(drop=True)

# Clean the text (optional)
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

comp_scholar["Document"] = comp_scholar["Document"].apply(clean_text)
comp_scholar["Summary"] = comp_scholar["Summary"].apply(clean_text)

# Select one research article (e.g., the first one)
sample_article = comp_scholar["Document"].iloc[0]
print("Original Research Article:")
print(sample_article)
print("\n" + "="*80 + "\n")

# Define a summary generation function
def generate_summary(text, tokenizer, model, max_input_length=1024, max_output_length=300):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_length)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_output_length,
        num_beams=6,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summary using your saved (fine-tuned) model
your_model_summary = generate_summary(sample_article, tokenizer, model, max_input_length=1024, max_output_length=300)
print("Generated Summary (Your Model):")
print(your_model_summary)
print("\n" + "="*80 + "\n")

Original Research Article:
Sentiment Analysis for Social MediaSentiment Analysis, Data Mining, Twitter.Sentiment analysis, the automated extraction of expressions of positive or negative attitudes from text has received considerable attention from researchers during the past decade. In addition, the popularity of internet users has been growing fast parallel to emerging technologies; that actively use online review sites, social networks and personal blogs to express their opinions. They harbor positive and negative attitudes about people, organizations, places, events, and ideas. The tools provided by natural language processing and machine learning along with other approaches to work with large volumes of text, makes it possible to begin extracting sentiments from social media. In this paper we discuss some of the challenges in sentiment extraction, some of the approaches that have been taken to address these challenges and our approach that analyses sentiments from Twitter social me



Generated Summary (Your Model):
 This article discusses the need for modernizing the use of technology to improve the quality of life in the United States. The article also discusses the importance of modernizing and modernizing technology.




In [15]:
# Import necessary libraries
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer

# Define model paths
pegasus_model_name = "google/pegasus-xsum"  # Pegasus Baseline Model
fine_tuned_model_path = "/content/drive/MyDrive/fine-tuned-model"  # Replace with your model's path

# Load Pegasus model and tokenizer
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name)

# Load your fine-tuned model and tokenizer
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_model_path)

def generate_summary(text, tokenizer, model, max_input_length=1024, max_output_length=300):
    """
    Generates a summary for a given research article using a specified model.

    Args:
    - text (str): Input research paper text
    - tokenizer: Tokenizer for model
    - model: Model for text summarization
    - max_input_length (int): Maximum token length for input
    - max_output_length (int): Maximum length of summary output

    Returns:
    - str: Generated summary
    """

    # Tokenize the input text and ensure truncation
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest", max_length=max_input_length)

    # Error handling for empty input after tokenization
    if inputs["input_ids"].shape[1] == 0:
        raise ValueError("Empty input after tokenization. Check tokenization.")

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_output_length,
        num_beams=5,
        early_stopping=True
    )

    # Decode summary and return
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Sample Research Paper Abstract (Example Text)
sample_article = """
The field of artificial intelligence (AI) has seen rapid advancements in recent years, particularly in deep learning
and natural language processing. This paper explores the latest AI models, including GPT-4, BERT, and Pegasus,
analyzing their applications in text summarization. The study compares different model architectures, training
techniques, and evaluation metrics. Furthermore, challenges in scalability and ethical concerns regarding AI-generated
content are discussed. The results indicate that Pegasus outperforms traditional models in abstractive summarization
while maintaining a high degree of coherence and factual accuracy.
"""

# Generate Summaries
try:
    pegasus_summary = generate_summary(sample_article, pegasus_tokenizer, pegasus_model)
    fine_tuned_summary = generate_summary(sample_article, fine_tuned_tokenizer, fine_tuned_model)

    print("\n🔹 Generated Summary (Pegasus Model):")
    print(pegasus_summary)

    print("\n🔹 Generated Summary (Fine-Tuned Model):")
    print(fine_tuned_summary)

    # Compare Lengths
    print("\n🔹 Comparison:")
    print(f"Pegasus Summary Length: {len(pegasus_summary.split())} words")
    print(f"Fine-Tuned Model Summary Length: {len(fine_tuned_summary.split())} words")

    # Display Key Differences
    print("\n🔹 Key Differences:")
    print("- Pegasus focuses on concise abstractive summaries.")
    print("- Your fine-tuned model should adapt to domain-specific language.")

except Exception as e:
    print("❌ Error:", str(e))


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔹 Generated Summary (Pegasus Model):
Text summarization is one of the most challenging areas of artificial intelligence research.

🔹 Generated Summary (Fine-Tuned Model):
 This paper explores the latest AI models, including GPT-4, BERT, and Pegasus, for their applications in text summarization. The study compares different model architectures, training techniques, and evaluation metrics. The results indicate that Pegasus outperforms traditional models in abstractive summarization, while maintaining a high degree of coherence and factual accuracy. The study also discusses challenges in scalability and ethical concerns regarding AI-generated content.

🔹 Comparison:
Pegasus Summary Length: 13 words
Fine-Tuned Model Summary Length: 63 words

🔹 Key Differences:
- Pegasus focuses on concise abstractive summaries.
- Your fine-tuned model should adapt to domain-specific language.
