In [None]:
# Install the required libraries
!pip install transformers datasets torch




In [None]:
# Import required libraries
import inspect

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
)


In [None]:
!kaggle datasets download -d yatharthgautam123789/cnn-dailymail-3-0-0
!unzip cnn-dailymail-3-0-0.zip

Dataset URL: https://www.kaggle.com/datasets/yatharthgautam123789/cnn-dailymail-3-0-0
License(s): MIT
Downloading cnn-dailymail-3-0-0.zip to /content
 78% 25.0M/32.2M [00:00<00:00, 34.1MB/s]
100% 32.2M/32.2M [00:00<00:00, 41.4MB/s]
Archive:  cnn-dailymail-3-0-0.zip
  inflating: cnn_dailymail-3.0.0_train_1000_aug-LM_repeat-10.csv  
  inflating: cnn_dailymail-3.0.0_train_1000_aug-no_aug_repeat-10.csv  


In [None]:
# Read the extracted CSV into a DataFrame (replace the path with your dataset path)
df = pd.read_csv(
    '/content/cnn_dailymail-3.0.0_train_1000_aug-no_aug_repeat-10.csv'  # Replace with your dataset path
)

# Keep a reproducible 1% random sample so the rest of the notebook runs quickly,
# then renumber the rows from zero.
df_sampled = df.sample(frac=0.01, random_state=42)
df_sampled = df_sampled.reset_index(drop=True)

# Wrap the sample in a Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampled)


In [None]:
# Fetch the tokenizer that belongs to the pre-trained 't5-base' checkpoint
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-base')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Preprocessing function for text summarization
def preprocess_summarization(examples):
    """Tokenize a batch of articles and highlights for T5 fine-tuning.

    Intended to be used with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with an ``"article"`` list (source texts) and a
            ``"highlights"`` list (reference summaries) of equal length.

    Returns:
        The tokenizer output for the articles (padded/truncated to 512 tokens)
        with an added ``"labels"`` entry holding the highlight token ids
        (padded/truncated to 150 tokens, padding replaced by -100).
    """
    # Prepend the same task prefix that is used at inference time, so the model
    # is trained and queried with identically shaped prompts.
    inputs = ["summarize: " + article for article in examples["article"]]
    targets = examples["highlights"]  # Replace with the correct column name if different

    # Tokenize the source articles
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the summaries as targets; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=150, truncation=True, padding="max_length")

    # Replace padding ids with -100 so padded positions are ignored by the loss
    # instead of teaching the model to emit padding.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs


In [None]:
# Run the summarization preprocessing over the dataset, one batch at a time
processed_dataset = dataset.map(function=preprocess_summarization, batched=True)
# Instantiate the 't5-base' checkpoint as a conditional-generation model
model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path='t5-base')


Map:   0%|          | 0/100 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Fine-tune the T5 model on the preprocessed summarization data.

# Hold out 10% of the examples so the validation loss is measured on data the
# model was not trained on (evaluating on the training set hides overfitting).
split_dataset = processed_dataset.train_test_split(test_size=0.1, seed=42)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install is not pinned, so pick whichever keyword this version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments for summarization model fine-tuning.
# The sampled dataset only yields a couple dozen optimizer steps (the recorded
# run stopped at global_step=25), so logging/eval/save intervals must be far
# smaller than that or nothing is ever logged, evaluated or checkpointed.
training_args = TrainingArguments(
    output_dir="./results_summarization",  # Directory for saving results
    save_strategy="steps",  # Save checkpoints at steps
    learning_rate=2e-5,  # Fine-tuning learning rate
    per_device_train_batch_size=2,  # Training batch size
    per_device_eval_batch_size=2,  # Evaluation batch size
    num_train_epochs=1,  # Number of epochs for training
    weight_decay=0.01,  # Weight decay for regularization
    logging_dir='./logs',  # Directory for logging
    logging_steps=5,  # Frequency of logging
    save_steps=10,  # Frequency of saving model checkpoints
    eval_steps=5,  # Frequency of evaluation
    save_total_limit=3,  # Limit the total number of saved checkpoints
    fp16=False,  # Set to True if using GPU with mixed precision
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch size
    **{_eval_strategy_kwarg: "steps"},  # Evaluation after every `eval_steps` steps
)
# Initialize the Trainer with model, training args, and datasets
trainer = Trainer(
    model=model,  # Pre-trained T5 model
    args=training_args,  # Training arguments defined above
    train_dataset=split_dataset["train"],  # 90% of the processed examples
    eval_dataset=split_dataset["test"],  # Held-out 10% used for validation
)
# Train the summarization model
trainer.train()




Step,Training Loss,Validation Loss


TrainOutput(global_step=25, training_loss=7.151566162109375, metrics={'train_runtime': 37.396, 'train_samples_per_second': 2.674, 'train_steps_per_second': 0.669, 'total_flos': 60895789056000.0, 'train_loss': 7.151566162109375, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files, each in its own directory
model.save_pretrained(save_directory="./trained_t5_summarization_model")
tokenizer.save_pretrained(save_directory="./trained_t5_summarization_tokenizer")


('./trained_t5_summarization_tokenizer/tokenizer_config.json',
 './trained_t5_summarization_tokenizer/special_tokens_map.json',
 './trained_t5_summarization_tokenizer/spiece.model',
 './trained_t5_summarization_tokenizer/added_tokens.json')

In [None]:
# Sample text used to try out the fine-tuned summarizer
test_article = "Erwin Schrödinger's experiment, known as Schrödinger's Cat, illustrates quantum superposition and measurement. In the thought experiment, a cat is placed in a sealed box with a radioactive atom, a Geiger counter, a vial of poison, and a hammer. If the atom decays, the Geiger counter triggers the hammer to break the vial, killing the cat. Quantum mechanics implies the atom is in a superposition of decayed and not decayed states, making the cat simultaneously alive and dead until observed. Schrödinger used this paradox to highlight the strangeness of quantum mechanics when applied to everyday objects."

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Build the prefixed prompt, encode it (cut off at 512 tokens) and place the
# resulting tensor on the same device as the model
prompt = "summarize: " + test_article
inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)

# Beam-search settings for the generated summary
generation_options = dict(
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
summary_ids = model.generate(inputs, **generation_options)

# Turn the first generated sequence back into plain text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


Generated Summary: a cat is placed in a box with a radioactive atom, a Geiger counter, a vial of poison, and a hammer. if the atom decays, the Geiger counter triggers the hammer, killing the cat. quantum mechanics implies the atom is in a superposition of decayed states.


In [None]:
# Load the M2M100 model and tokenizer for English to Hindi translation.
# They are bound to their own names so they do not overwrite the T5 `tokenizer`
# and `model` used by the summarization cells; reusing those names would make
# re-running the summarization or save cells silently operate on M2M100.
model_name = 'facebook/m2m100_418M'
translation_tokenizer = M2M100Tokenizer.from_pretrained(model_name)
translation_model = M2M100ForConditionalGeneration.from_pretrained(model_name)


# Function to translate English text to Hindi
def translate_to_hindi(text):
    """Translate English text to Hindi with the M2M100 model.

    Args:
        text: English text to translate.

    Returns:
        The Hindi translation as a string. An empty or whitespace-only string
        is returned as ``""`` without calling the model.
    """
    # Nothing to translate: skip the model instead of generating from an empty prompt
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    # Prepare the text for translation
    translation_tokenizer.src_lang = "en"  # Set source language to English
    encoded_input = translation_tokenizer(text, return_tensors="pt")

    # Generate translation; forcing the first generated token to the Hindi
    # language id selects the target language
    translated = translation_model.generate(
        **encoded_input,
        forced_bos_token_id=translation_tokenizer.get_lang_id("hi"),
    )

    # Decode the translation
    hindi_text = translation_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return hindi_text





Hindi: एक बिल्ली को एक रेडियोधर्मी परमाणु के साथ एक बॉक्स में रखा जाता है, एक गीगर कॉन्ट्रेटर, एक विषाक्त वायरल, और एक हैमर. अगर परमाणु विघटित हो जाता है, तो गीगर कॉन्ट्रेटर हैमर को उत्सर्जित करता है, बिल्ली को मारता है. क्वांटम मैकेनिकल का मतलब है कि परमाणु विघटित राज्यों की एक सतह में है.


In [None]:
# Example usage
# Show the English summary, then its Hindi translation
print(f"Generated Summary: {summary}")

hindi_translation = translate_to_hindi(summary)
print("Hindi:", hindi_translation)

Generated Summary: a cat is placed in a box with a radioactive atom, a Geiger counter, a vial of poison, and a hammer. if the atom decays, the Geiger counter triggers the hammer, killing the cat. quantum mechanics implies the atom is in a superposition of decayed states.
Hindi: एक बिल्ली को एक रेडियोधर्मी परमाणु के साथ एक बॉक्स में रखा जाता है, एक गीगर कॉन्ट्रेटर, एक विषाक्त वायरल, और एक हैमर. अगर परमाणु विघटित हो जाता है, तो गीगर कॉन्ट्रेटर हैमर को उत्सर्जित करता है, बिल्ली को मारता है. क्वांटम मैकेनिकल का मतलब है कि परमाणु विघटित राज्यों की एक सतह में है.
