## **BERT encoder–decoder (bert2bert) summarizer**

In [None]:
!pip install transformers datasets torch nltk


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import torch
from transformers import BertTokenizer, EncoderDecoderModel
# from datasets import load_dataset
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Set device to CUDA if available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your CSV data (replace with your file path)
file_path = '/content/post_prewithno_start.csv'  # Replace with the path to your CSV file
df = pd.read_csv(file_path)
# Drop rows with a missing article or summary *before* subsetting. Without this,
# the str() conversion in preprocess_data silently turns NaN cells into the
# literal training example "nan".
df = df.dropna(subset=["text", "summary"])
df = df.head(10000)  # Use only the first 10,000 rows to keep the run time manageable

# The CSV is expected to have columns "text" (input) and "summary" (target)
texts = df['text'].tolist()
summaries = df['summary'].tolist()

# Hold out 10% of the pairs for validation; the fixed seed keeps the split reproducible
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    texts, summaries, test_size=0.1, random_state=42
)

# Initialize the tokenizer (the model itself is created further down)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function
def preprocess_data(texts, summaries, max_input_length=512, max_output_length=40):
    """Tokenize source texts and target summaries with the module-level ``tokenizer``.

    Every element is coerced to ``str`` first, then truncated and padded to a
    fixed length so each result is a rectangular PyTorch tensor batch.

    Args:
        texts: iterable of source documents.
        summaries: iterable of target summaries, aligned with ``texts``.
        max_input_length: fixed token length for the encoded texts.
        max_output_length: fixed token length for the encoded summaries.

    Returns:
        A ``(inputs, outputs)`` tuple of tokenizer encodings.
    """
    def _encode(items, limit):
        # Same tokenizer settings for both sides; only the length cap differs.
        return tokenizer(
            [str(item) for item in items],
            max_length=limit,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    encoded_texts = _encode(texts, max_input_length)
    encoded_summaries = _encode(summaries, max_output_length)
    return encoded_texts, encoded_summaries

# Tokenize both splits up front using preprocess_data's default length limits
# (512 tokens for texts, 40 for summaries). Each call returns an
# (inputs, outputs) pair of tokenizer encodings.
train_inputs, train_outputs = preprocess_data(train_texts, train_summaries)
val_inputs, val_outputs = preprocess_data(val_texts, val_summaries)

# Custom Dataset class
class SummarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded source texts with encoded summaries.

    ``inputs`` must expose ``'input_ids'`` and ``'attention_mask'`` and
    ``outputs`` must expose ``'input_ids'``; each value is indexed along its
    first dimension to pick out one example.
    """

    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        # Number of examples = size of the first dimension of the input ids.
        return self.inputs['input_ids'].shape[0]

    def __getitem__(self, idx):
        # Encoder-side fields come from the text encoding ...
        sample = {
            field: self.inputs[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        # ... and the summary token ids become the training labels.
        sample['labels'] = self.outputs['input_ids'][idx]
        return sample

# Create Dataset and DataLoader
# Only the training loader is shuffled; validation keeps a fixed order.
train_dataset = SummarizationDataset(train_inputs, train_outputs)
val_dataset = SummarizationDataset(val_inputs, val_outputs)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Initialize the Encoder-Decoder model
# Both halves start from bert-base-uncased. The load-time warning in this
# cell's output shows the decoder's cross-attention weights are newly
# initialized, so they are learned during fine-tuning.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "bert-base-uncased"
)
model.to(device)

# Set special tokens and generation defaults on the model config.
# NOTE(review): per the Hugging Face EncoderDecoderModel docs,
# decoder_start_token_id and pad_token_id are also used when the model is
# called with `labels` (to build the decoder inputs), so they must be set
# before training, not only for generation — confirm against the installed
# transformers version.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.eos_token_id = tokenizer.sep_token_id
# Generation defaults; explicit arguments passed to generate() later take precedence.
model.config.max_length = 128
model.config.min_length = 30
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.early_stopping = True

# Define optimizer
# This `optimizer` is a module-level global that train_model reads directly.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
def train_model(model, train_loader, val_loader, epochs=3):
    """Fine-tune ``model`` and report loss / token accuracy after every epoch.

    Uses the module-level ``optimizer`` and ``device``. After each epoch the
    model is evaluated on ``val_loader`` via ``validate_model``.

    Args:
        model: encoder-decoder model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns ``loss`` and ``logits``.
        train_loader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: validation batches in the same format.
        epochs: number of full passes over ``train_loader``.
    """
    pad_token_id = model.config.pad_token_id
    for epoch in range(epochs):
        # validate_model() puts the model in eval mode, so training mode must be
        # re-enabled at the start of every epoch; otherwise every epoch after
        # the first would train with dropout disabled.
        model.train()
        epoch_loss = 0
        total_tokens = 0
        correct_tokens = 0
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Summaries are padded to a fixed length. Mark padding with -100 so
            # the cross-entropy loss ignores it instead of rewarding the model
            # for predicting [PAD].
            if pad_token_id is not None:
                labels = labels.masked_fill(labels == pad_token_id, -100)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            loss.backward()
            optimizer.step()

            # Token-level accuracy over real (non-padding) target tokens only
            predicted_ids = torch.argmax(logits, dim=-1)
            real_tokens = labels != -100
            correct_tokens += ((predicted_ids == labels) & real_tokens).sum().item()
            total_tokens += real_tokens.sum().item()
            epoch_loss += loss.item()

        # max(..., 1) keeps the report from crashing on an empty loader
        epoch_accuracy = correct_tokens / max(total_tokens, 1)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / max(len(train_loader), 1)}, Accuracy: {epoch_accuracy * 100:.2f}%")

        validate_model(model, val_loader)

# Validation loop
def validate_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and print mean loss and token accuracy.

    Runs without gradient tracking on the module-level ``device``. Padding
    positions in the labels are excluded from both the loss and the accuracy.
    The model's train/eval mode is restored to what it was on entry, so calling
    this from inside a training loop does not leave the model in eval mode.

    Args:
        model: encoder-decoder model returning ``loss`` and ``logits``.
        val_loader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
    """
    was_training = model.training
    model.eval()
    pad_token_id = model.config.pad_token_id
    val_loss = 0
    total_tokens = 0
    correct_tokens = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # -100 is the index ignored by the cross-entropy loss
            if pad_token_id is not None:
                labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            logits = outputs.logits
            predicted_ids = torch.argmax(logits, dim=-1)
            real_tokens = labels != -100
            correct_tokens += ((predicted_ids == labels) & real_tokens).sum().item()
            total_tokens += real_tokens.sum().item()

    # max(..., 1) avoids ZeroDivisionError when the loader is empty
    val_accuracy = correct_tokens / max(total_tokens, 1)
    print(f"Validation Loss: {val_loss / max(len(val_loader), 1)}, Validation Accuracy: {val_accuracy * 100:.2f}%")
    # Hand the model back in the mode the caller had it in
    model.train(was_training)

# Train the model for 8 epochs; train_model runs validation after every epoch
train_model(model, train_loader, val_loader, epochs=8)

# NOTE: summary generation for new texts is not defined in this cell; it is
# implemented by the `summarize` function in a later cell.

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Epoch 1/8, Loss: 3.279447867075602, Accuracy: 63.37%


100%|██████████| 125/125 [00:35<00:00,  3.57it/s]


Validation Loss: 9.222363151550294, Validation Accuracy: 2.50%


100%|██████████| 1125/1125 [17:13<00:00,  1.09it/s]


Epoch 2/8, Loss: 2.9160825493070814, Accuracy: 63.55%


100%|██████████| 125/125 [00:35<00:00,  3.56it/s]


Validation Loss: 2.3045787076950073, Validation Accuracy: 67.90%


100%|██████████| 1125/1125 [17:15<00:00,  1.09it/s]


Epoch 3/8, Loss: 1.945189141485426, Accuracy: 69.22%


100%|██████████| 125/125 [00:35<00:00,  3.57it/s]


Validation Loss: 1.9588012819290161, Validation Accuracy: 69.86%


100%|██████████| 1125/1125 [17:15<00:00,  1.09it/s]


Epoch 4/8, Loss: 1.556448768403795, Accuracy: 71.52%


100%|██████████| 125/125 [00:35<00:00,  3.56it/s]


Validation Loss: 1.8479222450256347, Validation Accuracy: 70.53%


100%|██████████| 1125/1125 [17:15<00:00,  1.09it/s]


Epoch 5/8, Loss: 1.2219592928356595, Accuracy: 74.36%


100%|██████████| 125/125 [00:35<00:00,  3.55it/s]


Validation Loss: 1.7155531330108642, Validation Accuracy: 72.17%


100%|██████████| 1125/1125 [17:15<00:00,  1.09it/s]


Epoch 6/8, Loss: 0.9947020494143168, Accuracy: 77.17%


100%|██████████| 125/125 [00:35<00:00,  3.54it/s]


Validation Loss: 1.7095420627593994, Validation Accuracy: 72.61%


100%|██████████| 1125/1125 [17:15<00:00,  1.09it/s]


Epoch 7/8, Loss: 0.7572165556483799, Accuracy: 80.77%


100%|██████████| 125/125 [00:35<00:00,  3.55it/s]


Validation Loss: 1.6796447200775146, Validation Accuracy: 74.01%


100%|██████████| 1125/1125 [17:16<00:00,  1.09it/s]


Epoch 8/8, Loss: 0.593270792775684, Accuracy: 83.85%


100%|██████████| 125/125 [00:35<00:00,  3.54it/s]

Validation Loss: 1.6894993467330932, Validation Accuracy: 74.26%





In [None]:
# Generation defaults stored on the model config. early_stopping and
# length_penalty repeat the values already assigned when the model was set up.
# The later `summarize` cell passes num_beams=6 explicitly to generate().
model.config.num_beams = 4  # Enable beam search with 4 beams
model.config.early_stopping = True
model.config.length_penalty = 2.0


In [None]:
import os
import shutil

# Persist the fine-tuned model and its tokenizer into one directory
output_dir = "trained_model"
os.makedirs(output_dir, exist_ok=True)

for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Bundle the directory contents into trained_model.zip in the working directory
shutil.make_archive(base_name="trained_model", format='zip', root_dir=output_dir)

print("Model saved and zipped as 'trained_model.zip'")




Model saved and zipped as 'trained_model.zip'


In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8dbc41123b21075cb3900279ab715bffe48b81817ea51dc50334095290b65b69
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Report the size of the exported archive in bytes, KB and MB
zip_path = "/content/trained_model.zip"
file_size = os.path.getsize(zip_path)
file_size_kb = file_size / 1024  # bytes -> KB
file_size_mb = file_size_kb / 1024  # KB -> MB

print(
    f"ZIP File Size: {file_size} bytes",
    f"ZIP File Size: {file_size_kb:.2f} KB",
    f"ZIP File Size: {file_size_mb:.2f} MB",
    sep="\n",
)

ZIP File Size: 916545486 bytes
ZIP File Size: 895063.95 KB
ZIP File Size: 874.09 MB


In [None]:
# Test on new data
def summarize(text):
    """Generate a summary of ``text`` with the fine-tuned BERT encoder-decoder.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. These names
    are rebound by the T5 section further down, so run this cell before that
    section (or after re-running the BERT cells).

    Args:
        text: the document to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # padding=True pads only to the longest sequence in the call, so a single
    # document is not inflated to 512 tokens.
    inputs = tokenizer(
        text, max_length=512, truncation=True, padding=True, return_tensors="pt"
    ).to(device)

    was_training = model.training
    model.eval()  # make sure dropout is disabled while decoding
    try:
        with torch.no_grad():
            summary_ids = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=50,
                num_beams=6,
                early_stopping=True,
                # Decoding must start from the same token the decoder was
                # trained with ([CLS], see model.config.decoder_start_token_id).
                # Starting from [PAD] gives the decoder a prefix it never saw
                # during training and produces degenerate output.
                decoder_start_token_id=tokenizer.cls_token_id,
                eos_token_id=tokenizer.sep_token_id,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        model.train(was_training)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Smoke test: summarize one hand-written news snippet and print it next to the input
sample_text = "Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year. Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins. Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.Fit and more."
print("Original Text:", sample_text)
print("Summary:", summarize(sample_text))


Original Text: Kunal Shah's credit card bill payment platform, CRED, gave users a chance to win free food from Swiggy for one year. Pranav Kaushik, a Delhi techie, bagged this reward after spending 2000 CRED coins. Users get one CRED coin per rupee of bill paid, which can be used to avail rewards from brands like Ixigo, BookMyShow, UberEats, Cult.Fit and more.
Summary: video video video


## **T5 (t5-small) summarizer**

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm

# NOTE: this cell rebinds several globals that the BERT section above also
# defines (device, file_path, df, texts, summaries, the train/val splits,
# tokenizer and model). After it runs, the BERT `summarize` function would pick
# up the T5 tokenizer and model.

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
file_path = '/content/news_summary_more.csv'  # Replace with your file path
df = pd.read_csv(file_path)
df = df.dropna(subset=["text", "headlines"])  # Drop rows with missing data
df = df.head(1000)  # Use a subset for testing

# Split data into training and validation sets
# "text" is the model input and "headlines" is the target; 10% is held out
# with a fixed seed.
texts = df['text'].tolist()
summaries = df['headlines'].tolist()
train_texts, val_texts, train_summaries, val_summaries = train_test_split(
    texts, summaries, test_size=0.1, random_state=42
)

# Initialize tokenizer and model
model_name = "t5-small"  # Free and efficient pre-trained model for summarization
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# Preprocessing function
def preprocess_data(texts, summaries, tokenizer, max_input_length=100, max_output_length=15):
    """Build model-ready tensors for a set of (text, summary) pairs.

    Each text is prefixed with ``"summarize: "`` before tokenization. Both
    sides are truncated and padded to a fixed length, and padding positions in
    the target ids are replaced with ``-100``.

    Args:
        texts: iterable of source documents.
        summaries: target summaries, aligned with ``texts``.
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_input_length: fixed token length for the prefixed inputs.
        max_output_length: fixed token length for the targets.

    Returns:
        The input encoding with an added ``"labels"`` entry.
    """
    shared_kwargs = dict(truncation=True, padding="max_length", return_tensors="pt")

    prompts = ["summarize: " + f"{text}" for text in texts]
    model_inputs = tokenizer(prompts, max_length=max_input_length, **shared_kwargs)

    target_ids = tokenizer(summaries, max_length=max_output_length, **shared_kwargs)["input_ids"]
    # Replace padding in the targets with -100
    is_padding = target_ids == tokenizer.pad_token_id
    target_ids[is_padding] = -100

    model_inputs["labels"] = target_ids
    return model_inputs

# Preprocess training and validation data
# Uses preprocess_data's default limits (100 input tokens, 15 target tokens);
# each result already contains the "labels" entry.
train_data = preprocess_data(train_texts, train_summaries, tokenizer)
val_data = preprocess_data(val_texts, val_summaries, tokenizer)

# Dataset class
class SummarizationDataset(Dataset):
    """Map-style dataset over a mapping of equally long, indexable columns.

    ``data`` must contain an ``"input_ids"`` entry; its length defines the
    dataset size. Indexing returns one row as a dict with the same keys.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        input_ids = self.data["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        row = {}
        for name, column in self.data.items():
            row[name] = column[idx]
        return row

train_dataset = SummarizationDataset(train_data)
val_dataset = SummarizationDataset(val_data)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer and Scheduler
# transformers.AdamW is deprecated and no longer importable from recent
# transformers releases; use the PyTorch implementation, as the BERT section
# does. eps and weight_decay are set explicitly to the old transformers
# defaults so the optimization behaviour stays the same.
from torch.optim import AdamW
from transformers import get_scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# The linear schedule decays to zero over the whole run: the factor 3 is the
# number of epochs and must match the `epochs` value passed to train_model.
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training function
def train_model(model, train_loader, val_loader, optimizer, lr_scheduler, epochs=3):
    """Run the fine-tuning loop and validate after every epoch.

    Batches are moved to the module-level ``device``. The optimizer and the
    learning-rate scheduler are both stepped once per batch.

    Args:
        model: model whose forward pass returns an object with ``.loss``.
        train_loader: iterable of dict batches of tensors.
        val_loader: validation batches, passed on to ``validate_model``.
        optimizer: optimizer over ``model``'s parameters.
        lr_scheduler: scheduler stepped after every optimizer step.
        epochs: number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        batch_losses = []
        for raw_batch in tqdm(train_loader):
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            loss = model(**on_device).loss
            batch_losses.append(loss.item())

            # Standard update: clear old grads, backprop, step optimizer, then scheduler
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        print(f"Training Loss: {sum(batch_losses) / len(train_loader)}")
        validate_model(model, val_loader)

# Validation function
def validate_model(model, val_loader):
    """Print the mean loss of ``model`` over ``val_loader``.

    Evaluation runs in eval mode without gradient tracking on the module-level
    ``device``. The model is switched back to training mode before returning.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for raw_batch in tqdm(val_loader):
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            batch_losses.append(model(**on_device).loss.item())
    print(f"Validation Loss: {sum(batch_losses) / len(val_loader)}")
    # Leave the model ready for the next training epoch
    model.train()

# Train the model
# epochs=3 matches the `len(train_loader) * 3` used to size the LR schedule above.
train_model(model, train_loader, val_loader, optimizer, lr_scheduler, epochs=3)

# Generate summaries
def generate_summary(model, tokenizer, text, max_length=15):
    """Generate a summary of ``text`` with beam search (4 beams).

    The text is prefixed with ``"summarize: "`` and truncated/padded to 100
    tokens, matching the preprocessing used for training. Tensors are moved to
    the module-level ``device``.

    Args:
        model: sequence-to-sequence model exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        text: the document to summarize.
        max_length: maximum number of generated tokens.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = f"summarize: {text}"
    inputs = tokenizer(
        input_text, max_length=100, truncation=True, padding="max_length", return_tensors="pt"
    ).to(device)

    # validate_model() leaves the model in training mode, so without this the
    # summary would be decoded with dropout active (noisy, non-deterministic).
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_length, num_beams=4
            )
    finally:
        # Restore whatever mode the caller had the model in
        model.train(was_training)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test example
# A doctor/patient dialogue transcript. generate_summary truncates its input to
# 100 tokens, so only the beginning of this text reaches the model.
example_text = "Person1 Hi Mr Smith Im Doctor Hawkins Why are you here today Person2 I found it would be a good idea to get a checkup Person1 Yes well you havent had one for 5 years You should have one every year Person2 I know I figure as long as there is nothing wrong why go see the doctor Person1 Well the best way to avoid serious illnesses is to find out about them early So try to come at least once a year for your own good Person2 Ok Person1 Let me see here Your eyes and ears look fine Take a deep breath please Do you smoke Mr Smith Person2 Yes Person1 Smoking is the leading cause of lung cancer and heart disease you know You really should quit Person2 Ive tried hundreds of times but I just cant seem to kick the habit Person1 Well we have classes and some medications that might help Ill give you more information before you leave Person2 Ok thanks doctor"
summary = generate_summary(model, tokenizer, example_text)
print(f"Generated Summary: {summary}")


Using device: cuda

Epoch 1/3


100%|██████████| 113/113 [00:09<00:00, 11.70it/s]


Training Loss: 2.97870525849604


100%|██████████| 13/13 [00:00<00:00, 50.06it/s]


Validation Loss: 2.4727455285879283

Epoch 2/3


100%|██████████| 113/113 [00:09<00:00, 11.70it/s]


Training Loss: 2.4658943952712336


100%|██████████| 13/13 [00:00<00:00, 49.86it/s]


Validation Loss: 2.299238544244033

Epoch 3/3


100%|██████████| 113/113 [00:09<00:00, 12.47it/s]


Training Loss: 2.347443833815313


100%|██████████| 13/13 [00:00<00:00, 36.65it/s]


Validation Loss: 2.265897796704219
Generated Summary: I found it would be a good idea to get a check


In [None]:
# Second example: a news snippet. The text is written as two string pieces that
# Python joins at compile time; the explicit trailing space on the first piece
# keeps the sentences from being glued together as "derailed.In the".
example_text = (
    "Speaking about the sexual harassment allegations against Rajkumar Hirani, Sonam Kapoor said, I've known Hirani for many years...What if it's not true, the [#MeToo] movement will get derailed. "
    "In the #MeToo movement, I always believe a woman. But in this case, we need to reserve our judgment, she added. Hirani has been accused by an assistant who worked in 'Sanju'."
)
summary = generate_summary(model, tokenizer, example_text)
print(f"Generated Summary: {summary}")


Generated Summary: I've known Hirani for many years
