In [1]:
!pip install -U transformers datasets evaluate accelerate
!pip install scikit-learn
!pip install tensorboard

Collecting transformers
  Downloading transformers-4.50.1-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Downloading transformers-4.50.1-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.5.2-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=278e3217ea8803646b7d0f667d27e9161c5659183fc6a04063272b5070d659fb
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
import torch
import pprint
import evaluate
import numpy as np
 
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

README.md:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

bbc-news-summary.csv:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2224 [00:00<?, ? examples/s]

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


In [31]:
# Select an article (e.g., the first one)
sample_article = dataset_train[0]['Articles']
sample_summary = dataset_train[0]['Summaries']

# Print length of article and summary
print(f"Article Length: {len(sample_article)} characters")
print(f"Article: {sample_article}\n")
print(f"Summary: {sample_summary}")

Article Length: 2777 characters



In [4]:
MODEL = 't5-base'
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.
tokenizer = T5Tokenizer.from_pretrained(MODEL)
 
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
 
    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
 
# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map (num_proc=4):   0%|          | 0/1779 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/445 [00:00<?, ? examples/s]



In [5]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
rouge = evaluate.load("rouge")
 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
 
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )
 
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
 
    return {k: round(v, 4) for k, v in result.items()}

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [6]:
import torch
import pprint
import evaluate
import numpy as np
 
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)
MODEL = 't5-base'
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.
tokenizer = T5Tokenizer.from_pretrained(MODEL)
 
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
 
    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )
 
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
 
# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")
rouge = evaluate.load("rouge")
 
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
 
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )
 
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
 
    return {k: round(v, 4) for k, v in result.items()}
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)
 
history = trainer.train()
model_path = f"{OUT_DIR}"  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.326,0.412758,0.8972,0.8234,0.8782,238.2517
400,0.3961,0.365527,0.9035,0.8331,0.8855,238.5281
600,0.388,0.351529,0.9069,0.8374,0.8891,238.5281




In [15]:
import os
print(OUT_DIR)  # Check what the variable is storing
print(os.listdir(OUT_DIR))  # See if tokenizer files exist


results_t5base
['checkpoint-446', 'checkpoint-669', 'events.out.tfevents.1743015759.fadca9d4e559.69.0']


In [17]:
from transformers import T5Tokenizer

# Load tokenizer from Hugging Face
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Save inside results_t5base so you can reuse it
tokenizer.save_pretrained("results_t5base")


('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [19]:
tokenizer = T5Tokenizer.from_pretrained("results_t5base")


In [24]:
input_text = "summarize: You're Malak Raaf, a senior Computer Science student at Ain Shams University. You have a strong interest in data science and are currently looking for data engineering opportunities. You’ve worked extensively with optimization problems, particularly related to transportation and logistics, using Excel and Python. Your projects involve data processing, linear programming, and GUI development for data applications. You've also worked with arXiv educational article datasets, indicating an interest in NLP or academic data analysis."


In [25]:
input_ids = tokenizer(input_text, return_tensors="pt").input_ids


In [26]:
output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: You're Malak Raaf, a senior Computer Science student at Ain Shams University. You’ve worked extensively with optimization problems, particularly related to transportation and logistics, using Excel and Python. Your projects involve data processing, linear programming, and GUI development for data applications. You've also worked with arXiv educational article datasets, indicating an interest in NLP or academic data analysis.


In [32]:
input_text_3 = """
Web helps collect aid donations..The web is helping aid agencies gather resources to help cope with the aftermath of the tsunami disaster...Many people are making donations via websites or going online to see how they can get involved with aid efforts. High-profile web portals such as Google, Yahoo, Ebay and Amazon are gathering links that lead people to aid and relief organisations. So many were visiting some aid-related sites that some webpages were struggling to cope with the traffic. An umbrella organisation called the Disasters Emergency Committee (DEC) has been set up by a coalition of 12 charities and has been taking many donations via its specially created website. It urged people to go online where possible to help because donations could be processed more quickly than cash donated in other ways, meaning aid could be delivered as quickly as possible. The site has so far received almost £8 million, with more than 11,000 donations being made online every hour...Telco BT stepped in to take over the secure payments on the DEC site and provided extra logistical support for phone and online appeals after it was initially crippled with online donations. It has also provided space in London's BT tower for one of the call centres dealing with donations...Some of the web's biggest firms are also helping to channel help by modifying their homepages to include links to aid agencies and organisations collecting resources. On its famously sparse homepage Google has placed a link that leads users to a list of sites where donations can be made. Among the 17 organisations listed are Oxfam, Medecins sans Frontieres (Doctors Without Borders) and Network for Good. Many of the sites that Google lists are also taking online donations. Online retailer Amazon has put a large message on its start page that lets people donate money directly to the American Red Cross that will be used with relief efforts. Auction site eBay is giving a list of sites that people can either donate directly to, divert a portion of their profits from anything they sell on eBay to the listed organisations or simply buy items that direct cash to those in the list. Yahoo is proving links direct to charities for those that want to donate. The Auction Drop website is asking people to donate old digital cameras, computers and other gadgets they no longer want that can be auction to raise cash for the aid effort. Sadly, the outpouring of goodwill has also encouraged some conmen to try to cash in. Anti-fraud organisations are warning about e-mails that are starting to circulate which try to convince people to send money directly to them rather than make donations via aid agencies. Those wanting to give cash were urged to use legitimate websites of charities and aid agencies.
"""
input_ids = tokenizer(input_text_3, return_tensors="pt").input_ids

output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: many of the sites that Google lists are also taking online donations.Auction site eBay is giving a list of sites that people can either donate directly to, divert a portion of their profits from anything they sell on eBay to the listed organisations or simply buy items that direct cash to those in the list.The web is helping aid agencies gather resources to help cope with the aftermath of the tsunami disaster.Auction site eBay is giving a list of sites that people can either donate directly to, divert a portion of their profits from anything they sell on eBay to the listed organisations or simply buy items that direct cash to those in the list.Some of the web's biggest firms are also helping to channel help


In [33]:
# Load an article and its original summary from dataset
sample_article = dataset_train[0]['Articles']
original_summary = dataset_train[0]['Summaries']

# Print original summary
print(f"Original Summary: {original_summary}\n")

# Tokenize the article
input_ids = tokenizer(sample_article, return_tensors="pt").input_ids

# Generate summary using T5 model
output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print generated summary
print(f"Generated Summary: {generated_summary}\n")

# Compare using ROUGE Score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(original_summary, generated_summary)

print("ROUGE Scores:", scores)



Generated Summary: many of the sites that Google lists are also taking online donations.Auction site eBay is giving a list of sites that people can either donate directly to, divert a portion of their profits from anything they sell on eBay to the listed organisations or simply buy items that direct cash to those in the list.The web is helping aid agencies gather resources to help cope with the aftermath of the tsunami disaster.Auction site eBay is giving a list of sites that people can either donate directly to, divert a portion of their profits from anything they sell on eBay to the listed organisations or simply buy items that direct cash to those in the list.Some of the web's biggest firms are also helping to channel help

ROUGE Scores: {'rouge1': Score(precision=0.6717557251908397, recall=0.43564356435643564, fmeasure=0.5285285285285286), 'rouge2': Score(precision=0.45384615384615384, recall=0.2935323383084577, fmeasure=0.3564954682779456), 'rougeL': Score(precision=0.53435114503

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [37]:
def cosine_similarity_text(text1, text2):
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1] * 100  # Convert to percentage

similarity_cosine = cosine_similarity_text(original_summary, generated_summary)
print(f"Cosine Similarity: {similarity_cosine:.2f}%")


Cosine Similarity: 73.75%


In [38]:
# Load an article and its original summary from dataset
sample_article = dataset_train[2]['Articles']
original_summary = dataset_train[2]['Summaries']

# Print original summary
print(f"Original Summary: {original_summary}\n")

# Tokenize the article
input_ids = tokenizer(sample_article, return_tensors="pt").input_ids

# Generate summary using T5 model
output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print generated summary
print(f"Generated Summary: {generated_summary}\n")

# Compare using ROUGE Score
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(original_summary, generated_summary)

print("ROUGE Scores:", scores)


Original Summary: "He had lost three times to Roddick, and this was his day to beat him."Some people have said that I am obsessed but I think that it is better this way.Spain's victory was also remarkable for the performance of Rafael Nadal, who beat Roddick in the opening singles.Carlos Moya described Spain's Davis Cup victory as the highlight of his career after he beat Andy Roddick to end the USA's challenge in Seville.Spain's only other Davis Cup title came two years ago in Valencia, when they beat Australia.And Moya, nicknamed Charly, admitted: "The Davis Cup is my dream and I was a bit nervous at the outset."What a great way to finish the year," said Nadal afterwards."But certainly I think we can put the work in at the appropriate time and play a couple more events and play against these guys who are the best on this stuff," said McEnroe.Roddick was left frustrated after losing both his singles on the slow clay of Seville's Olympic Stadium."It's just tough because I felt like I w

In [39]:
def cosine_similarity_text(text1, text2):
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1] * 100  # Convert to percentage

similarity_cosine = cosine_similarity_text(original_summary, generated_summary)
print(f"Cosine Similarity: {similarity_cosine:.2f}%")


Cosine Similarity: 80.02%


In [41]:
inputt_text = """First: Plant Classification  
Data Preparation 
1. Setting Constants
image_size: Defines the target size to which images will be resized (224x224 pixels).
batch_size: The number of images to process at once during training. (64 batches)
2. Loading Data
This function loads images and their corresponding labels from the specified directory.
3. Normalizing Images
The images are normalized by dividing each pixel's value by 255.0, converting the pixel values from 
the range [0, 255] to [0, 1]. This step helps improve the convergence during model training.
4. Splitting the Data into Train and Validation Sets
• This splits the loaded data into training and validation sets, with 80% of the data used for 
training and 20% for validation 
7. Label Encoding
• The LabelEncoder is used to convert text labels into integer labels 
8. Class Weight Calculation
• The class weights are computed to handle class imbalance. The compute_class_weight 
function calculates weights inversely proportional to class frequencies.
10. Data Augmentation
We apply augmentation only in MobileNet, VGG, AlexNet, as we don't apply augmentation in 
ViT. The train_datagen applies several data augmentation techniques to the training images, 
including: Random rotation (rotation_range), width/height shifting, shearing, zooming, and flipping 
the images horizontally. val_datagen doesn't apply any augmentation
These augmentations help improve model generalization by providing more varied input data.
11. Data Generators
train_generator and validation_generator and testing_generator are instances of 
ImageDataGenerator that yield batches of images and labels during training and validation, 
respectively
Vision Transformer (ViT)
Architecture
• Divides input images into fixed-size non-overlapping patches (e.g., 16×1616 \times 
1616x16).
• Converts patches into 1D vector embeddings via a linear projection.
• Adds positional embeddings to preserve spatial relationships.
• Processes embeddings using Transformer encoder layers (self-attention + feedforward 
networks).
• Uses a learnable [CLS] token for classification.
• Final output is passed to a classification head for tasks like image classification.
Advantages
• Scalability: Performs better with larger datasets.
• Global Context: Captures global relationships across the image.
• Flexibility: Can adapt to multi-modal tasks beyond vision (e.g., vision + text).
• Reduced Inductive Bias: Learns more adaptively compared to CNNs.
• Improved Performance: Outperforms CNNs on benchmarks when pre-trained on large 
datasets.
• Parallelization: Faster training due to sequence-level parallel processing.
• Transfer Learning: Pre-trained ViTs generalize well to other tasks.
Challenges
• Data Requirements: Needs large-scale datasets for effective training.
• Computational Cost: High memory and computation demands due to quadratic self attention complexity.
• Overfitting: Prone to overfitting on smaller datasets.
• Interpretability: Harder to interpret learned features compared to CNNs.MobileNet
MobileNet is a lightweight deep learning model designed for mobile and embedded devices, 
prioritizing efficiency and speed. It uses depthwise separable convolutions to reduce the number of 
parameters and computations. This architecture is well-suited for tasks like image classification 
and object detection on resource-constrained devices. Despite its simplicity, it achieves 
competitive accuracy compared to larger models.
Architecture:
Input: Images of size 224x224x3 (RGB).
Base Model:
• MobileNet (pre-trained on ImageNet, without the top classification layers).
• Lightweight and efficient architecture, designed with depthwise separable convolutions for 
reduced computational complexity.
• Base model layers are frozen (not trainable).
Custom Layers:
• Global Average Pooling (GAP): Reduces the spatial dimensions of the feature maps to a 
single vector for each channel, summarizing the spatial information globally.
• Dense Layer: Fully connected layer with 1024 units and ReLU activation.
• Dropout Layer: Dropout with a rate of 0.5 to reduce overfitting.
• Output Layer: Dense layer with num_classes units and softmax activation for classification.
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
Output: Class probabilities for the given number of output classes 
 Best for Resource-Constrained Devices: MobileNet
• Why: MobileNet is optimized for efficiency and speed, making it ideal for mobile and 
embedded devices. Despite its smaller size, it delivers competitive performance on tasks 
like image classification and object detection.VGG16
VGG is a deep convolutional neural network known for its simplicity and uniform architecture, 
consisting of sequential 3x3 convolutional layers followed by fully connected layers. It comes in 
variations like VGG-16 and VGG-19, named for the number of layers. VGG models are 
computationally expensive but deliver high accuracy in image classification. Their deep and 
uniform structure has influenced the design of many subsequent models.
Architecture:
Input:
• Images of size 224x224x3 (RGB).
Base Model:
• VGG16 (pre-trained on ImageNet, without the top classification layers).
• Contains 13 convolutional layers grouped into 5 blocks, each followed by max-pooling 
layers for feature extraction.
Custom Layers:
• Flatten: Converts feature maps from VGG16 into a 1D vector.
• Dense Layer 1: Fully connected layer with 4096 units and ReLU activation.
• Dropout Layer 1: Dropout with a rate of 0.5 to reduce overfitting.
• Dense Layer 2: Fully connected layer with 4096 units and ReLU activation.
• Dropout Layer 2: Another dropout with a rate of 0.5.
• Output Layer: Dense layer with num_classes units and softmax activation for classification.
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
 Output: Class probabilities for the given number of output classes 
Best for High Accuracy on Large Datasets: VGG
• Why: VGG models, particularly VGG-16 and VGG-19, provide high accuracy due to their 
deeper architecture and consistent design. They are well-suited for applications requiring 
precise feature extraction.
U-Net Model
U-Net is a convolutional neural network architecture specifically designed for biomedical image 
segmentation. It has a symmetrical encoder-decoder structure, where the encoder extracts 
features, and the decoder reconstructs the image with segmentation masks. Skip connections link 
corresponding layers in the encoder and decoder to preserve spatial information. U-Net is highly 
efficient and performs well on small datasets, making it a popular choice in medical imaging tasks.
Architecture
Define U-Net Blocks:
• Implemented a convolutional block (conv_block) that includes two convolutional layers 
with ReLU activation, kernel initialization, and dropout for regularization.
• Created an upsampling block (upsample_block) using transposed convolution for 
upsampling and concatenation of features from previous layers.
Contracting Path:
• Used sequential convolutional blocks (conv_block) and max-pooling layers to reduce 
spatial dimensions while increasing the number of feature channels:
• Encoder: Extracts and compresses features from the input (downsampling).
Expanding Path:
• Applied upsampling blocks to reconstruct spatial dimensions and combine features from 
the contracting path:
• Decoder: Reconstructs the spatial dimensions and combines extracted features 
• These stages are connected by the bottleneck layer (c5), which acts as the transition point 
between the encoder and decoder.
Output Layer:
• Added a final convolutional layer with 1 filter and sigmoid activation to produce a 
probability map for binary segmentation.
• Model Training:
• Defined callbacks for early stopping and saving the best model:
o EarlyStopping monitored validation loss with a patience of 5 epochs.
o ModelCheckpoint saved the best model during training.
Model Saving:
• Saved the trained model in HDF5 format (model.h5).
SAM Model
SAM is based on a foundation of transformer models, leveraging the power of attention 
mechanisms to learn spatial relationships within images for precise segmentation. SAM uses a 
vision transformer (ViT) as its backbone. Vision transformers have self-attention mechanisms that 
allow the model to capture long-range dependencies between pixels.
Architecture:
The main parts of the SAM architecture include:
• Backbone (Vision Transformer - ViT): This is the core architecture of SAM, where image 
features are extracted.
• Prompt Encoder: This component processes the different types of input prompts (points, 
boxes, and masks) to guide the segmentation.
• Segmentation Decoder: This part decodes the model’s predictions into final segmentation 
masks.
Dice Loss Advantages:
• Handling Imbalanced Data: Dice Loss is particularly useful when the dataset is 
imbalanced.
Second: Plant Disease Recognition
Siamese Architecture: A neural network designed to determine the similarity or dissimilarity 
between two inputs.
Twin Networks: Consists of two identical sub-networks that share the same weights and 
parameters.
Shared Weights: Both sub-networks learn the same features from the input data, ensuring 
consistent comparisons.
Distance Metric: Outputs (feature vectors) from the sub-networks are compared using a 
distance metric like Euclidean distance or cosine similarity.
Training: Network is trained with pairs of images labeled as similar or dissimilar, adjusting 
parameters to bring similar images closer and dissimilar ones farther apart.
Application: Commonly used in tasks such as plant recognition or image matching where 
pairwise comparisons are necessary
Advantages of One-shot Learning in Plant Recognition:
• Reduced Data Requirements: Recognizes plant species with just one image per species, 
reducing the need for large labeled datasets.
• Generalization: Effectively generalizes to new, unseen plant species, especially with 
models like Siamese or Prototypical Networks.
AlexNet
AlexNet is a pioneering deep learning model that popularized convolutional neural networks in the 
2012 ImageNet competition. It uses five convolutional layers, followed by three fully connected 
layers, and employs techniques like ReLU activation, dropout, and data augmentation. AlexNet 
significantly reduced error rates at the time and laid the foundation for modern deep learning in 
computer vision.
Architecture:
Input:
• Accepts images of size 224x224x3 (RGB).
Feature Extraction (Convolutional and Pooling Layers):
• 5 convolutional layers: filters with ReLU activation.Followed by MaxPooling
Flatten and Dense Layers:
• Flatten: Converts the extracted features into a 1D vector.
• Dense Layer 1 & Dense Layer 2: 4096 units, with ReLU activation.Followed by Dropout (rate 
0.5) to reduce overfitting.
Output Layer: A dense layer with num_classes units and softmax activation
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
Output: Produces class probabilities for classification tasks 
Worst Model: Context Matters
o Why: While AlexNet was groundbreaking in 2012, its architecture is now 
considered outdated compared to more efficient and deeper models like VGG and 
MobileNet. It has fewer layers, lower accuracy, and lacks optimizations like 
depthwise separable convolutions.
o Drawback: Inefficiencies and limitations make it less competitive in scenarios 
where computational resources and accuracy are critical

"""
input_ids = tokenizer(inputt_text, return_tensors="pt").input_ids

output_ids = model.generate(input_ids, max_length=500, num_beams=4, early_stopping=True)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Summary:", summary)


Summary: a pixel value by 255.0, converting the pixel values from the range [0, 1] to [0, 1]. Output: Class probabilities for the given number of output classes Best for Resource-Constrained Devices: VGG • Why: VGG is a lightweight deep learning model designed for mobile and embedded devices, prioritizing efficiency and speed. Base Model: • VGG16 (pre-trained on ImageNet, without the top classification layers). Output Layer: A dense layer with num_classes units and softmax activation.


In [42]:
print(tokenizer.model_max_length)  # Shows max tokens supported

1000000000000000019884624838656

1000000000000000019884624838656


In [44]:
print(tokenizer.model_max_length)
print(model.config)


1000000000000000019884624838656
T5Config {
  "_attn_implementation_autoset": true,
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_sto

In [45]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model_name = "t5-small"  # You can also try "t5-base" or "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Function to split text into chunks of 512 tokens
def chunk_text(text, tokenizer, chunk_size=512):
    tokens = tokenizer.encode(text, truncation=False)  # Tokenize without truncation
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# Function to summarize each chunk and combine them
def summarize_long_text(text, tokenizer, model):
    chunks = chunk_text(text, tokenizer)

    summaries = []
    for chunk in chunks:
        input_ids = torch.tensor([chunk])  # Convert chunk to tensor
        output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    final_summary = " ".join(summaries)  # Merge all summaries
    return final_summary

# Example long text (Replace with your actual text)
long_text =  """First: Plant Classification  
Data Preparation 
1. Setting Constants
image_size: Defines the target size to which images will be resized (224x224 pixels).
batch_size: The number of images to process at once during training. (64 batches)
2. Loading Data
This function loads images and their corresponding labels from the specified directory.
3. Normalizing Images
The images are normalized by dividing each pixel's value by 255.0, converting the pixel values from 
the range [0, 255] to [0, 1]. This step helps improve the convergence during model training.
4. Splitting the Data into Train and Validation Sets
• This splits the loaded data into training and validation sets, with 80% of the data used for 
training and 20% for validation 
7. Label Encoding
• The LabelEncoder is used to convert text labels into integer labels 
8. Class Weight Calculation
• The class weights are computed to handle class imbalance. The compute_class_weight 
function calculates weights inversely proportional to class frequencies.
10. Data Augmentation
We apply augmentation only in MobileNet, VGG, AlexNet, as we don't apply augmentation in 
ViT. The train_datagen applies several data augmentation techniques to the training images, 
including: Random rotation (rotation_range), width/height shifting, shearing, zooming, and flipping 
the images horizontally. val_datagen doesn't apply any augmentation
These augmentations help improve model generalization by providing more varied input data.
11. Data Generators
train_generator and validation_generator and testing_generator are instances of 
ImageDataGenerator that yield batches of images and labels during training and validation, 
respectively
Vision Transformer (ViT)
Architecture
• Divides input images into fixed-size non-overlapping patches (e.g., 16×1616 \times 
1616x16).
• Converts patches into 1D vector embeddings via a linear projection.
• Adds positional embeddings to preserve spatial relationships.
• Processes embeddings using Transformer encoder layers (self-attention + feedforward 
networks).
• Uses a learnable [CLS] token for classification.
• Final output is passed to a classification head for tasks like image classification.
Advantages
• Scalability: Performs better with larger datasets.
• Global Context: Captures global relationships across the image.
• Flexibility: Can adapt to multi-modal tasks beyond vision (e.g., vision + text).
• Reduced Inductive Bias: Learns more adaptively compared to CNNs.
• Improved Performance: Outperforms CNNs on benchmarks when pre-trained on large 
datasets.
• Parallelization: Faster training due to sequence-level parallel processing.
• Transfer Learning: Pre-trained ViTs generalize well to other tasks.
Challenges
• Data Requirements: Needs large-scale datasets for effective training.
• Computational Cost: High memory and computation demands due to quadratic self attention complexity.
• Overfitting: Prone to overfitting on smaller datasets.
• Interpretability: Harder to interpret learned features compared to CNNs.MobileNet
MobileNet is a lightweight deep learning model designed for mobile and embedded devices, 
prioritizing efficiency and speed. It uses depthwise separable convolutions to reduce the number of 
parameters and computations. This architecture is well-suited for tasks like image classification 
and object detection on resource-constrained devices. Despite its simplicity, it achieves 
competitive accuracy compared to larger models.
Architecture:
Input: Images of size 224x224x3 (RGB).
Base Model:
• MobileNet (pre-trained on ImageNet, without the top classification layers).
• Lightweight and efficient architecture, designed with depthwise separable convolutions for 
reduced computational complexity.
• Base model layers are frozen (not trainable).
Custom Layers:
• Global Average Pooling (GAP): Reduces the spatial dimensions of the feature maps to a 
single vector for each channel, summarizing the spatial information globally.
• Dense Layer: Fully connected layer with 1024 units and ReLU activation.
• Dropout Layer: Dropout with a rate of 0.5 to reduce overfitting.
• Output Layer: Dense layer with num_classes units and softmax activation for classification.
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
Output: Class probabilities for the given number of output classes 
 Best for Resource-Constrained Devices: MobileNet
• Why: MobileNet is optimized for efficiency and speed, making it ideal for mobile and 
embedded devices. Despite its smaller size, it delivers competitive performance on tasks 
like image classification and object detection.VGG16
VGG is a deep convolutional neural network known for its simplicity and uniform architecture, 
consisting of sequential 3x3 convolutional layers followed by fully connected layers. It comes in 
variations like VGG-16 and VGG-19, named for the number of layers. VGG models are 
computationally expensive but deliver high accuracy in image classification. Their deep and 
uniform structure has influenced the design of many subsequent models.
Architecture:
Input:
• Images of size 224x224x3 (RGB).
Base Model:
• VGG16 (pre-trained on ImageNet, without the top classification layers).
• Contains 13 convolutional layers grouped into 5 blocks, each followed by max-pooling 
layers for feature extraction.
Custom Layers:
• Flatten: Converts feature maps from VGG16 into a 1D vector.
• Dense Layer 1: Fully connected layer with 4096 units and ReLU activation.
• Dropout Layer 1: Dropout with a rate of 0.5 to reduce overfitting.
• Dense Layer 2: Fully connected layer with 4096 units and ReLU activation.
• Dropout Layer 2: Another dropout with a rate of 0.5.
• Output Layer: Dense layer with num_classes units and softmax activation for classification.
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
 Output: Class probabilities for the given number of output classes 
Best for High Accuracy on Large Datasets: VGG
• Why: VGG models, particularly VGG-16 and VGG-19, provide high accuracy due to their 
deeper architecture and consistent design. They are well-suited for applications requiring 
precise feature extraction.
U-Net Model
U-Net is a convolutional neural network architecture specifically designed for biomedical image 
segmentation. It has a symmetrical encoder-decoder structure, where the encoder extracts 
features, and the decoder reconstructs the image with segmentation masks. Skip connections link 
corresponding layers in the encoder and decoder to preserve spatial information. U-Net is highly 
efficient and performs well on small datasets, making it a popular choice in medical imaging tasks.
Architecture
Define U-Net Blocks:
• Implemented a convolutional block (conv_block) that includes two convolutional layers 
with ReLU activation, kernel initialization, and dropout for regularization.
• Created an upsampling block (upsample_block) using transposed convolution for 
upsampling and concatenation of features from previous layers.
Contracting Path:
• Used sequential convolutional blocks (conv_block) and max-pooling layers to reduce 
spatial dimensions while increasing the number of feature channels:
• Encoder: Extracts and compresses features from the input (downsampling).
Expanding Path:
• Applied upsampling blocks to reconstruct spatial dimensions and combine features from 
the contracting path:
• Decoder: Reconstructs the spatial dimensions and combines extracted features 
• These stages are connected by the bottleneck layer (c5), which acts as the transition point 
between the encoder and decoder.
Output Layer:
• Added a final convolutional layer with 1 filter and sigmoid activation to produce a 
probability map for binary segmentation.
• Model Training:
• Defined callbacks for early stopping and saving the best model:
o EarlyStopping monitored validation loss with a patience of 5 epochs.
o ModelCheckpoint saved the best model during training.
Model Saving:
• Saved the trained model in HDF5 format (model.h5).
SAM Model
SAM is based on a foundation of transformer models, leveraging the power of attention 
mechanisms to learn spatial relationships within images for precise segmentation. SAM uses a 
vision transformer (ViT) as its backbone. Vision transformers have self-attention mechanisms that 
allow the model to capture long-range dependencies between pixels.
Architecture:
The main parts of the SAM architecture include:
• Backbone (Vision Transformer - ViT): This is the core architecture of SAM, where image 
features are extracted.
• Prompt Encoder: This component processes the different types of input prompts (points, 
boxes, and masks) to guide the segmentation.
• Segmentation Decoder: This part decodes the model’s predictions into final segmentation 
masks.
Dice Loss Advantages:
• Handling Imbalanced Data: Dice Loss is particularly useful when the dataset is 
imbalanced.
Second: Plant Disease Recognition
Siamese Architecture: A neural network designed to determine the similarity or dissimilarity 
between two inputs.
Twin Networks: Consists of two identical sub-networks that share the same weights and 
parameters.
Shared Weights: Both sub-networks learn the same features from the input data, ensuring 
consistent comparisons.
Distance Metric: Outputs (feature vectors) from the sub-networks are compared using a 
distance metric like Euclidean distance or cosine similarity.
Training: Network is trained with pairs of images labeled as similar or dissimilar, adjusting 
parameters to bring similar images closer and dissimilar ones farther apart.
Application: Commonly used in tasks such as plant recognition or image matching where 
pairwise comparisons are necessary
Advantages of One-shot Learning in Plant Recognition:
• Reduced Data Requirements: Recognizes plant species with just one image per species, 
reducing the need for large labeled datasets.
• Generalization: Effectively generalizes to new, unseen plant species, especially with 
models like Siamese or Prototypical Networks.
AlexNet
AlexNet is a pioneering deep learning model that popularized convolutional neural networks in the 
2012 ImageNet competition. It uses five convolutional layers, followed by three fully connected 
layers, and employs techniques like ReLU activation, dropout, and data augmentation. AlexNet 
significantly reduced error rates at the time and laid the foundation for modern deep learning in 
computer vision.
Architecture:
Input:
• Accepts images of size 224x224x3 (RGB).
Feature Extraction (Convolutional and Pooling Layers):
• 5 convolutional layers: filters with ReLU activation.Followed by MaxPooling
Flatten and Dense Layers:
• Flatten: Converts the extracted features into a 1D vector.
• Dense Layer 1 & Dense Layer 2: 4096 units, with ReLU activation.Followed by Dropout (rate 
0.5) to reduce overfitting.
Output Layer: A dense layer with num_classes units and softmax activation
Optimization:
• Uses Adam optimizer, categorical cross-entropy loss, and accuracy as a performance 
metric.
Output: Produces class probabilities for classification tasks 
Worst Model: Context Matters
o Why: While AlexNet was groundbreaking in 2012, its architecture is now 
considered outdated compared to more efficient and deeper models like VGG and 
MobileNet. It has fewer layers, lower accuracy, and lacks optimizations like 
depthwise separable convolutions.
o Drawback: Inefficiencies and limitations make it less competitive in scenarios 
where computational resources and accuracy are critical

"""

# Get final summary
summary = summarize_long_text(long_text, tokenizer, model)

print("Final Summary:")
print(summary)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2665 > 512). Running this sequence through the model will result in indexing errors


Final Summary:
training and validation, with 80% of the data used for training and validation. 20% for validation 7. Label Encoding • The LabelEncoder is used to convert text labels into integer labels 8. Class Weight Calculation • The class weights are computed to handle class imbalance. Advantages • Scalability: Performs better with larger datasets. . • Parallelization: Faster training due to sequence-level parallel processing. • Parallelization: Faster training due to sequence-level parallel processing. • Parallelization: Faster training due to sequence-level parallel processing. • Parallelization: Faster training due to sequence-level parallel processing. • Parallelization: Faster training due to sequence-level parallel processing. • Transfer Learning: Can adapt to multi-modal tasks beyond vision (e.g., vision + text). • Parallelization: Faster training due to sequence-level parallel layer extraction. • Output Layer 1: Dropout with a rate of 0.5 to reduce overfitting. • Output Laye