\

\

In [None]:
!pip install transformers datasets torch wandb



In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

# Tunables for this cell.
SPLIT_SEED = 42          # fixed so the train/eval split is identical on every run
MAX_SOURCE_TOKENS = 512  # truncation limit for the email body
MAX_TARGET_TOKENS = 128  # truncation limit for the reference summary

# Load and clean the CSV file
data = pd.read_csv("/content/dataset.csv")
data["Description"] = data["Description"].fillna("").astype(str)
data["Summary"] = data["Summary"].fillna("").astype(str)

# A pair with a blank description or a blank summary carries no training
# signal (an empty target teaches the model to emit nothing), so such rows are
# excluded before tokenization.
has_text = data["Description"].str.strip().ne("") & data["Summary"].str.strip().ne("")
train_rows = data.loc[has_text].reset_index(drop=True)
if train_rows.empty:
    raise ValueError("No rows with both a non-empty Description and Summary were found.")

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Tokenize the data. No padding here: DataCollatorForSeq2Seq (below) pads each
# batch to its longest member, which avoids processing 512 tokens for every
# short email.
train_encodings = tokenizer(
    train_rows["Description"].tolist(),
    truncation=True,
    max_length=MAX_SOURCE_TOKENS
)

summary_encodings = tokenizer(
    train_rows["Summary"].tolist(),
    truncation=True,
    max_length=MAX_TARGET_TOKENS
)

# Create a Hugging Face Dataset. Labels are left unpadded; the collator pads
# them with -100 so padded positions are ignored by the loss.
dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': summary_encodings['input_ids']
})

# Split the dataset (seeded, so validation losses are comparable between runs)
split_datasets = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Load the model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Data collator: pads inputs and labels per batch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,1.254,1.23218
2,0.8977,1.32651
3,0.5703,1.344651


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=153, training_loss=1.0241728327632729, metrics={'train_runtime': 550.2521, 'train_samples_per_second': 1.101, 'train_steps_per_second': 0.278, 'total_flos': 656632694439936.0, 'train_loss': 1.0241728327632729, 'epoch': 3.0})

ImportError: cannot import name 'load_metric' from 'datasets' (/usr/local/lib/python3.10/dist-packages/datasets/__init__.py)

In [None]:
# Score the model currently held by the trainer on eval_dataset.
# TrainingArguments sets load_best_model_at_end=True, so this presumably
# reflects the best checkpoint rather than the final epoch (the recorded
# eval_loss matches the epoch-1 validation loss) -- confirm if it matters.
trainer.evaluate()

{'eval_loss': 1.2321795225143433,
 'eval_runtime': 1.7904,
 'eval_samples_per_second': 12.846,
 'eval_steps_per_second': 3.351,
 'epoch': 3.0}

In [None]:
# Write the fine-tuned weights and the tokenizer files into one directory so
# that both can later be restored with from_pretrained() from the same path.
export_dir = './fine_tuned_bart_email_summarization'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./fine_tuned_bart_email_summarization/tokenizer_config.json',
 './fine_tuned_bart_email_summarization/special_tokens_map.json',
 './fine_tuned_bart_email_summarization/vocab.json',
 './fine_tuned_bart_email_summarization/merges.txt',
 './fine_tuned_bart_email_summarization/added_tokens.json')

In [None]:

# Reload the fine-tuned model and tokenizer from disk so this cell exercises
# the saved artefacts rather than the in-memory training objects.
model = BartForConditionalGeneration.from_pretrained('./fine_tuned_bart_email_summarization')
tokenizer = BartTokenizer.from_pretrained('./fine_tuned_bart_email_summarization')


test_input = """
Dear Movie Fan, Relive the magic of cinema with our Classic Movie Marathons. Get your tickets for a back-to-back screening of your favorite old classics. Special discounts for group bookings! Regards, Cinema Promotions Team

"""
# Truncate to 512 tokens, the same source limit used when the training data
# was tokenized, so inference sees inputs shaped like the training inputs.
inputs = tokenizer(test_input, return_tensors="pt", truncation=True, max_length=512, padding=True)

# Pass the attention mask explicitly so padded positions (if any) are ignored
# instead of relying on generate() to infer the mask.
summary_ids = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=128,
    num_beams=4,
    early_stopping=True,
)
print("Summary:", tokenizer.decode(summary_ids[0], skip_special_tokens=True))

Summary: Announcement of movie marathons with group discounts.


In [None]:
# !pip install evaluate
# !pip install rouge_score
from transformers import BartForConditionalGeneration, BartTokenizer
import evaluate

# Reload the fine-tuned model and tokenizer from the directory they were
# saved to with save_pretrained().
model = BartForConditionalGeneration.from_pretrained('./fine_tuned_bart_email_summarization')
tokenizer = BartTokenizer.from_pretrained('./fine_tuned_bart_email_summarization')

# Single long email used as the input for the ROUGE check below.
test_input = """
 Dear all,

Instructions for Report Writing
I attach the mini-project certificates+toc and body separately.
You need to do them separately then merge the pdfs together. Use MS Word.

The report needs to be typed out by yourself from top to bottom. No copy pasting. Put installation instructions and syntax of Opengl Functions used in your project (the important ones) in the appendix.  In case of IP the syntax of OpenCV functions.

You need to get signatures from both Lab faculty members and HoD before bringing the report to externals.
 Show the draft copy during IaT week and get the corrections done. Hard deadline to get signatures: 31.07.2024

Mini projects review and report  (Show mini project review on or before 27.07.2024)
- CG projects -C1,C2, B3 - show to me (Preethi mam)
- IP projects - C1,C2 - show to Rajni mam, B3 -show to me (Preethi mam)

Report signatures should be taken from both the faculty members in the lab.

CG IP REGISTRATIONS


C1,C2 Batch
Dr. Preethi Sheba Hepsiba, Associate Professor,  Dept. of CSE
Prof. Sreedevi N., Assistant Professor, Dept. of CSE

B3 batch
Dr. Preethi Sheba Hepsiba, Associate Professor,  Dept. of CSE
Prof. Rajni Tiwari, Assistant Professor, Dept. of CSE


Number of copies: 1 copy per student

Cream color sheet, spiral binding, one sided printout in A4 sheet. Color printout for screenshots and front sheets.

Organization of report IP
1 INTRODUCTION
1.1 Problem Statement
1.2 Objectives
1.3 OpenCV Package
1.4 Jupyter Notebook

2 DESIGN
2.1 Dataflow / Flowchart
2.2 Component Design
- explain how API's are linked

3 IMPLEMENTATION
3.1 Dataset
3.2 Image Preprocessing
3.3 ...

4 CONCLUSION
 - Mention individual contribution and takeaways
 - what can be done further.
Organization of report CG

1 INTRODUCTION
1.1 Problem Statement
1.2 Functionalities
1.3 OpenGL Package
1.4 CodeBlocks IDE
  If you installed in VS Code or another IDE, then about that.
Installation instructions you can put in appendix.

2 DESIGN
2.1 Conceptual Design
2.2 Flowchart
2.3 Storyboard
2.4

3 IMPLEMENTATION
3. 1 Initialization
3.2 Modeling
3.3 Viewing
3.4 Interactions
Keyboard, Mouse, Menu
3.5 Window Management
"""
# Hand-written target summary that the generated summary is scored against.
reference_summary = "Educational - Announcement about the details of report having installation instructions and syntax of OpenCV functions."

# Truncate to 512 tokens, the same source limit used when the training data
# was tokenized, so inference sees inputs shaped like the training inputs.
inputs = tokenizer(test_input, return_tensors="pt", truncation=True, max_length=512, padding=True)

# Pass the attention mask explicitly so padded positions (if any) are ignored
# instead of relying on generate() to infer the mask.
summary_ids = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=128,
    num_beams=4,
    early_stopping=True,
)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:", generated_summary)

# ROUGE measures n-gram / longest-common-subsequence overlap between the
# generated text and the reference.
rouge = evaluate.load("rouge")

rouge_scores = rouge.compute(
    predictions=[generated_summary],
    references=[reference_summary]
)

print("ROUGE Scores:")
for key, value in rouge_scores.items():
    print(f"{key}: {value:.4f}")

# Highest of the three ROUGE scores for this ONE example. This is an overlap
# score, not an accuracy, and a single example says little about overall
# quality. The variable keeps its original name so later cells that might
# read it keep working; only the printed label is corrected.
final_accuracy = max(rouge_scores["rouge1"], rouge_scores["rouge2"], rouge_scores["rougeL"])
print(f"Best ROUGE score (single example): {final_accuracy:.4f}")


Generated Summary: Educational - Announcement about the details of the report, including installation instructions and syntax of OpenCV functions.
ROUGE Scores:
rouge1: 0.9032
rouge2: 0.7586
rougeL: 0.9032
rougeLsum: 0.9032
Final Accuracy: 0.9032


In [None]:
import evaluate

# Load ROUGE metric.
# `load_metric` is no longer exported by the installed `datasets` package
# (this cell previously failed with ImportError); the metric is loaded through
# the `evaluate` package instead, which the notebook already uses elsewhere.
rouge_metric = evaluate.load("rouge")


def _as_fmeasure(score):
    """Return the F-measure from a ROUGE result value.

    Supports both result shapes: an aggregate object exposing
    ``.mid.fmeasure`` and a plain numeric score.
    """
    if hasattr(score, "mid"):
        return float(score.mid.fmeasure)
    return float(score)


# Function to calculate metrics
def calculate_metrics(reference, prediction):
    """Compute ROUGE scores for one prediction/reference pair.

    Args:
        reference: The reference (target) summary text.
        prediction: The generated summary text.

    Returns:
        dict mapping each ROUGE key returned by the metric to its score
        scaled to a 0-100 percentage.
    """
    # Use the ROUGE metric
    results = rouge_metric.compute(
        predictions=[prediction],
        references=[reference],
        use_stemmer=True
    )
    # Extract and format ROUGE scores as percentages
    formatted_scores = {key: _as_fmeasure(value) * 100 for key, value in results.items()}
    return formatted_scores


ImportError: cannot import name 'load_metric' from 'datasets' (/usr/local/lib/python3.10/dist-packages/datasets/__init__.py)

  adding: fine_tuned_bart/ (stored 0%)
  adding: fine_tuned_bart/tokenizer_config.json (deflated 76%)
  adding: fine_tuned_bart/special_tokens_map.json (deflated 85%)
  adding: fine_tuned_bart/merges.txt (deflated 53%)
  adding: fine_tuned_bart/vocab.json (deflated 68%)
  adding: fine_tuned_bart/model.safetensors (deflated 7%)
  adding: fine_tuned_bart/config.json (deflated 63%)
  adding: fine_tuned_bart/generation_config.json (deflated 47%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Load the email/summary table from disk.
data = pd.read_csv("/content/dataset.csv")

# Normalise both text columns: missing values become "" and every value is a str.
for text_column in ("Description", "Summary"):
    data[text_column] = data[text_column].fillna("").astype(str)

# An entry counts as available when it is non-empty after stripping whitespace.
description_count = (data["Description"].str.strip() != "").sum()
summary_count = (data["Summary"].str.strip() != "").sum()

description_count, summary_count


(211, 209)