<a href="https://colab.research.google.com/github/GinuraRansika/kcroz-ml-part/blob/main/kcroz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install onnx
!pip install onnx-tf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m107.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m52.4 MB/s[0m et

In [2]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Download the NLTK "punkt" tokenizer data (the saved output shows tokenizers/punkt.zip being unpacked).
# NOTE(review): presumably required by the imported sent_tokenize — confirm it is still used.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from datasets import load_dataset

In [4]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hub identifier of the pretrained checkpoint that is fine-tuned below.
model_ckpt = "google/pegasus-cnn_dailymail"

# Tokenizer and seq2seq model are both loaded from the same checkpoint;
# the model is then moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [5]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds ``batch_size`` items, in the original order; the last
    slice is shorter when the input length is not a multiple of ``batch_size``.
    """
    total = len(list_of_elements)
    yield from (
        list_of_elements[start : start + batch_size]
        for start in range(0, total, batch_size)
    )

def calculate_metric_on_test_ds(dataset,
                                metric,
                                model,
                                tokenizer,
                                batch_size=16,
                                device=device,
                                column_text="article",
                                column_summary="highlights"):
    """Generate summaries for ``dataset`` batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are split into batches.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model whose ``generate`` method produces the summary token ids.
        tokenizer: Tokenizer used to encode the source texts and decode the output.
        batch_size: Number of examples per batch.
        device: Device the encoded inputs are moved to before generation.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` once every batch was added.
    """
    source_chunks = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    reference_chunks = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    paired_chunks = zip(source_chunks, reference_chunks)
    for source_chunk, reference_chunk in tqdm(paired_chunks, total=len(source_chunks)):
        # Each text is truncated to at most 1024 tokens and padded up to exactly 1024.
        encoded = tokenizer(source_chunk, max_length=1024, truncation=True,
                            padding="max_length", return_tensors="pt")
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        # 8-beam search capped at 128 generated tokens; per the original note,
        # length_penalty is there to keep the model from producing overly long sequences.
        generated = model.generate(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated ids, replace the "<n>" token with a space, and
        # hand predictions plus references to the metric.
        predictions = [
            tokenizer.decode(token_ids, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True).replace("<n>", " ")
            for token_ids in generated
        ]
        metric.add_batch(predictions=predictions, references=reference_chunk)

    # All batches added: compute and return the final scores.
    return metric.compute()

In [6]:
# Fetch the CNN/DailyMail corpus, configuration "3.0.0" (train / validation / test splits).
dataset_dailymail = load_dataset("cnn_dailymail", "3.0.0")
# Number of examples in each split, in the order the splits are stored.
split_lengths = [len(split_ds) for split_ds in dataset_dailymail.values()]

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Overview of the corpus: size of every split and the column names of the train split.
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_dailymail['train'].column_names}")
# Show the first test example: the full article followed by its reference highlights.
print("\nArticle:")

print(dataset_dailymail["test"][0]["article"])

print("\nHighlights:")

print(dataset_dailymail["test"][0]["highlights"])

Split lengths: [287113, 13368, 11490]
Features: ['article', 'highlights', 'id']

Article:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the 

In [8]:
# Summarization pipeline built from the checkpoint *name*, not from the already-loaded model_pegasus.
# NOTE(review): presumably this instantiates a second copy of the weights. The only uses of this
# `pipe` are commented out in the following cells and the name is rebound in the inference cell —
# consider removing this cell.
pipe = pipeline('summarization', model = model_ckpt )



In [9]:
# pipe_out = pipe(dataset_dailymail['test'][0:8]['article'])
# print(pipe_out)

In [None]:
# print(pipe_out[0]['summary_text'].replace(".<n>", ".\n")) # replace <n> with new line

In [None]:
# Keys of the ROUGE scores of interest (ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum).
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]



In [None]:
# Load the ROUGE metric object; presumably meant to be passed as `metric` to calculate_metric_on_test_ds.
# NOTE(review): datasets.load_metric is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version.
rouge_metric = load_metric('rouge')

In [10]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of articles and highlights into model-ready features.

    Uses the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with an ``'article'`` and a ``'highlights'`` entry,
            each holding the texts of one batch (the function is applied with
            ``Dataset.map(..., batched=True)``).

    Returns:
        dict with ``input_ids`` and ``attention_mask`` of the articles
        (truncated to 800 tokens) and ``labels`` holding the token ids of the
        highlights (truncated to 70 tokens). No padding is requested here.
    """
    input_encodings = tokenizer(example_batch['article'], max_length=800, truncation=True)

    # `text_target=` is the supported replacement for the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager; it tokenizes the
    # texts as targets (labels).
    target_encodings = tokenizer(text_target=example_batch['highlights'],
                                 max_length=70, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }
# Limit the size of the training and validation splits to their first 1,000 examples.
# NOTE: this overwrites the two splits inside dataset_dailymail in place.
dataset_dailymail['train'] = dataset_dailymail['train'].select(range(1000))
dataset_dailymail['validation'] = dataset_dailymail['validation'].select(range(1000))
# Tokenize every split in batches. The test split is not reduced, so all of its
# examples are tokenized as well (see the 11490-example progress bar in the output).
dataset_dailymail_pt = dataset_dailymail.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into training batches.
# presumably it pads inputs and labels per batch, since convert_examples_to_features does not pad — TODO confirm
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [12]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters of the fine-tuning run.
# Effective batch size per device = per_device_train_batch_size * gradient_accumulation_steps = 16.
# NOTE(review): on the 1,000-example training subset one epoch is only ~62 optimizer steps
# (see global_step in the saved TrainOutput), so warmup_steps=500 never completes and
# eval_steps=500 never triggers an evaluation during training — revisit both values.
trainer_args = TrainingArguments(
    output_dir='pegasus-kcroz',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # A step count is an integer (was the float 1e6); a value this large means no
    # intermediate checkpoint is written during a run of this length.
    save_steps=1_000_000,
    gradient_accumulation_steps=16,
)

In [13]:
# Sanity check: sizes of the tokenized train and validation splits, one per line.
print(
    *(len(dataset_dailymail_pt[split_name]) for split_name in ("train", "validation")),
    sep="\n",
)


1000
1000


In [14]:
# Bundle the model, training arguments, tokenizer, data collator and the tokenized
# splits into a Trainer. The (1,000-example) validation subset is the evaluation set.
trainer = Trainer(model=model_pegasus, 
                  args=trainer_args,
                  tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator,
                  train_dataset=dataset_dailymail_pt["train"], 
                  eval_dataset=dataset_dailymail_pt["validation"])

In [15]:
# Fine-tune model_pegasus using the arguments and datasets the Trainer was built with.
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=62, training_loss=1.8759649492079211, metrics={'train_runtime': 416.7008, 'train_samples_per_second': 2.4, 'train_steps_per_second': 0.149, 'total_flos': 1725972469506048.0, 'train_loss': 1.8759649492079211, 'epoch': 0.99})

In [16]:
# Save the fine-tuned model to a local directory; the inference cell further down
# loads it again through this exact directory name.
trainer.save_model("kcrozSummerizationModel")

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the validation subset); the
# saved output reports eval_loss and runtime statistics.
trainer.evaluate()

{'eval_loss': 2.0833399295806885,
 'eval_runtime': 148.8708,
 'eval_samples_per_second': 6.717,
 'eval_steps_per_second': 6.717,
 'epoch': 0.99}

In [None]:
# Interactive Hugging Face Hub login; prompts for an access token (never paste the token into the notebook itself).
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential he

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): the saved output of this cell is `OSError: ignored`, i.e. the upload failed in
# the recorded run — presumably related to the git-credential warning printed by the login
# cell; confirm and fix before relying on this step.
trainer.push_to_hub()

OSError: ignored

Trying out the fine-tuned model on a sample text

In [None]:
# Generation settings; the same values calculate_metric_on_test_ds passes to model.generate.
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}



# Free-form text to summarize.
sample_text = "My name is ginura, I am a 13-year-old and I live with my parents in Ghaziabad, Uttar Pradesh. I’m a single child of my parents. I study in class 8th at Sant Mary Convent School. My father is Mr. Alok Nath and he is a reputed doctor. My mother is Mrs. Jyoti Nath and she is also a doctor. I am an adventurous person who likes to take on challenges. My hobby is dancing. I also go to a dancing class after school. My parents are my biggest strength. They always support me and guide me to take important decisions in life. There have been a number of experiments and achievements in my life. I always participate in school activities. After my studies, I want to become a doctor like my parents. However, I already started preparing for different competitive exams. I’m very punctual and attentive in life. I never get late for school. I always prefer to do my work on time. I'm very good at managing my time and maintaining discipline in my life. As much as I can, I always try to help others and solve their problems to the best of my ability. Whenever they need support and assistance, I give them advice and suggestions when they ask for it."

# Appears to be a placeholder: no real reference summary exists for this text.
reference = "MYSElf"

# Load the fine-tuned model from the directory written by trainer.save_model(...).
# This rebinds `pipe`, replacing the pipeline created in the earlier cell.
pipe = pipeline("summarization", model="kcrozSummerizationModel")

## Print the input text, the placeholder reference and the generated summary.
print("Dialogue:")
print(sample_text)


print("\nReference highlights:")
print(reference)


print("\nModel highlights:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Dialogue:
My name is ginura, I am a 13-year-old and I live with my parents in Ghaziabad, Uttar Pradesh. I’m a single child of my parents. I study in class 8th at Sant Mary Convent School. My father is Mr. Alok Nath and he is a reputed doctor. My mother is Mrs. Jyoti Nath and she is also a doctor. I am an adventurous person who likes to take on challenges. My hobby is dancing. I also go to a dancing class after school. My parents are my biggest strength. They always support me and guide me to take important decisions in life. There have been a number of experiments and achievements in my life. I always participate in school activities. After my studies, I want to become a doctor like my parents. However, I already started preparing for different competitive exams. I’m very punctual and attentive in life. I never get late for school. I always prefer to do my work on time. I'm very good at managing my time and maintaining discipline in my life. As much as I can, I always try to help other

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the saved output reports that
# the token was stored in the git credential helper and under /root/.cache/huggingface/token.
# NOTE(review): this repeats the earlier `!huggingface-cli login` cell — presumably only one of the two is needed.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from huggingface_hub import upload_folder