<a href="https://colab.research.google.com/github/Nanditha-V/T5_NLP/blob/master/T5_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers



In [4]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import time

model = T5ForConditionalGeneration.from_pretrained('t5-small')  #tiny-base
tokenizer = T5Tokenizer.from_pretrained('t5-small')  #tiny-base

start_time = time.time()

def summarization_infer(text, max=50):
  preprocess_text = text.replace("\n", " ").strip()
  t5_prepared_Text = "summarize: "+preprocess_text
  tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt")

  summary_ids = model.generate(tokenized_text, min_length=30, max_length=max, top_k=100, top_p=0.8) #top-k top-p sampling strategy
  output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  end_time = time.time()
  print (f'Time taken : {end_time-start_time}')
  return output

def translation_infer(text, max=50):
  preprocess_text = text.replace("\n", " ").strip()
  t5_prepared_Text = "translate English to Dutch: "+preprocess_text
  tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt")

  translation_ids = model.generate(tokenized_text, min_length=10, max_length=50, early_stopping=True, num_beams=2)
  output = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
  end_time = time.time()
  print (f'Time taken : {end_time-start_time}')
  return output

def grammatical_acceptibility_infer(text):
  preprocess_text = text.replace("\n", " ").strip()
  t5_prepared_Text = "cola sentence: "+preprocess_text
  tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt")

  grammar_ids = model.generate(tokenized_text, min_length=1, max_length=3)
  output = tokenizer.decode(grammar_ids[0], skip_special_tokens=True)
  end_time = time.time()
  print (f'Time taken : {end_time-start_time}')
  return output

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [3]:
text ="""Nuclear weapons probably killed more than 200,000 people in Hiroshima and Nagasaki in 1945, claimed many more lives from cancer in the years that followed, generated decades of profound anxiety during the Cold War and brought the world to the brink of annihilation during the Cuban Missile crisis in 1962. They have also changed the calculations of national leaders on how to respond to international aggression, as currently playing out with Russia’s invasion of Ukraine."""

In [4]:
from transformers import pipeline
summarization_pipeline = pipeline(task='summarization', model="t5-small")
output = summarization_pipeline(text, min_length=30, max_length=50, top_k=100, top_p=0.8)
print (output)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

[{'summary_text': 'nuclear weapons probably killed more than 200,000 people in Hiroshima and Nagasaki in 1945 . they claimed many more lives from cancer in the years that followed .'}]


In [5]:
summarization_infer(text)

Time taken : 32.09776782989502


'nuclear weapons probably killed more than 200,000 people in 1945. they killed more than 200,000 people in 1945. they have also changed the calculations of national leaders on how to respond to international aggression.'

In [6]:
translation_infer(text)

Time taken : 100.4273030757904


'Die niederländischen Atomwaffen haben 1945 in Hiroshima und Nagasaki wahrscheinlich mehr als 200 000 Menschen getötet, in den darauf folgenden Jahren viel mehr Menschenleben an Krebs gefordert, Jahrzehnte'

In [7]:
grammatical_acceptibility_infer(text)

Time taken : 129.91883063316345


'acceptable'