# Colab output wrapper

In [1]:
# wrap the output in colab cells
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Emit a CSS rule that makes long lines in ``<pre>`` output wrap.

  Meant to run as a ``pre_run_cell`` callback. Any arguments IPython passes
  to the callback are accepted and ignored, so the function works whether
  or not the event supplies an ``info`` object.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Re-running this cell defines a new `set_css` object, so a plain register()
# would stack one more callback per run. Drop earlier registrations first.
_events = get_ipython().events
for _callback in list(_events.callbacks['pre_run_cell']):
  if getattr(_callback, '__name__', '') == 'set_css':
    _events.unregister('pre_run_cell', _callback)
_events.register('pre_run_cell', set_css)

# Install Transformers

In [2]:
# install transformers with sentencepiece
!pip install transformers[sentencepiece]



In [3]:
# Mount Google Drive at /content/drive; the input file is read from under
# this path in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read input file from Google Drive

In [4]:
# open and read the file from google drive.
# The `with` block closes the handle as soon as the text has been read, and
# the explicit encoding avoids depending on the platform default.
with open("/content/drive/MyDrive/NM/isro.txt", "r", encoding="utf-8") as file:
  FileContent = file.read().strip()

In [5]:
# display file content (the bare last expression is echoed as the cell output)
FileContent

"Indian Space Research Organisation (ISRO) is the space agency of India. The organisation is involved in science, engineering and technology to harvest the benefits of outer space for India and the mankind. ISRO is a major constituent of the Department of Space (DOS), Government of India. The department executes the Indian Space Programme primarily through various Centres or units within ISRO.\n\nISRO was previously the Indian National Committee for Space Research (INCOSPAR), set up by the Government of India in 1962, as envisioned by Dr. VikramA Sarabhai. ISRO was formed on August 15, 1969 and superseded INCOSPAR with an expanded role to harness space technology. DOS was set up and ISRO was brought under DOS in 1972.\n\nThe prime objective of ISRO/DOS is the development and application of space technology for various national needs. To fulfil this objective, ISRO has established major space systemsfor communication, television broadcasting and meteorological services; resources monito

In [6]:
# total characters in the text held in FileContent
len(FileContent)

2764

# Load the Model and Tokenizer

In [7]:
# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub model id. Presumably a distilled BART summarisation
# checkpoint, judging by the name — confirm on the model card.
checkpoint = "sshleifer/distilbart-cnn-12-6"

# Both objects come from the same checkpoint id; the first run downloads the
# tokenizer files, config and weights.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

# Some model statistics

In [8]:
# maximum sequence length the tokenizer reports, special tokens included
tokenizer.model_max_length

1024

In [9]:
# maximum number of content tokens for a single sequence, i.e. the limit
# above minus the special tokens
tokenizer.max_len_single_sentence

1022

In [10]:
# number of special tokens the tokenizer adds around a single sequence
tokenizer.num_special_tokens_to_add()

2

# Convert file content to sentences

In [11]:
# extract the sentences from the document
import nltk

# sent_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from 'punkt', newer ones look for 'punkt_tab', so fetch both.
for _resource in ('punkt', 'punkt_tab'):
  nltk.download(_resource)
sentences = nltk.tokenize.sent_tokenize(FileContent)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# token count of the longest sentence in the document
max(len(tokenizer.tokenize(text)) for text in sentences)

157

# Create the chunks

In [13]:
def _split_into_chunks(sentences, count_tokens, max_tokens):
  """Greedily pack consecutive sentences into chunks of at most `max_tokens`.

  Args:
    sentences: iterable of sentence strings, in document order.
    count_tokens: callable returning the token count of one sentence.
    max_tokens: token budget per chunk (special tokens not included).

  Returns:
    List of chunk strings; the sentences of a chunk are joined by one space.
    Every sentence lands in exactly one chunk and no chunk is empty. A single
    sentence longer than `max_tokens` gets a chunk of its own, which is the
    only case where a chunk can exceed the budget.
  """
  chunks = []
  current = []      # sentences collected for the chunk being built
  current_len = 0   # summed token count of `current`

  for sentence in sentences:
    n_tokens = count_tokens(sentence)  # tokenize each sentence only once

    # Close the running chunk when this sentence would push it over budget.
    # The `current` check prevents emitting an empty chunk when the very
    # first sentence of a chunk is itself over the limit.
    if current and current_len + n_tokens > max_tokens:
      chunks.append(" ".join(current).strip())
      current, current_len = [], 0

    current.append(sentence)
    current_len += n_tokens

  # Flush whatever is left, so the final sentence is kept even when it was
  # the one that overflowed the previous chunk.
  if current:
    chunks.append(" ".join(current).strip())
  return chunks


# Budget is the tokenizer's single-sequence limit without special tokens.
# NOTE(review): per-sentence counts are summed, so the count of the joined
# chunk may differ slightly — the check cells further down compare them.
chunks = _split_into_chunks(
  sentences,
  count_tokens=lambda text: len(tokenizer.tokenize(text)),
  max_tokens=tokenizer.max_len_single_sentence,
)
len(chunks)

1

# Some checks

In [14]:
# tokens per chunk as returned by tokenize(), i.e. without special tokens
[len(tokenizer.tokenize(piece)) for piece in chunks]

[536]

In [15]:
# length of the encoded input_ids per chunk (special tokens included)
[len(tokenizer(piece).input_ids) for piece in chunks]

[538]

## With special tokens added

In [16]:
# total input_ids across all chunks
sum(len(tokenizer(piece).input_ids) for piece in chunks)

538

In [17]:
# input_ids length when the whole file is encoded in one call, for comparison
# with the per-chunk total above
len(tokenizer(FileContent).input_ids)

548

## Without special tokens added

In [18]:
# total tokenize() tokens across all chunks
sum(len(tokenizer.tokenize(piece)) for piece in chunks)

536

In [19]:
# tokenize() token count for the whole file in one call, for comparison with
# the per-chunk total above
len(tokenizer.tokenize(FileContent))

546

# Get the inputs

In [20]:
# inputs to the model: one PyTorch-tensor encoding per chunk.
# truncation=True caps an encoding at the tokenizer's maximum length, so a
# chunk holding a single over-long sentence cannot exceed what the model
# accepts; chunks already within the limit are unaffected.
inputs = [tokenizer(chunk, return_tensors="pt", truncation=True) for chunk in chunks]

# Output

In [21]:
# summarise every chunk and print the decoded text
for model_input in inputs:  # not named `input`: that would shadow the builtin
  output = model.generate(**model_input)
  # `output` holds one row of token ids per generated sequence; decode each
  # row instead of star-unpacking, which only works for exactly one row
  for summary in tokenizer.batch_decode(output, skip_special_tokens=True):
    print(summary)

 ISRO is a major constituent of the Department of Space (DOS), Government of India. ISRO was previously the Indian National Committee for Space Research (INCOSPAR) The prime objective of ISRO/DOS is the development and application of space technology for various national needs.
