# Colab output wrapper

In [1]:
# wrap the output in colab cells
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that sets `white-space: pre-wrap` on <pre> tags.

  Meant to be run before a cell so that long lines in its output wrap
  instead of running off the side of the output area.
  """
  wrap_style = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_style)
# get_ipython().events.register('pre_run_cell', set_css)

# Install Transformers

In [26]:
# install transformers with sentencepiece
# !pip install transformers[sentencepiece]



# Read the input file (local path, relative to the notebook)

In [1]:
# transcript to summarize, relative to the notebook's working directory
filepath = '../data/db_02-03-2023/corpus.txt'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same file can decode differently (or
# produce mojibake) depending on the machine the notebook runs on.
with open(filepath, 'r', encoding='utf-8') as f:
    FileContent = f.read()

In [29]:
# second name for the raw file text; this is the name handed to summarize() further down
corpus = FileContent

In [2]:
# preview the beginning of the file instead of echoing the whole text into the cell output
FileContent[:1000]

"\n Now recording!\n Oh no, he didn't say the thing!\n I took away his permissions to say things.\n But I like to hear him say, now recording.  It brought me comfort and encouragement.\n You know Josh, you could have just right clicked him and done a server mute, but this works too! Anyway, welcome.\n What if Craig can't hear me?\n How would he be able to tell me? \n Craig, if you can't hear me, you gotta say something. \n Oh, well it sounds like he can hear you.\n Uh, okay. \n I believe you. Hello friends.\n I\n Great.  Hello.\n What are we doing for down below?\n Alllll below.\n have recapping, I guess, because we've got multiple quests that we can go on. We have just been informed that war is coming to Kirk's base and that a bunch of Sha'a Ka'al people are disappearing down on Rhaegis.   Myla found a city in a bottle and somewhere inside the city in the bottle there is a powerful item of sorts that can protect us from undead.  And also we've been gone from, we've been, you know, gon

In [3]:
# total number of characters in the file (characters, not tokens)
len(FileContent) 

90772

# Load the Model and Tokenizer

In [31]:
# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# identifier of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same id so that they match
checkpoint = "google/pegasus-xsum"

# `tokenizer` and `model` are used as notebook-level globals by the cells below
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Some model statistics

In [32]:
# maximum input length in tokens reported by the tokenizer, special tokens included
tokenizer.model_max_length 

512

In [33]:
# maximum input length in tokens excluding the special tokens;
# this is the budget the chunking code below compares against
tokenizer.max_len_single_sentence 

511

In [38]:
# number of special tokens the tokenizer adds when encoding a single sequence
tokenizer.num_special_tokens_to_add() 

1

# Convert file content to sentences

In [39]:
# split the document into sentences so that chunks can later be cut on sentence boundaries
import nltk
nltk.download('punkt')  # fetch the 'punkt' data package (presumably the data sent_tokenize relies on)
sentences = nltk.tokenize.sent_tokenize(FileContent)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Crux\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# token count of the longest single sentence (special tokens not counted)
max(len(tokenizer.tokenize(sentence)) for sentence in sentences)

105

# Create the chunks

In [41]:
# Greedily pack whole sentences into chunks whose token count stays within
# the tokenizer's budget (special tokens excluded).
max_tokens = tokenizer.max_len_single_sentence
length = 0   # tokens accumulated in the current chunk
chunk = ""   # text of the current chunk
chunks = []  # finished chunks
for sentence in sentences:
  sentence_length = len(tokenizer.tokenize(sentence)) # tokenize each sentence once

  if length + sentence_length <= max_tokens: # the sentence still fits
    chunk += sentence + " " # add the sentence to the chunk
    length += sentence_length # update the length counter
  else:
    # The sentence would overflow: close the current chunk and start a new one
    # with it. An empty chunk (first sentence already over budget) is not saved.
    if chunk:
      chunks.append(chunk.strip())
    chunk = sentence + " "
    length = sentence_length

# Save whatever is left after the loop. Doing this here, rather than on the
# last iteration of the "fits" branch, also keeps a final sentence that
# overflowed into a fresh chunk; otherwise that chunk would be dropped.
if chunk:
  chunks.append(chunk.strip())

# NOTE: a single sentence longer than max_tokens still becomes its own,
# over-budget chunk; it is not split further here.
len(chunks)

45

# Some checks

In [42]:
# tokens per chunk, special tokens not counted; each value should stay within max_len_single_sentence
[len(tokenizer.tokenize(c)) for c in chunks]

[499,
 481,
 491,
 506,
 474,
 503,
 511,
 510,
 502,
 505,
 480,
 508,
 505,
 446,
 506,
 506,
 508,
 505,
 509,
 502,
 496,
 509,
 502,
 482,
 510,
 467,
 508,
 510,
 503,
 467,
 507,
 471,
 496,
 472,
 487,
 503,
 486,
 504,
 475,
 453,
 511,
 499,
 509,
 497,
 394]

In [43]:
# tokens per chunk as encoded for the model (input_ids, special tokens included)
[len(tokenizer(c).input_ids) for c in chunks]

[500,
 482,
 492,
 507,
 475,
 504,
 512,
 511,
 503,
 506,
 481,
 509,
 506,
 447,
 507,
 507,
 509,
 506,
 510,
 503,
 497,
 510,
 503,
 483,
 511,
 468,
 509,
 511,
 504,
 468,
 508,
 472,
 497,
 473,
 488,
 504,
 487,
 505,
 476,
 454,
 512,
 500,
 510,
 498,
 395]

## With special tokens added

In [44]:
# total encoded length over all chunks (special tokens counted once per chunk)
sum(len(tokenizer(c).input_ids) for c in chunks)

22220

In [45]:
# encoded length of the whole file in one piece, for comparison with the per-chunk total;
# the tokenizer warns here because the text is far longer than the model's maximum
len(tokenizer(FileContent).input_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (22176 > 512). Running this sequence through the model will result in indexing errors


22176

## Without special tokens added

In [46]:
# total token count over all chunks, special tokens not counted
sum(len(tokenizer.tokenize(c)) for c in chunks)

22175

In [47]:
# token count of the whole file, special tokens not counted;
# it should equal the per-chunk total if chunking lost no text
len(tokenizer.tokenize(FileContent))

22175

# Get the inputs

In [48]:
# encode every chunk separately as PyTorch tensors, one model input per chunk
inputs = [tokenizer(chunk_text, return_tensors="pt") for chunk_text in chunks]

# Output

In [None]:
# for input in inputs:
#   output = model.generate(**input, min_length=20, max_length=60)
#   print(tokenizer.decode(*output, skip_special_tokens=True))

In [54]:
def summarize(input, min_length=2, max_length=200):
    """Summarize a long text chunk by chunk and return the list of summaries.

    The text is split into sentences with nltk, and the sentences are packed
    greedily into chunks that stay within `tokenizer.max_len_single_sentence`
    tokens. Each chunk is then summarized on its own with the notebook-level
    `model`. Relies on the globals `nltk`, `tokenizer` and `model`.

    Args:
        input: the full text to summarize (str).
        min_length: passed to `model.generate` as `min_length`.
        max_length: passed to `model.generate` as `max_length`.

    Returns:
        A list with one summary string per chunk, in document order. Each
        summary is also printed as soon as it is generated.
    """
    max_tokens = tokenizer.max_len_single_sentence

    # --- pack whole sentences into chunks that fit the token budget ---
    chunks = []
    chunk = ""   # text of the chunk being built
    length = 0   # tokens accumulated in that chunk
    for sentence in nltk.tokenize.sent_tokenize(input):
        sentence_length = len(tokenizer.tokenize(sentence))  # tokenize once per sentence

        if length + sentence_length <= max_tokens:  # the sentence still fits
            chunk += sentence + " "
            length += sentence_length
        else:
            # Overflow: close the current chunk and start a new one with this
            # sentence. Skip saving when the chunk is still empty (the first
            # sentence alone is over budget).
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + " "
            length = sentence_length

    # Save the trailing chunk after the loop so that a final sentence which
    # overflowed into a fresh chunk is not lost.
    if chunk:
        chunks.append(chunk.strip())

    # --- summarize each chunk independently ---
    outputs = []
    for chunk_text in chunks:
        # truncation=True caps the encoded length at the tokenizer's maximum,
        # so a single over-long sentence cannot produce an input longer than
        # the model accepts.
        encoded = tokenizer(chunk_text, return_tensors="pt", truncation=True)
        output = model.generate(**encoded, min_length=min_length, max_length=max_length)
        # generate() returns one sequence here (single input per call)
        summed_text = tokenizer.decode(output[0], skip_special_tokens=True)
        print(summed_text)
        outputs.append(summed_text)

    return outputs

In [55]:
# join the per-chunk summaries with a space so consecutive summaries do not run into each other
t2 = ' '.join(summarize(corpus))

In this week's episode of Star Trek: The Next Generation, Sam, Myla, Ezreal, and Craig all recap the events of last week's episode and answer your questions. learned that let's see Yart is the new supreme commander and like this is the gift wrap for Matt Matt I believe and we've got some of people we're trying to hope they be free again by being able to go out of the bathroom and yeah But if they go, fuck ton of psychic damage.
In this week's episode of The Big Bang Theory, we find out that Miley's husband is a Githyanki, learn that Ailiyelidusti is Maile, and find out that Yankee is too big for a giant eagle.
Vakumi and Chaya are back.


KeyboardInterrupt: 