# Colab output wrapper

In [1]:
# wrap the output in colab cells
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that sets `white-space: pre-wrap` on <pre> tags.

  Takes no arguments and returns None; its only effect is the `display` call.
  """
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)
# get_ipython().events.register('pre_run_cell', set_css)

# Install Transformers

In [26]:
# install transformers with sentencepiece
# !pip install transformers[sentencepiece]



# Read the input file from disk

In [4]:
# Location of the transcript, relative to the notebook's working directory.
filepath = '../data/db_02-03-2023/output.txt'

# Load the whole transcript into one string; the context manager closes the
# handle as soon as the read is done.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm the file's encoding if this notebook is run on another machine.
with open(filepath) as transcript_file:
    FileContent = transcript_file.read()

In [5]:
# Preview only the start of the transcript: the file is over 100k characters,
# so echoing all of it would bloat the notebook and bury the cells below.
FileContent[:1000]

"\n\nCS12_4510:\n Now recording!\n\nJadePixie_7138:\n Oh no, he didn't say the thing!\n\nCrux_4429:\n I took away his permissions to say things.\n\nJadePixie_7138:\n But I like to hear him say, now recording.  It brought me comfort and encouragement.\n\nCS12_4510:\n You know Josh, you could have just right clicked him and done a server mute, but this works too! Anyway, welcome.\n\nJadePixie_7138:\n What if Craig can't hear me?\n\nCS12_4510:\n How would he be able to tell me? \n\nJadePixie_7138:\n Craig, if you can't hear me, you gotta say something. \n\nCS12_4510:\n Oh, well it sounds like he can hear you.\n\nJadePixie_7138:\n Uh, okay. \n\nIlMaximuslI_9218:\n I believe you. Hello friends.\n\nJadePixie_7138:\n I\n\nCrux_4429:\n Great.  Hello.\n\nIlMaximuslI_9218:\n What are we doing for down below?\n\nCS12_4510:\n Alllll below.\n\nJadePixie_7138:\n have recapping, I guess, because we've got multiple quests that we can go on. We have just been informed that war is coming to Kirk's base 

In [6]:
# total number of characters (not tokens) in the transcript string
len(FileContent) 

106689

# Load the Model and Tokenizer

In [7]:
# import and initialize the tokenizer and model from the checkpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single identifier passed to both `from_pretrained` calls below, so the
# tokenizer and the model are loaded from the same checkpoint.
checkpoint = "sshleifer/distilbart-cnn-12-6"

# The recorded output shows these calls downloading files (the largest is
# about 1.22 GB), so expect the first run to take a while.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

# Some model statistics

In [8]:
# maximum sequence length the tokenizer reports, special tokens included
# (1024 in the recorded output)
tokenizer.model_max_length 

1024

In [9]:
# maximum number of tokens left for the text itself once the special tokens
# are accounted for (1022 in the recorded output); used below as the chunk budget
tokenizer.max_len_single_sentence 

1022

In [10]:
# number of special tokens the tokenizer adds when encoding a sequence
# (2 in the recorded output, i.e. the gap between the two limits above)
tokenizer.num_special_tokens_to_add() 

2

# Convert file content to sentences

In [11]:
# extract the sentences from the document
import nltk
# Fetch the Punkt sentence-tokenizer data; the recorded output shows this
# reports "already up-to-date" when the package is present.
# NOTE(review): newer NLTK releases reportedly load `punkt_tab` rather than
# `punkt` — confirm against the installed NLTK version.
nltk.download('punkt')
# Split the raw transcript into a list of sentence strings.
sentences = nltk.tokenize.sent_tokenize(FileContent)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Crux\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Token count of the longest sentence; streamed through a generator so no
# intermediate list of counts is built.
max(len(tokenizer.tokenize(text)) for text in sentences)

148

# Create the chunks

In [13]:
def chunk_sentences(sentences, count_tokens, max_tokens):
  """Greedily pack consecutive sentences into chunks that fit a token budget.

  Args:
    sentences: iterable of sentence strings, in document order.
    count_tokens: callable returning the number of tokens in a string.
    max_tokens: upper bound on the summed per-sentence token counts of a chunk.

  Returns:
    A list of chunk strings. Each chunk is its sentences joined by single
    spaces, then stripped. Empty input gives an empty list. A sentence that
    is over budget on its own still becomes a chunk by itself (over-long).
  """
  packed = []
  current = []      # sentences collected for the chunk being built
  current_len = 0   # summed token counts of `current`
  for sentence in sentences:
    n_tokens = count_tokens(sentence)  # tokenize each sentence exactly once
    # Close the running chunk when this sentence would push it over budget.
    # The `current` guard prevents emitting an empty chunk when the very
    # first sentence of a chunk is itself over budget.
    if current and current_len + n_tokens > max_tokens:
      packed.append(" ".join(current).strip())
      current, current_len = [], 0
    current.append(sentence)
    current_len += n_tokens
  # Always flush the remainder. Previously the final chunk was lost whenever
  # the last sentence was the one that overflowed the budget.
  if current:
    packed.append(" ".join(current).strip())
  return packed


# Budget is the per-sequence limit excluding special tokens, so each chunk
# still has room for the special tokens added at encoding time.
chunks = chunk_sentences(
  sentences,
  count_tokens=lambda text: len(tokenizer.tokenize(text)),
  max_tokens=tokenizer.max_len_single_sentence,
)
len(chunks)

31

# Some checks

In [14]:
# Tokens per chunk as produced by `tokenize`, i.e. without special tokens.
[len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks]

[990,
 996,
 1012,
 995,
 995,
 1010,
 995,
 1003,
 997,
 1018,
 1005,
 992,
 1013,
 998,
 1000,
 992,
 1013,
 983,
 1001,
 1005,
 991,
 1003,
 1006,
 1003,
 980,
 1012,
 1013,
 991,
 1010,
 1000,
 976]

In [57]:
# Length of each chunk's `input_ids` after a full tokenizer call.
# NOTE(review): the recorded output below lists 10 values although `chunks`
# had 31 entries above — it looks stale; re-run this cell to refresh it.
[len(tokenizer(chunk_text).input_ids) for chunk_text in chunks]

[1016, 986, 962, 1005, 1022, 990, 998, 1000, 1019, 578]

## With special tokens added

In [50]:
# Total `input_ids` length across all chunks, for comparison with the
# whole-file count in the next cell.
sum(len(tokenizer(chunk_text).input_ids) for chunk_text in chunks)

9576

In [51]:
# `input_ids` length for the whole transcript encoded in one call. The
# recorded run emits a length warning here because the result exceeds 1024;
# the encoding is only measured, never passed to the model.
len(tokenizer(FileContent).input_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (9561 > 1024). Running this sequence through the model will result in indexing errors


9561

## Without special tokens added

In [54]:
# Total `tokenize` count across all chunks, for comparison with the
# whole-file count in the next cell.
sum(len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks)

9556

In [55]:
# `tokenize` count for the whole transcript in one call; compare with the
# per-chunk total above to see how much chunking changed the token count.
len(tokenizer.tokenize(FileContent))

9559

# Get the inputs

In [15]:
# inputs to the model: one PyTorch-tensor encoding per chunk.
# Chunks were budgeted by summing per-sentence token counts, so a joined
# chunk is not guaranteed to tokenize to the same length. `truncation=True`
# caps each encoding at the tokenizer's maximum length, so an over-long chunk
# is trimmed instead of causing the indexing errors the tokenizer warns about.
inputs = [tokenizer(chunk, return_tensors="pt", truncation=True) for chunk in chunks]

# Output

In [16]:
# Summarize chunk by chunk. Results are kept in `summaries` as well as
# printed, so work already done survives an interrupted run.
summaries = []
for encoded_chunk in inputs:  # renamed: `input` shadowed the Python builtin
  output = model.generate(**encoded_chunk)
  # Each encoding holds a single chunk, so the generated batch has one row.
  summary = tokenizer.decode(output[0], skip_special_tokens=True)
  summaries.append(summary)
  print(summary)



 JadePixie_7138: "We've been gone from existence for the past slightly over a year. Ezreal's mind is just getting saw into infinity. Those memories cannot be deleted as of right now. So Ezreal has a virus and we have to run antivirus software at some point. We're gonna have to clear some things. I don't know how this happened. How did you delete Windows?"
 "Smeeple" is back at the center of a new episode of the series. Chaya is doing flight training on a giant eagle. She introduces you to his great eagle friend who he rides around on and  you can honestly tell Vakumi that he is thinking about saying friend and then decides to say this is my, uh, this is, my, my mount. It's really cool. He's a very smart bird. It is the smartest bird, but giant eagles are damn smart.
 JadePixie: I just glare at this giant eagle that I'm having a silent conversation with. You hear Chaya say things in secret because of your advanced telepathy, where it says, uh, like, yeah, be nice. She's very angry, but 

KeyboardInterrupt: 