In [1]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import nltk
from nltk.tokenize import BlanklineTokenizer
import spacy

nltk.download('punkt')
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ignisda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Text Summarizer

In [2]:
path = './data' # 'google/pegasus-cnn_dailymail'

In [3]:
tokenizer = PegasusTokenizer.from_pretrained(path)

In [4]:
textmodel = PegasusForConditionalGeneration.from_pretrained(path)

In [24]:
def getParagraphs(text):
  return BlanklineTokenizer().tokenize(text)


def getTokens(paragraph):
  return tokenizer(paragraph, truncation=True, padding='longest', return_tensors='pt')


def summarize(tokens):
  summary = textmodel.generate(**tokens)
  summary = tokenizer.decode(summary[0])
  return summary


def generateVideoTokens(transcript):
  doc = nlp(transcript)
  sentences = []
  for sent in doc.sents:
    sentences.append(sent.text)
  
  videotokens = []
  for i in range(0, len(sentences), 5):
    videotokens.append(getTokens(sentences[i:i+5]))
  return videotokens

In [6]:
text = """
The giraffe is a large African hoofed mammal belonging to the genus Giraffa. It is the tallest living terrestrial animal and the largest ruminant on Earth. Traditionally, giraffes were thought to be one species, Giraffa camelopardalis, with nine subspecies. Most recently, researchers proposed dividing them into up to eight extant species due to new research into their mitochondrial and nuclear DNA, as well as morphological measurements. Seven other extinct species of Giraffa are known from the fossil record.

The giraffe's chief distinguishing characteristics are its extremely long neck and legs, its horn-like ossicones, and its spotted coat patterns. It is classified under the family Giraffidae, along with its closest extant relative, the okapi. Its scattered range extends from Chad in the north to South Africa in the south, and from Niger in the west to Somalia in the east. Giraffes usually inhabit savannahs and woodlands. Their food source is leaves, fruits, and flowers of woody plants, primarily acacia species, which they browse at heights most other herbivores cannot reach.

Lions, leopards, spotted hyenas, and African wild dogs may prey upon giraffes. Giraffes live in herds of related females and their offspring, or bachelor herds of unrelated adult males, but are gregarious and may gather in large aggregations. Males establish social hierarchies through "necking", combat bouts where the neck is used as a weapon. Dominant males gain mating access to females, which bear sole responsibility for raising the young.
2
The giraffe has intrigued various ancient and modern cultures for its peculiar appearance, and has often been featured in paintings, books, and cartoons. It is classified by the International Union for Conservation of Nature (IUCN) as vulnerable to extinction and has been extirpated from many parts of its former range. Giraffes are still found in numerous national parks and game reserves, but estimates as of 2016 indicate there are approximately 97,500 members of Giraffa in the wild. More than 1,600 were kept in zoos in 2010. 

The name "giraffe" has its earliest known origins in the Arabic word zarāfah (زرافة),[2] perhaps borrowed from the animal's Somali name geri.[3] The Arab name is translated as "fast-walker".[4] In early Modern English the spellings jarraf and ziraph were used, probably directly from the Arabic,[5] and in Middle English jarraf and ziraph, gerfauntz.
The Italian form giraffa arose in the 1590s. 
"""

In [7]:
paragraphs = getParagraphs(text)
paragraphs

['\nThe giraffe is a large African hoofed mammal belonging to the genus Giraffa. It is the tallest living terrestrial animal and the largest ruminant on Earth. Traditionally, giraffes were thought to be one species, Giraffa camelopardalis, with nine subspecies. Most recently, researchers proposed dividing them into up to eight extant species due to new research into their mitochondrial and nuclear DNA, as well as morphological measurements. Seven other extinct species of Giraffa are known from the fossil record.',
 "The giraffe's chief distinguishing characteristics are its extremely long neck and legs, its horn-like ossicones, and its spotted coat patterns. It is classified under the family Giraffidae, along with its closest extant relative, the okapi. Its scattered range extends from Chad in the north to South Africa in the south, and from Niger in the west to Somalia in the east. Giraffes usually inhabit savannahs and woodlands. Their food source is leaves, fruits, and flowers of wo

In [8]:
tokens = []
for para in paragraphs:
  tokens.append(getTokens(para))

tokens

[{'input_ids': tensor([[  139, 36340,   117,   114,   423,  2636, 46001,   316, 47974, 10857,
            112,   109, 23737, 27418, 20307,   304,   107,   168,   117,   109,
          22246,   622, 29715,  2517,   111,   109,  1368,   110,  7882, 62333,
            124,  2774,   107, 28610,   108, 36340,   116,   195,   666,   112,
            129,   156,  2398,   108, 27418, 20307,   304,   642, 25697,  6559,
          21175,   108,   122,  2899, 62910,   107,  1386,   938,   108,  2995,
           2962, 20179,   183,   190,   164,   112,  1965, 52808,  2398,   640,
            112,   177,   473,   190,   153, 36546,   111,  4573,  5488,   108,
            130,   210,   130, 52687,  6614,   107,  9541,   176, 27478,  2398,
            113, 27418, 20307,   304,   127,   606,   135,   109, 11881,  1093,
            107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [9]:
for i in tokens:
  print(summarize(i))



The giraffe is a large African hoofed mammal belonging to the genus Giraffa.<n>It is the tallest living terrestrial animal and the largest ruminant on Earth.
It is classified under the family Giraffidae, along with its closest extant relative, the okapi.<n>Its scattered range extends from Chad in the north to South Africa in the south, and from Niger in the west to Somalia in the east.
Giraffes live in herds of related females and their offspring, or bachelor herds of unrelated adult males.<n>Males establish social hierarchies through "necking", combat bouts where the neck is used as a weapon.<n>Giraffes are still found in numerous national parks and game reserves, but estimates as of 2016 indicate there are approximately 97,500 members of Giraffa in the wild.
The name "giraffe" has its earliest known origins in the Arabic word zar ⁇ fah ( ⁇ ), perhaps borrowed from the animal's Somali name geri.<n>The Arab name is translated as "fast-walker"<n>The Italian form giraffa arose in the 159

## Video Summarizer


In [10]:
from youtube_transcript_api import YouTubeTranscriptApi

In [11]:
def getTranscript(transcript):
  subtitles = ""
  for words in transcript:
    subtitles += words['text']
    subtitles += " "
  return subtitles

In [26]:
input_video = 'https://www.youtube.com/watch?v=5C_HPTJg5ek'
iv = input_video.split('=')[1]

In [28]:
transcript = YouTubeTranscriptApi.get_transcript(iv)
transcript = getTranscript(transcript)

In [29]:
videotokens = generateVideoTokens(transcript)

In [30]:
for tokens in videotokens:
  print(summarize(tokens))

rust is a compiled programming language that delivers high-level simplicity with low-level performance.<n> rust achieves memory safety with a concept known as ownership and borrowing by default every variable in rust is immutable.<n> rust is a popular choice for building systems where performance is absolutely critical like game engines databases or operating systems.
