In [1]:
from typing import List
try:
    import spacy
except:
    %pip install spacy
    import spacy

try:
    from sentence_transformers import CrossEncoder
except:
    %pip install sentence-transformers
    from sentence_transformers import CrossEncoder
    
try:
    from transformers import pipeline
except:
    %pip install transformers
    from transformers import pipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MarianMTModel, MarianTokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

try:
    import torch
except:
    %pip install torch
    import torch
    
try: 
    import PyPDF2
except:
    %pip install pypdf
    import PyPDF2

try:
    from llama_index import SimpleDirectoryReader
except:     
    %pip install llama-index
    from llama_index import SimpleDirectoryReader
    
#     !pip install GPT4All
#     !pip install einops

device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


# Text Segmentation Tools

In [2]:
# spaCy pipelines, loaded once at module level and shared by the cells below.
# Spanish pipeline: get_text() uses it to tokenize the loaded documents.
nlp_es = spacy.load('es_core_news_lg')
# English pipeline: used by the text-segmentation helpers in this cell.
nlp = spacy.load('en_core_web_sm')

def filter_tokens(doc):
  """Return ``doc`` untouched.

  Out-of-vocabulary filtering is currently switched off, so this is a
  pass-through. The disabled implementation is kept below for reference.
  """
  # Disabled OOV filter:
  #   parsed = nlp(doc)
  #   return " ".join(tok.text for tok in parsed if not tok.is_oov)
  return doc
  
def remove_stop_words(text: str) -> str:
  """Return ``text`` untouched.

  Stop-word removal is currently switched off, so this is a pass-through.
  The disabled implementation is kept below for reference.
  """
  # Disabled stop-word removal:
  #   kept = [tok.text for tok in nlp(text) if not tok.is_stop]
  #   return filter_tokens(" ".join(kept))
  return text

def split_paragraphs(text: str, nlp_pipeline=None) -> List[str]:
  """Split ``text`` into paragraphs at tokens that contain a newline.

  Args:
    text: Raw text to split.
    nlp_pipeline: Optional callable that turns a string into an iterable of
      tokens exposing ``.text``. Defaults to the module-level ``nlp``
      pipeline.

  Returns:
    One string per non-empty paragraph, built by joining the paragraph's
    token texts with single spaces. Every element is a ``str``, including
    the last paragraph.
  """
  if nlp_pipeline is None:
    nlp_pipeline = nlp

  paragraphs = []
  current_paragraph = []
  for token in nlp_pipeline(text):
    if "\n" in token.text:
      # A newline token closes the paragraph collected so far.
      if current_paragraph:
        paragraphs.append(" ".join(current_paragraph))
      current_paragraph = []
    else:
      current_paragraph.append(token.text)

  # Flush the trailing paragraph as a joined string, like the others
  # (it used to be appended as a raw token list).
  if current_paragraph:
    paragraphs.append(" ".join(current_paragraph))
  return paragraphs


def split_sentences(text: "str | List[str]", nlp_pipeline=None) -> List[str]:
  """Split text into sentences using the pipeline's sentence boundaries.

  Args:
    text: Either a single string or a list of strings. A list is joined
      with single spaces first. A plain string is used as is, because
      joining a string would insert a space between every character.
    nlp_pipeline: Optional callable that turns a string into an object
      exposing ``.sents``. Defaults to the module-level ``nlp`` pipeline.

  Returns:
    The text of each detected sentence, in order.
  """
  if nlp_pipeline is None:
    nlp_pipeline = nlp

  joined = text if isinstance(text, str) else " ".join(text)
  return [sent.text for sent in nlp_pipeline(joined).sents]

def group_sentences_semantically(sentences: List[str], threshold: float, model=None) -> List[str]:
  """Merge consecutive sentences that are similar to a segment's first sentence.

  Each sentence is scored against the first sentence of the current segment.
  A score of at least ``threshold`` adds it to that segment; a lower score
  closes the segment and starts a new one at that sentence.

  Args:
    sentences: Sentences in document order.
    threshold: Minimum similarity score for a sentence to stay in the
      current segment.
    model: Optional object with a ``predict(pairs)`` method returning one
      score per pair. Pass an already loaded cross-encoder to avoid
      reloading it on every call. Defaults to a fresh
      ``CrossEncoder('cross-encoder/stsb-roberta-large')``.

  Returns:
    The segments, each one the space-joined text of its sentences. An empty
    input gives an empty list.
  """
  if not sentences:
    # Nothing to group; also avoids loading the model for no reason.
    return []

  if model is None:
    model = CrossEncoder('cross-encoder/stsb-roberta-large')

  segments = []
  anchor = sentences[0]
  segment = [anchor]

  for candidate in sentences[1:]:
    # predict() returns one score per pair; a single pair is sent each time.
    score = float(model.predict([[anchor, candidate]])[0])
    if score >= threshold:
      segment.append(candidate)
    else:
      segments.append(" ".join(segment))
      anchor = candidate
      segment = [candidate]

  segments.append(" ".join(segment))
  return segments

def split_text(text: str, min_paragraph_chars: int = 100, similarity_threshold: float = 0.15) -> List[str]:
  """Break ``text`` into segments suitable for summarization.

  The text is split into paragraphs. A paragraph longer than
  ``min_paragraph_chars`` characters is split into sentences and regrouped
  by semantic similarity; a shorter paragraph is kept as one segment.

  Args:
    text: Text to segment.
    min_paragraph_chars: Paragraphs longer than this many characters are
      split further.
    similarity_threshold: Threshold passed to
      ``group_sentences_semantically``.

  Returns:
    A flat list of segment strings. Empty paragraphs are skipped.
  """
  text_no_stop_words = remove_stop_words(text)
  segments = []

  for paragraph in split_paragraphs(text_no_stop_words):
    # Normalise defensively: a paragraph may arrive as a list of tokens.
    if isinstance(paragraph, list):
      paragraph = " ".join(paragraph)
    if not paragraph:
      continue

    if len(paragraph) > min_paragraph_chars:
      # Wrapped in a list so split_sentences joins whole paragraphs rather
      # than the individual characters of a string.
      paragraph_sentences = split_sentences([paragraph])
      segments.extend(group_sentences_semantically(paragraph_sentences, similarity_threshold))
    else:
      segments.append(paragraph)

  return segments

# Text Translation 

In [3]:
def _chunk_on_whitespace(text, chunk_size):
    """Cut ``text`` into pieces of at most ``chunk_size`` characters, preferring to cut after a space."""
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        if end < len(text):
            # Back up to the last space in the window so no word is cut in
            # half. If the window holds no usable space, cut at the limit.
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space + 1
        chunks.append(text[start:end])
        start = end
    return chunks


def translate_es_en(spanish_text, chunk_size=300):
    """Translate Spanish text to English with the Helsinki-NLP Marian model.

    The text is translated in chunks of at most ``chunk_size`` characters
    and the translated chunks are joined with single spaces.

    Args:
        spanish_text: Spanish text to translate.
        chunk_size: Maximum number of characters per translated chunk.

    Returns:
        The English translation as one string ("" for empty input).
    """
    # Load the pre-trained model and tokenizer. The model runs on the
    # notebook-wide ``device``, the same way translate_en_es does.
    model_name = "Helsinki-NLP/opus-mt-es-en"  # Spanish to English translation model
    model = MarianMTModel.from_pretrained(model_name).to(device)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    translated_text = []
    for chunk in _chunk_on_whitespace(spanish_text, chunk_size):
        if not chunk.strip():
            # Nothing to translate in a whitespace-only chunk.
            continue
        inputs = tokenizer.encode(chunk, return_tensors="pt").to(device)
        translated_ids = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
        translated_text.append(tokenizer.decode(translated_ids[0], skip_special_tokens=True))

    return " ".join(translated_text)

def translate_en_es(english_text):
    """Translate English text to Spanish with the Helsinki-NLP Marian model.

    The whole input is encoded and translated in a single ``generate`` call
    on the notebook-wide ``device``.

    Args:
        english_text: English text to translate.

    Returns:
        The decoded Spanish translation of the first generated sequence.
    """
    checkpoint = "Helsinki-NLP/opus-mt-en-es"  # English to Spanish translation model
    es_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    es_model = MarianMTModel.from_pretrained(checkpoint).to(device)

    # Encode the source text, then run beam search.
    input_ids = es_tokenizer.encode(english_text, return_tensors="pt").to(device)
    generated = es_model.generate(
        input_ids,
        max_length=100,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    return es_tokenizer.decode(generated[0], skip_special_tokens=True)


# Topic Model


def _generate_topic_label(text, mname):
    """Generate, print and return a topic label for ``text`` using checkpoint ``mname``."""
    tokenizer = AutoTokenizer.from_pretrained(mname)
    model = AutoModelForSeq2SeqLM.from_pretrained(mname).to(device)

    # Example input: "site web google search website online internet social content user"
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=1024).to(device)
    outputs = model.generate(
        input_ids=enc.input_ids,
        attention_mask=enc.attention_mask,
        max_length=25,
        min_length=1,
        do_sample=False,
        num_beams=25,
        length_penalty=1.0,
        repetition_penalty=1.5
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Topic:", decoded)
    return decoded


def detect_topics_all(input):
    """Label the topic of ``input`` with the ``cristian-popa/bart-tl-all`` model.

    Args:
        input: Text (for example a space-separated keyword list) to label.

    Returns:
        The decoded topic label; it is also printed.
    """
    return _generate_topic_label(input, "cristian-popa/bart-tl-all")


def detect_topics_ng(input):
    """Label the topic of ``input`` with the ``cristian-popa/bart-tl-ng`` model.

    Args:
        input: Text (for example a space-separated keyword list) to label.

    Returns:
        The decoded topic label; it is also printed.
    """
    return _generate_topic_label(input, "cristian-popa/bart-tl-ng")

# Load Documents


In [4]:
def get_text():
    """Yield ``(english_text, document_hash)`` for each file under ``documents/``.

    Documents are read recursively with ``SimpleDirectoryReader``. Each
    document's text is tokenized with the Spanish pipeline, re-joined with
    single spaces and translated to English before being yielded.
    """
    loaded_documents = SimpleDirectoryReader(input_dir='documents', recursive=True).load_data()

    for document in loaded_documents:
        token_texts = [token.text for token in nlp_es(document.text)]
        spanish_text = " ".join(token_texts)
        yield translate_es_en(spanish_text), document.hash

    del loaded_documents

# Summarization

In [5]:
import os
import json

# Segmented documents are cached as JSON so a rerun skips finished work.
os.makedirs("processed_documents", exist_ok=True)

text = []
# enumerate() ties the index to the document's position. A counter that only
# advances on writes builds different filenames on a rerun, so the cache
# check misses and documents are processed again.
for doc_index, (translated_text, doc_hash) in enumerate(get_text()):
    output_path = f"processed_documents/{doc_index}_{doc_hash}.json"
    if os.path.exists(output_path):
        continue

    # Segment first and open the file afterwards, so a failure in
    # split_text does not leave an empty file that looks like a cache hit.
    segments = split_text(translated_text)
    with open(output_path, "w") as f:
        json.dump(segments, f)

# ``nlp`` is deliberately not deleted here: split_text still needs it, and
# deleting it would make this cell fail when it is run a second time.

# FALCON LLM summary


In [None]:
# from gpt4all import GPT4All

#     # model_name='ggml-model-gpt4all-falcon-q4_0.bin',
# model = GPT4All(
#     model_name='llama-2-7b-chat.ggmlv3.q4_0.bin',
#     model_path='./models'
#     )
# for t in text:
#     prompt = f"summarize in one sentence: {t}"
#     print(model.generate(prompt))

In [None]:
# model_name = "csebuetnlp/mT5_multilingual_XLSum"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
# summary_text = []

# src_files = []
# for file in os.listdir("processed_documents"):
#     if file.endswith(".json"):
#         with open(os.path.join("processed_documents", file), "r") as f:
#             src_text = json.load(f)

#             for sentence in src_text:
                
#                 inputs = tokenizer.encode("summarize: " + sentence, return_tensors="pt", max_length=512).to(device)
#                 outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
#                 summary = tokenizer.decode(outputs[0])
#                 summary_text.append(summary)

# print("\n\nSummarized text :\n",summary_text)
# del model
# del tokenizer

## Pegasus xsum

import json
import os

# Summarize the segments cached under processed_documents/ by the
# processing cell above. get_text() yields (text, hash) tuples, so its
# generator cannot be passed to split_text directly. Reading the cache also
# avoids translating every document a second time.
src_text = []
for file_name in sorted(os.listdir("processed_documents")):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join("processed_documents", file_name), "r") as f:
        for segment in json.load(f):
            # A cached segment may be a string or a list of tokens.
            segment_text = " ".join(segment) if isinstance(segment, list) else str(segment)
            if segment_text.strip():
                src_text.append(segment_text)

model_name = "google/pegasus-xsum"
# NOTE: this rebinds the notebook-wide ``device``; later cells run on CPU too.
device = "cpu"

print("summarizing")
# device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

print("generating")

# Generate in small batches so memory use stays bounded no matter how many
# segments were cached.
batch_size = 8
generated_batches = []
with torch.no_grad():
    for start in range(0, len(src_text), batch_size):
        batch = tokenizer(
            src_text[start:start + batch_size],
            truncation=True,
            padding="longest",
            return_tensors="pt",
        ).to(device)
        generated_batches.append(model.generate(**batch))

print("decoding")

# The model is not needed for decoding; release it before building the output.
del model

tgt_text = []
for translated in generated_batches:
    tgt_text.extend(tokenizer.batch_decode(translated, skip_special_tokens=True))
print(tgt_text)

In [None]:
# # model_name = "google/pegasus-xsum"
# model_name = "facebook/bart-large-cnn"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# from transformers import BertTokenizer
# from summarizer import Summarizer

# # model_name = 'bert-large-uncased'
# # tokenizer = BertTokenizer.from_pretrained(model_name)
# # summarizer = Summarizer(model=model_name)

# # tokenizer = AutoTokenizer.from_pretrained(model_name)
# # Load tokenizer and model
# # tokenizer = AutoTokenizer.from_pretrained("your-transformer-model-name")
# # model = AutoModel.from_pretrained("your-transformer-model-name")
# device = 'cuda'
# # Define maximum sequence length and batch size
# # max_seq_length = 170 0 # Adjust as needed
# # max_sum_length = 10
# # Tokenize the text

# summary_text = []
# pending_text = ""

# src_files = []
# for file in os.listdir("processed_documents"):
#     if file.endswith(".json"):
#         with open(os.path.join("processed_documents", file), "r") as f:
#             print(file)
#             src_text = json.load(f)

#             for sentence in src_text:
                
#                 # print("TEXT", i)
#                 if isinstance(sentence, list):
#                     es_text = " ".join(sentence)
#                 else:
#                     es_text = str(sentence)
                    
#                 en_text = translate_es_en(es_text)
#                 # print("EN TEXT", en_text)
#                 pending_text += en_text
                
#                 text_to_summarize = pending_text
                
#                 max_summary_len = len(text_to_summarize.split(" "))
#                 if max_summary_len < 10:
#                     print("Short Sentence: ", es_text)
                
#                 else:
#                     min_summary_len = min(max(int(max_summary_len/2), 1), 30)
                    
#                     print(min_summary_len, max_summary_len)    
#                     summary = summarizer(text_to_summarize, max_length=max_summary_len, min_length=min_summary_len)[0]["summary_text"]

#                     es_text = translate_en_es(summary)
#                     print("SUMMARY: ", es_text)
                
#                 pending_text = ""


https://github.com/normal-computing/outlines

# GPT-2 Summarizer

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Fetch the pre-trained GPT-2 large tokenizer (vocabulary) and the matching
# language-model weights.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
model = GPT2LMHeadModel.from_pretrained("gpt2-large")

# Place the weights on the device selected earlier in the notebook (cuda
# when a GPU is in use).
model.to(device)

# Evaluation mode deactivates the dropout modules, which is required for
# reproducible results.
model.eval()

# Summarize text
def summarize_text_gpt2(text, max_new_tokens=60):
    """Generate a short summary of ``text`` with the loaded GPT-2 model.

    The text is followed by a "TL;DR:" cue and the model continues from
    there. Only the newly generated continuation is returned, not the input.

    Args:
        text: Text to summarize.
        max_new_tokens: Maximum number of tokens to generate for the summary.

    Returns:
        The generated summary, stripped of surrounding whitespace.
    """
    # The cue goes after the (possibly truncated) text so it is never cut off.
    cue_ids = tokenizer.encode("\nTL;DR:")

    # Leave room in the context window for the cue and the generated tokens.
    budget = max(model.config.n_positions - max_new_tokens - len(cue_ids), 0)
    prompt_ids = tokenizer.encode(text)[:budget] + cue_ids

    # Convert the prompt to a tensor on the same device as the model.
    tokens_tensor = torch.tensor([prompt_ids]).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            tokens_tensor,
            attention_mask=torch.ones_like(tokens_tensor),
            max_new_tokens=max_new_tokens,
            do_sample=False,
            no_repeat_ngram_size=2,
            # GPT-2 has no pad token; reuse EOS so generate() does not warn.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt and keep only the continuation.
    summary_ids = output_ids[0, tokens_tensor.shape[1]:]
    return tokenizer.decode(summary_ids, skip_special_tokens=True).strip()


ConnectionError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /gpt2-large/resolve/main/tf_model.h5 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb77960cad0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [None]:
# Placeholder inputs; replace with the real documents to summarize.
documents = ["Document 1...", "Document 2..."]
# Maximum number of tokens generated per summary.
max_summary_length = 60

input_texts = ["Summarize: " + doc for doc in documents]

# GPT-2 ships without a padding token. Reuse EOS and pad on the left so each
# prompt ends right where generation starts.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

inputs = tokenizer(
    input_texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    # Keep room in the 1024-token context for the generated summary.
    max_length=1024 - max_summary_length,
).to(device)

with torch.no_grad():
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_summary_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

# Every row shares the same padded prompt length; decode only what follows it.
prompt_length = inputs["input_ids"].shape[1]
for i, summary_id in enumerate(summary_ids):
    summary = tokenizer.decode(summary_id[prompt_length:], skip_special_tokens=True)
    print(f"Summary {i+1}: {summary}")
# Remember that GPT-2 may not always produce perfectly coherent or accurate summaries, especially for complex documents. It's a good idea to experiment with different parameters and possibly fine-tuning to achieve the best results for your specific use case.





