In [None]:
!pip install transformers torch sentence-transformers pdfplumber python-docx spacy yake
!python -m spacy download en_core_web_sm


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting 

In [None]:
import pdfplumber
import docx
import spacy
import yake
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizer, PegasusForConditionalGeneration, PegasusTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from google.colab import files
from sentence_transformers import SentenceTransformer, util

# Load the en_core_web_sm spaCy pipeline once at module level. The functions
# below call `nlp(...)` to split text into sentences (`doc.sents`) and to
# iterate over tokens for keyword-based classification.
nlp = spacy.load("en_core_web_sm")


In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the concatenation of each
        page's extracted text, each followed by a newline; pages for which
        ``extract_text()`` returns nothing are skipped. ``page_count`` is the
        total number of pages, including the skipped ones.
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        # Take the count while the file is still open instead of touching the
        # PDF object after the ``with`` block has closed it.
        page_count = len(pages)
        page_texts = [page.extract_text() for page in pages]

    # Single join instead of repeated string concatenation in the loop.
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)
    return text, page_count

def extract_text_from_docx(docx_path):
    """Read a DOCX file and return its paragraphs as newline-separated text.

    Args:
        docx_path: Path of the document, passed to ``docx.Document``.

    Returns:
        A ``(text, None)`` tuple; the second slot mirrors the page-count slot
        of the PDF extractor and is always ``None`` here.
    """
    document = docx.Document(docx_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    joined_text = "\n".join(paragraph_texts)
    return joined_text, None

def get_text_from_file():
    """Prompt for a file upload (Colab) and return the uploaded document's text.

    The first uploaded file with a supported extension (``pdf``, ``docx`` or
    ``txt``, matched case-insensitively) is read. Files with any other
    extension are reported on stdout and skipped.

    Returns:
        A ``(text, page_count)`` tuple. ``page_count`` is only set for PDFs
        and is ``None`` otherwise. ``("", None)`` is returned when nothing was
        uploaded or no uploaded file has a supported format, so callers can
        always unpack the result and test ``text`` for emptiness.
    """
    uploaded = files.upload()
    for filename in uploaded:
        # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf";
        # a name without any dot has no extension at all.
        _, dot, ext = filename.rpartition(".")
        ext = ext.lower() if dot else ""

        if ext == "pdf":
            return extract_text_from_pdf(filename)
        if ext == "docx":
            return extract_text_from_docx(filename)
        if ext == "txt":
            with open(filename, "r", encoding="utf-8") as file:
                return file.read(), None

        # Report the problem instead of returning the message as if it were
        # the document's text (which would then be summarized).
        print(f"Unsupported file format: {filename}")

    return "", None


In [None]:
def extractive_summary(text, num_sentences=5):
    """Return the leading sentences of ``text`` as a list of key points.

    The text is split into sentences with the module-level spaCy pipeline and
    each sentence is stripped of surrounding whitespace.

    Args:
        text: Document text to split.
        num_sentences: Upper bound used to slice the sentence list.

    Returns:
        The first ``num_sentences`` stripped sentences, in document order.
    """
    stripped_sentences = []
    for sentence in nlp(text).sents:
        stripped_sentences.append(sentence.text.strip())
    return stripped_sentences[:num_sentences]


In [None]:
def extract_keywords(text, num_keywords=5):
    """Extract keyword phrases from ``text`` with YAKE.

    Args:
        text: Document text to analyse.
        num_keywords: Passed to the extractor as its ``top`` setting.

    Returns:
        A list holding the first element of every entry the extractor
        returns (the keyword text), in the extractor's order.
    """
    extractor = yake.KeywordExtractor(top=num_keywords, stopwords=None)
    phrases = []
    for entry in extractor.extract_keywords(text):
        phrases.append(entry[0])
    return phrases


In [None]:
def classify_document(text):
    """Guess a coarse document category from keyword counts.

    The text is lower-cased and tokenized with the module-level spaCy
    pipeline; every token that equals one of a category's keywords adds one
    to that category's count.

    Args:
        text: Document text to classify.

    Returns:
        The capitalized name of the category with the highest count
        ("Legal", "Technical", "Academic" or "General"). Ties keep the
        category listed first. When no keyword occurs at all the result is
        "General" rather than whichever category happens to be listed first.
    """
    # Keywords must be lower-case because the text is lower-cased before
    # matching (an upper-case "AI" entry could never match).
    categories = {
        "legal": ["court", "law", "agreement", "contract", "policy"],
        "technical": ["ai", "algorithm", "data", "software", "engineering"],
        "academic": ["research", "study", "university", "experiment", "paper"],
        "general": ["news", "blog", "report", "story", "review"]
    }
    keyword_sets = {category: frozenset(words) for category, words in categories.items()}

    doc = nlp(text.lower())
    word_counts = dict.fromkeys(keyword_sets, 0)
    # Single pass over the tokens instead of one pass per category.
    for token in doc:
        for category, words in keyword_sets.items():
            if token.text in words:
                word_counts[category] += 1

    best_category = max(word_counts, key=word_counts.get)
    if word_counts[best_category] == 0:
        return "General"
    return best_category.capitalize()


In [None]:
# All models are loaded once at module level so the functions below can reuse them.

# "t5-small": passed to abstractive_summary by generate_summary for the first
# half of the paragraph summary.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# "facebook/bart-large-cnn": used by single_sentence_explanation.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# "google/pegasus-xsum": passed to abstractive_summary by generate_summary for
# the second half of the paragraph summary.
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

# "google/flan-t5-base": generates the answer text in answer_question.
qa_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
qa_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Sentence-embedding model: answer_question encodes the question and every
# document sentence with it to pick the most similar sentence.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are 

In [None]:
def abstractive_summary(text, model, tokenizer, max_length=150, max_input_length=512):
    """Generate an abstractive summary of ``text`` with a seq2seq model.

    Args:
        text: Text to summarize.
        model: Model exposing ``generate`` (and, ideally, ``config.model_type``).
        tokenizer: Tokenizer matching ``model``; must expose ``encode``/``decode``.
        max_length: Maximum length of the generated summary, in tokens.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated to this length.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is empty or
        whitespace-only.
    """
    if not text or not text.strip():
        return ""

    # "summarize: " is a T5 task prefix. Other architectures (e.g. Pegasus)
    # would treat it as part of the document, so only add it for T5 models.
    model_type = getattr(getattr(model, "config", None), "model_type", None)
    input_text = "summarize: " + text if model_type == "t5" else text

    # truncation=True needs an explicit max_length: without one, a tokenizer
    # that has no predefined maximum length silently skips truncation.
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )
    output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
def single_sentence_explanation(text):
    """Produce a very short (at most 30 generated tokens) summary with BART.

    Uses the module-level ``bart_tokenizer`` and ``bart_model``.

    Args:
        text: Text to condense.

    Returns:
        The decoded short summary, or ``""`` when ``text`` is empty or
        whitespace-only.
    """
    if not text or not text.strip():
        return ""

    # No "summarize: " prefix here: that is a T5 task prefix, and BART would
    # read it as part of the article. The explicit max_length makes the
    # truncation limit unambiguous (1024 is the input limit of bart-large-cnn).
    input_ids = bart_tokenizer.encode(text, return_tensors="pt", truncation=True, max_length=1024)
    output_ids = bart_model.generate(input_ids, max_length=30, num_beams=5, early_stopping=True)
    return bart_tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
def generate_summary(text, num_extractive=5):
    """Run the whole summarization pipeline on ``text``.

    The leading sentences are extracted first; their concatenation is what
    the abstractive models see. Keywords and the document type are computed
    from the full text.

    Args:
        text: Full document text.
        num_extractive: Number of leading sentences to keep as key points.

    Returns:
        A 5-tuple ``(key_points, paragraph_summary, explanation, keywords,
        document_type)`` where ``paragraph_summary`` is the T5 summary and the
        Pegasus summary joined by a single space.
    """
    leading_sentences = extractive_summary(text, num_extractive)
    condensed_input = " ".join(leading_sentences)

    # Two abstractive passes over the same condensed input, T5 first.
    model_summaries = [
        abstractive_summary(condensed_input, t5_model, t5_tokenizer),
        abstractive_summary(condensed_input, pegasus_model, pegasus_tokenizer),
    ]
    combined_summary = " ".join(model_summaries)

    one_liner = single_sentence_explanation(condensed_input)
    top_keywords = extract_keywords(text, num_keywords=5)
    doc_category = classify_document(text)

    return leading_sentences, combined_summary, one_liner, top_keywords, doc_category


In [None]:
# Ask how the document text will be supplied. Surrounding whitespace is
# stripped so that an answer such as "2 " still selects option 2.
choice = input("Choose input method: \n1. Enter text manually\n2. Upload a document (PDF, DOCX, TXT)\nEnter 1 or 2: ").strip()

# `text` and `page_count` stay at module level: the question-answering cell
# further down reads both.
text, page_count = "", None

if choice == "1":
    text = input("Enter the text to summarize:\n")

elif choice == "2":
    upload_result = get_text_from_file()
    # Guard the unpacking: the upload helper may yield nothing when no file
    # was provided.
    if upload_result:
        text, page_count = upload_result

else:
    print(f"Invalid choice {choice!r}: please enter 1 or 2.")

if text and text.strip():
    key_points, paragraph_summary, explanation, keywords, document_type = generate_summary(text)

    print("\nDocument Type: ", document_type)
    print("\nKeywords: ", ", ".join(keywords))
    print("\nKeypoints of this document:\n")
    for idx, sentence in enumerate(key_points, 1):
        print(f"{idx}. {sentence}")

    print("\nSummary of what is written in the document:\n")
    print(paragraph_summary)

    print("\nExplanation in a single sentence:\n")
    print(explanation)

elif choice in ("1", "2"):
    # Tell the user why nothing was summarized instead of ending silently.
    print("No text was provided, so there is nothing to summarize.")


Choose input method: 
1. Enter text manually
2. Upload a document (PDF, DOCX, TXT)
Enter 1 or 2: 2


Saving AI art in architecture.pdf to AI art in architecture (1).pdf


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Document Type:  Technical

Keywords:  Stable Diffusion, cid, Image, Diffusion models, Diffusion

Keypoints of this document:

1. Ploennigs and Berger AI in Civil Engineering (2023) 2:8 AI in Civil Engineering
https://doi.org/10.1007/s43503-023-00018-y
ORIGINAL ARTICLE Open Access
AI art in architecture
Joern Ploennigs1* and Markus Berger1
Abstract
Recent diffusion-based AI art platforms can create impressive images from simple text descriptions.
2. This makes them
powerful tools for concept design in any discipline that requires creativity in visual design tasks.
3. This is also true
for early stages of architectural design with multiple stages of ideation, sketching and modelling.
4. In this paper, we
investigate how applicable diffusion-based models already are to these tasks.
5. We research the applicability of the plat-
·
forms Midjourney, DALL E 2 and Stable Diffusion to a series of common use cases in architectural design to determine
which are already solvable or might soon be.

In [None]:
def extract_number_from_text(text):
    """Return the first standalone run of digits in ``text``.

    A run only counts when it is delimited by word boundaries, so digits
    embedded in a word (e.g. ``"a1b"``) are ignored.

    Args:
        text: String to search.

    Returns:
        The matched digits as a string, or ``None`` when there is no match.
    """
    first_match = re.search(r'\b\d+\b', text)
    if first_match is None:
        return None
    return first_match.group(0)


In [None]:
def answer_question(question, text, page_count=None):
    """Answer a question about the document text.

    Strategy, in order:
      1. If the question mentions "pages" and a page count is known, report it.
      2. Otherwise pick the document sentence whose embedding is most similar
         to the question.
      3. If the question asks for a quantity or year and that sentence
         contains a standalone number, return the number.
      4. Otherwise let the QA model generate an answer from that sentence.

    Args:
        question: The user's question.
        text: Full document text.
        page_count: Number of pages, when known (PDF input); else ``None``.

    Returns:
        The answer as a string. A fixed message is returned when the document
        contains no text to search.
    """
    question_lower = question.lower()
    if "pages" in question_lower and page_count is not None:
        return f"The document has {page_count} pages."

    # Without any sentence there is nothing to embed or index into, so bail
    # out early instead of failing on empty embeddings.
    if not text or not text.strip():
        return "The document has no text to answer from."
    doc_sentences = [s for s in (sent.text.strip() for sent in nlp(text).sents) if s]
    if not doc_sentences:
        return "The document has no text to answer from."

    doc_embeddings = sentence_model.encode(doc_sentences, convert_to_tensor=True)
    query_embedding = sentence_model.encode(question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
    # Convert to a plain int before indexing the Python list.
    best_sentence = doc_sentences[int(scores.argmax())]

    # Only take the numeric shortcut when the question actually asks for a
    # number; otherwise any digit in the best sentence (a year, a citation)
    # would be returned as "the answer" to an unrelated question.
    numeric_cues = ("how many", "how much", "what year", "which year", "number of")
    if any(cue in question_lower for cue in numeric_cues):
        extracted_number = extract_number_from_text(best_sentence)
        if extracted_number:
            return f"The answer is {extracted_number}."

    input_text = f"Question: {question} Answer using: {best_sentence}"
    # Cap the prompt length so an unusually long sentence cannot exceed the
    # model's input size.
    input_ids = qa_tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
    output_ids = qa_model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)

    return qa_tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
# Interactive question loop over the module-level `text` / `page_count`
# produced by the summarization cell above.
while True:
    # Strip the input so "exit " (trailing space) still ends the loop.
    user_question = input("\nAsk a question about the document (or type 'exit' to stop): ").strip()
    if user_question.lower() == "exit":
        break
    if not user_question:
        # Nothing was typed: ask again rather than running the models on "".
        continue
    answer = answer_question(user_question, text, page_count)
    print("\nAnswer:", answer)



Ask a question about the document (or type 'exit' to stop): number of pages in the document 

Answer: The document has 11 pages.

Ask a question about the document (or type 'exit' to stop): explain the abstract of the document 

Answer: the abstract of the document

Ask a question about the document (or type 'exit' to stop): print the content of the page number 1

Answer: The answer is 2022.

Ask a question about the document (or type 'exit' to stop): exit
