In [13]:
!pip install transformers torch sentence-transformers pdfplumber python-docx spacy yake langchain openai chromadb
!python -m spacy download en_core_web_sm


Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.13.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.30.0-py3

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
!pip install langchain_google_genai google-generativeai

import google.generativeai as genai
genai.configure(api_key="AIzaSyAhbxTrTv46RX6eJGe9YTBAsF95Xcl_5Yk")  # 🔄 Add your Gemini API Key




In [1]:
import pdfplumber
import docx
import spacy
import yake
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizer, PegasusForConditionalGeneration, PegasusTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from google.colab import files
from sentence_transformers import SentenceTransformer, util
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Load the small English spaCy pipeline once at module level; the
# extractive_summary and classify_document functions both run their input
# through this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")


In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the concatenation of the
        text of each page that yielded any, each followed by a newline.
        ``page_count`` is the total number of pages, including pages for which
        no text was extracted.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        # Take the page count while the file is still open instead of reading
        # `pdf.pages` from the already-closed handle after the `with` block.
        page_count = len(pdf.pages)
        for page in pdf.pages:
            extracted_text = page.extract_text()
            if extracted_text:
                page_texts.append(extracted_text + "\n")
    # Join once at the end rather than growing a string inside the loop.
    return "".join(page_texts), page_count

def extract_text_from_docx(docx_path):
    """Read a .docx file and return its paragraphs as newline-separated text.

    Args:
        docx_path: Path of the Word document to open.

    Returns:
        A ``(text, None)`` tuple; the second element is always ``None`` (the
        PDF extractor returns a page count in that position).
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts), None

def get_text_from_file():
    """Prompt for a file upload in Colab and return the text of the first file.

    Only the first uploaded file is processed; any further files in the same
    upload are ignored.

    Returns:
        A ``(text, page_count)`` tuple. ``page_count`` is only set for PDFs and
        is ``None`` otherwise. When nothing is uploaded the result is
        ``("", None)``, so callers can always unpack two values. For an
        unrecognised extension ``text`` is the message
        ``"Unsupported file format!"``.
    """
    uploaded = files.upload()
    for filename in uploaded:
        # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
        ext = filename.rsplit(".", 1)[-1].lower()
        if ext == "pdf":
            return extract_text_from_pdf(filename)
        if ext == "docx":
            return extract_text_from_docx(filename)
        if ext == "txt":
            with open(filename, "r", encoding="utf-8") as file:
                return file.read(), None
        # NOTE(review): the message is returned in the text slot (kept for
        # backward compatibility), so a caller that does not check for it will
        # treat the message itself as the document.
        return "Unsupported file format!", None
    # The upload was empty or cancelled: return an empty document rather than
    # an implicit None that breaks tuple unpacking in the caller.
    return "", None


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

def prepare_vector_store(text, chunk_size=500, chunk_overlap=50, model_name="all-MiniLM-L6-v2"):
    """Split ``text`` into overlapping chunks and index them in a Chroma store.

    Args:
        text: The full document text to index.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The vector store built by ``Chroma.from_texts`` from the chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = Chroma.from_texts(chunks, embeddings)

    return vector_store


In [4]:
def extractive_summary(text, num_sentences=5):
    """Return the leading sentences of ``text`` as key points.

    The text is segmented with the shared spaCy pipeline ``nlp`` and the first
    ``num_sentences`` non-empty sentences are returned in document order; no
    ranking or scoring is applied.

    Args:
        text: The document text.
        num_sentences: Maximum number of sentences to return.

    Returns:
        A list of stripped sentence strings.
    """
    doc = nlp(text)
    # Drop segments that are empty after stripping so whitespace-only
    # "sentences" do not use up key-point slots.
    stripped = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped if sentence]
    return sentences[:num_sentences]


In [5]:
def extract_keywords(text, num_keywords=5):
    """Extract keyword phrases from ``text`` with YAKE.

    Args:
        text: The document text.
        num_keywords: Value passed as ``top`` to ``yake.KeywordExtractor``.

    Returns:
        A list containing the first element of every entry the extractor
        returns, in the extractor's order.
    """
    extractor = yake.KeywordExtractor(top=num_keywords, stopwords=None)
    ranked_entries = extractor.extract_keywords(text)
    phrases = []
    for entry in ranked_entries:
        phrases.append(entry[0])
    return phrases


In [6]:
def classify_document(text):
    """Classify ``text`` into a coarse category by counting keyword hits.

    The text is lowercased and tokenised with the shared spaCy pipeline
    ``nlp``; every token equal to one of a category's keywords counts as one
    hit for that category.

    Args:
        text: The document text.

    Returns:
        The capitalised name of the category with the most hits ("Legal",
        "Technical", "Academic" or "General"). Ties go to the category listed
        first. When no keyword matches at all the result is "General".
    """
    # Keywords are stored lowercase because the text is lowercased before
    # matching; an upper-case keyword such as "AI" could never match.
    categories = {
        "legal": {"court", "law", "agreement", "contract", "policy"},
        "technical": {"ai", "algorithm", "data", "software", "engineering"},
        "academic": {"research", "study", "university", "experiment", "paper"},
        "general": {"news", "blog", "report", "story", "review"},
    }
    doc = nlp(text.lower())

    word_counts = {category: 0 for category in categories}
    for token in doc:
        for category, words in categories.items():
            if token.text in words:
                word_counts[category] += 1

    best_category = max(word_counts, key=word_counts.get)
    if word_counts[best_category] == 0:
        # With no evidence at all, do not fall through to whichever category
        # happens to be listed first.
        return "General"
    return best_category.capitalize()


In [7]:
# Load all pretrained checkpoints once at module level so later cells can reuse
# them. Each from_pretrained call downloads the checkpoint on first use (see
# the progress bars in this cell's output).

# T5-small: passed to abstractive_summary by generate_summary.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# BART large (CNN): used by single_sentence_explanation.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Pegasus (XSum): passed to abstractive_summary by generate_summary.
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

# FLAN-T5 base.
# NOTE(review): qa_tokenizer / qa_model are not referenced by any cell visible
# in this notebook — confirm they are still needed before keeping the download.
qa_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
qa_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Sentence-embedding model.
# NOTE(review): sentence_model is likewise not referenced in the visible cells.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [8]:
def abstractive_summary(text, model, tokenizer, max_length=150, max_input_length=None):
    """Generate an abstractive summary of ``text`` with a seq2seq model.

    The input is prefixed with ``"summarize: "``, encoded, and decoded from a
    beam search with 5 beams.

    Args:
        text: The text to summarise.
        model: A model exposing ``generate``.
        tokenizer: The matching tokenizer exposing ``encode`` and ``decode``.
        max_length: ``max_length`` passed to ``model.generate``.
        max_input_length: Maximum number of input tokens. When ``None`` it is
            derived from ``tokenizer.model_max_length``, then from
            ``model.config.max_position_embeddings``, and finally falls back
            to 512.

    Returns:
        The decoded summary string with special tokens removed.
    """
    if max_input_length is None:
        tokenizer_limit = getattr(tokenizer, "model_max_length", None)
        model_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
        # Tokenizers without a configured limit report a huge sentinel value;
        # only trust limits that look like real sequence lengths.
        if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < 1_000_000:
            max_input_length = tokenizer_limit
        elif isinstance(model_limit, int) and model_limit > 0:
            max_input_length = model_limit
        else:
            max_input_length = 512

    input_text = "summarize: " + text
    # `truncation=True` alone is ignored when the tokenizer has no predefined
    # maximum length (it only logs a warning), so pass the limit explicitly.
    input_ids = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    )
    output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [9]:
def single_sentence_explanation(text):
    """Produce a very short BART summary of ``text``.

    Delegates to ``abstractive_summary`` with the module-level BART model and
    tokenizer, capping the generated output at 30 tokens, instead of
    duplicating the encode/generate/decode steps.

    Args:
        text: The text to condense.

    Returns:
        The decoded summary string.
    """
    return abstractive_summary(text, bart_model, bart_tokenizer, max_length=30)


In [10]:
def generate_summary(text, num_extractive=5):
    """Run the full summarisation pipeline on ``text``.

    The leading sentences are selected first; their concatenation is then fed
    to T5 and Pegasus (joined into one paragraph) and to the single-sentence
    explainer. Keywords and the document type are computed from the full text.

    Args:
        text: The document text.
        num_extractive: Number of sentences requested from ``extractive_summary``.

    Returns:
        A 5-tuple ``(key_points, paragraph_summary, explanation, keywords,
        document_type)``.
    """
    lead_sentences = extractive_summary(text, num_extractive)
    condensed = " ".join(lead_sentences)

    # T5 output first, Pegasus second, separated by a single space.
    abstracts = [
        abstractive_summary(condensed, t5_model, t5_tokenizer),
        abstractive_summary(condensed, pegasus_model, pegasus_tokenizer),
    ]
    one_liner = single_sentence_explanation(condensed)
    top_keywords = extract_keywords(text, num_keywords=5)
    category = classify_document(text)

    return lead_sentences, " ".join(abstracts), one_liner, top_keywords, category


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def answer_question_langchain(question, vector_store):
    """Answer ``question`` from the indexed document with Gemini via LangChain.

    A ``RetrievalQA`` "stuff" chain is built on every call: the single most
    similar chunk is retrieved from ``vector_store`` and inserted into the
    prompt together with the question.

    Args:
        question: The user's question.
        vector_store: A vector store exposing ``as_retriever``.

    Returns:
        The ``"result"`` entry of the chain's output.

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    import os

    # Read the key from the environment instead of hardcoding it in the notebook.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set; export your Gemini API key before asking questions."
        )

    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=api_key,
    )

    # k=1: only the single best-matching chunk is given to the model as context.
    retriever = vector_store.as_retriever(search_kwargs={"k": 1})

    prompt_template = PromptTemplate(
        template="You are an AI assistant analyzing a document. Use the given document context to answer accurately.\n\n{context}\n\nQuestion: {question}\n\nAnswer:",
        input_variables=["context", "question"],
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template},
    )

    # Return only the answer text, not the whole chain output.
    return qa_chain.invoke({"query": question})["result"]


In [12]:
# Interactive entry point: obtain the text (typed or uploaded), then print the
# document type, keywords, key points, paragraph summary and one-line explanation.
choice = input("Choose input method: \n1. Enter text manually\n2. Upload a document (PDF, DOCX, TXT)\nEnter 1 or 2: ").strip()

text, page_count = "", None

if choice == "1":
    text = input("Enter the text to summarize:\n")

elif choice == "2":
    # get_text_from_file() can come back with nothing when the upload is empty
    # or cancelled; only unpack a real result.
    upload_result = get_text_from_file()
    if upload_result is not None:
        text, page_count = upload_result

else:
    print("Invalid choice: please enter 1 or 2.")

if text and text.strip():
    key_points, paragraph_summary, explanation, keywords, document_type = generate_summary(text)

    print("\nDocument Type: ", document_type)
    print("\nKeywords: ", ", ".join(keywords))
    print("\nKeypoints of this document:\n")
    for idx, sentence in enumerate(key_points, 1):
        print(f"{idx}. {sentence}")

    print("\nSummary of what is written in the document:\n")
    print(paragraph_summary)

    print("\nExplanation in a single sentence:\n")
    print(explanation)

elif choice in ("1", "2"):
    # Tell the user why nothing was printed instead of ending silently.
    print("No text was provided, so there is nothing to summarize.")


Choose input method: 
1. Enter text manually
2. Upload a document (PDF, DOCX, TXT)
Enter 1 or 2: 2


Saving AI art in architecture.pdf to AI art in architecture (1).pdf


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Document Type:  Technical

Keywords:  Stable Diffusion, cid, Image, Diffusion models, Diffusion

Keypoints of this document:

1. Ploennigs and Berger AI in Civil Engineering (2023) 2:8 AI in Civil Engineering
https://doi.org/10.1007/s43503-023-00018-y
ORIGINAL ARTICLE Open Access
AI art in architecture
Joern Ploennigs1* and Markus Berger1
Abstract
Recent diffusion-based AI art platforms can create impressive images from simple text descriptions.
2. This makes them
powerful tools for concept design in any discipline that requires creativity in visual design tasks.
3. This is also true
for early stages of architectural design with multiple stages of ideation, sketching and modelling.
4. In this paper, we
investigate how applicable diffusion-based models already are to these tasks.
5. We research the applicability of the plat-
·
forms Midjourney, DALL E 2 and Stable Diffusion to a series of common use cases in architectural design to determine
which are already solvable or might soon be.

In [36]:
import time

# Retry policy for HTTP 429 (quota / rate-limit) errors.
MAX_RATE_LIMIT_ATTEMPTS = 3
INITIAL_RETRY_DELAY_SECONDS = 10

# Index the document text produced by the summary cell.
vector_store = prepare_vector_store(text)

while True:
    user_question = input("\nAsk a question about the document (or type 'exit' to stop): ").strip()
    if user_question.lower() == "exit":
        break
    if not user_question:
        # Do not spend an API call on a blank question.
        continue

    delay = INITIAL_RETRY_DELAY_SECONDS

    for attempt in range(1, MAX_RATE_LIMIT_ATTEMPTS + 1):
        try:
            response = answer_question_langchain(user_question, vector_store)
        except Exception as e:
            print("\n⚠️ Error:", str(e))
            if "429" not in str(e):
                break  # Other errors should not retry
            if attempt == MAX_RATE_LIMIT_ATTEMPTS:
                # Say so explicitly instead of silently returning to the prompt,
                # and skip the pointless final sleep.
                print(f"\nGiving up on this question after {MAX_RATE_LIMIT_ATTEMPTS} rate-limited attempts.")
                break
            print(f"\n🔄 Waiting for {delay} seconds due to quota limit...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff between attempts
        else:
            print("\nAnswer:", response)
            break



Ask a question about the document (or type 'exit' to stop): summary of the page 2

Answer: Page 2's analysis shows that Midjourney users don't use full sentences in their prompts.  Instead, they use a probabilistic collection of terms related to content, style, or render quality.  These terms create interesting links between seemingly disparate concepts like interior design, floor plan drawing, architectural visualization, gothic cathedrals, and swimming pools.  The analysis (visualized in Figure 2) allows for the creation of an autocomplete function.

Ask a question about the document (or type 'exit' to stop): exit
