In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from pypdf import PdfReader
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [4]:
import os
import json
import pandas as pd
import glob
from openai import OpenAI

In [5]:
# Get all PDFs inside the tsmc folder.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
# chunk positions used in later cells (e.g. all_docs[102]) depend on this order.
pdf_files = sorted(glob.glob("./tsmc/*.pdf"))

In [7]:
# Load every PDF and split it into chunks, pooling all chunks in one list.
# Splitter settings: chunk_size=500 with chunk_overlap=50 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_docs = []

for pdf in pdf_files:
    # Load one PDF into documents, then chunk those documents.
    pages = PyPDFLoader(pdf).load()
    all_docs.extend(text_splitter.split_documents(pages))

In [11]:
# Embedding backend: an Ollama embedding model, selected by its model tag.
EMBEDDING_MODEL = "mxbai-embed-large:335m"
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

In [12]:
from langchain_core.vectorstores import InMemoryVectorStore

# Build an in-memory vector store from all chunks, using the embeddings object
# configured above. Nothing is written to disk by this call.
vector_store = InMemoryVectorStore.from_documents(
    documents=all_docs,
    embedding=embeddings,
)

In [8]:
# Report corpus size. Both numbers come from the Python lists built earlier
# (pdf_files / all_docs); they are not read back from the vector store.
pdf_count = len(pdf_files)
chunk_count = len(all_docs)
print(
    f"✅ Total PDFs: {pdf_count}",
    f"✅ Total chunks stored: {chunk_count}",
    sep="\n",
)

✅ Total PDFs: 18
✅ Total chunks stored: 22476


In [9]:
# Spot-check a single chunk: print its text, then the metadata attached to it.
# NOTE: despite its name, `first_chunk` is the chunk at index 102, not index 0.
first_chunk = all_docs[102]
for heading, value in (
    ("=== Page Content ===", first_chunk.page_content),
    ("=== Metadata ===", first_chunk.metadata),
):
    print(heading)
    print(value)

=== Page Content ===
4.4 Compensation Committee
The Compensation Committee assists the Board in discharging 
its responsibilities related to TSMC’s compensation and benefits 
policies, plans and programs, and in the evaluation and compensation 
of TSMC’s executives. The Committee meets at least four times a 
year.
The Compensation Committee was comprised of four members. 
All three independent directors served as voting members of the 
Committee and the Chairman of the Board, Dr. Morris Chang,
=== Metadata ===
{'producer': 'Adobe Acrobat 8.0', 'creator': 'Adobe Acrobat 8.0 Combine Files', 'creationdate': '2011-12-07T17:43:42+08:00', 'moddate': '2011-12-07T17:48:52+08:00', 'source': './tsmc/TSMC CSR 2010.pdf', 'total_pages': 84, 'page': 15, 'page_label': '16'}


In [None]:
import ollama
from langchain_community.llms import Ollama

In [16]:
# Language model used to generate answers, selected by its Ollama model tag.
LLM_MODEL = "gpt-oss:120b"
llm = Ollama(model=LLM_MODEL)

In [None]:
# SAH: build a retrieval QA chain over the vector store and ask it one question.
qa_chain = RetrievalQA.from_chain_type(
    # The language model that will generate the final answer
    # (e.g. GPT, Ollama model, etc.).
    llm=llm,
    # The retriever defines how to look up relevant chunks of text
    # from the vector store. It embeds the question, finds similar
    # embeddings in the store, and passes those chunks to the LLM.
    retriever=vector_store.as_retriever(),
    # Return not just the answer but also the source documents
    # (the retrieved chunks) that were fed into the LLM.
    # Useful for transparency & debugging.
    return_source_documents=True,
)

query = "Tell the key points about TSMC's 3nm technology from the documents."

# Use .invoke(): calling the chain object directly (qa_chain({...})) is deprecated.
result = qa_chain.invoke({"query": query})
print(result["result"])

# The chain was asked to return its sources, so show which chunks backed the
# answer. .get() is used because not every loader is guaranteed to set these keys.
print("=== Sources ===")
for source_doc in result["source_documents"]:
    origin = source_doc.metadata.get("source")
    page = source_doc.metadata.get("page")
    print(f"{origin} (page {page})")