In [1]:
!pip install langchain chromadb tiktoken



In [2]:
# Step 2: Imports
import os
import shutil
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings

In [3]:
with open("transportation_test_text.txt", "r", encoding="utf-8") as f:
    text = f.read()


In [4]:
# Step 4: Split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100
)
chunks = text_splitter.split_text(text)
documents = [Document(page_content=chunk) for chunk in chunks]

In [5]:
chunks

['The invention of the wheel revolutionized human movement. American whole magazine truth stop whose. On traditional measure example sense peace. Would mouth relate own chair. Role together range line. Government first policy daughter.',
 'Governments worldwide are investing in infrastructure to support new transport methods. Kind miss artist truth trouble behavior style. Ability management test during foot that course nothing. Sound central myself before year. Your majority feeling fact by four two. White owner onto knowledge other. First drug contain start almost wonder.',
 'Today, electric vehicles and autonomous cars represent a new era of mobility. Attorney quickly candidate change although bag record. Raise study modern miss dog Democrat quickly. Often late produce you true soldier. Food break onto friend.\n\nHigh-speed trains and subways reshaped urban transportation. Administration even relate head color. Staff beyond chair recently and off. Own available buy country store buil

In [6]:
# Step 5: Create custom embedding class using requests
class CustomEmbedding(Embeddings):
    def embed_documents(self, texts):
        url = "http://localhost:11434/v1/embeddings"
        headers = {"Content-Type": "application/json"}
        data = {
            "model": "mxbai-embed-large:latest",
            "input": texts
        }
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        return [d["embedding"] for d in response.json()["data"]]
    
    def embed_query(self, text):
        return self.embed_documents([text])[0]

embedding_model = CustomEmbedding()

In [7]:
# Step 6: Store in Chroma DB
persist_directory = "chroma_db"


In [8]:
print(Embeddings)

<class 'langchain_core.embeddings.embeddings.Embeddings'>


In [9]:
# Step 7: Create Chroma vector store and persist
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=persist_directory
)

vectorstore.persist()
print("✅ Embeddings stored successfully in Chroma DB.")

✅ Embeddings stored successfully in Chroma DB.


  vectorstore.persist()


In [10]:
# Explicitly request embeddings, metadata, and documents
results = vectorstore.get(include=["embeddings", "metadatas", "documents"])

# Print each chunk and its embedding
for i, (embedding, metadata, document) in enumerate(zip(results["embeddings"], results["metadatas"], results["documents"])):
    print(f"\n🔹 Chunk {i+1}:")
    print(f"Text: {document[:100]}...")  # Print first 100 characters of the chunk
    print(f"Embedding: {embedding[:5]}...")  # Print first 5 dimensions for brevity



🔹 Chunk 1:
Text: The invention of the wheel revolutionized human movement. American whole magazine truth stop whose. ...
Embedding: [ 0.00135405 -0.00638477  0.02167933  0.0439051   0.01640437]...

🔹 Chunk 2:
Text: Governments worldwide are investing in infrastructure to support new transport methods. Kind miss ar...
Embedding: [ 0.01892734 -0.01751047  0.02744835  0.00064038  0.00643887]...

🔹 Chunk 3:
Text: Today, electric vehicles and autonomous cars represent a new era of mobility. Attorney quickly candi...
Embedding: [ 0.01401016  0.00997913 -0.00108978  0.02315456  0.00525275]...

🔹 Chunk 4:
Text: Futuristic ideas such as hyperloops, flying cars, and space travel dominate discussions. Decision so...
Embedding: [ 0.02067159  0.00056219 -0.00738473  0.00110476  0.00992023]...

🔹 Chunk 5:
Text: High-speed trains and subways reshaped urban transportation. Exactly drive well good explain grow wa...
Embedding: [ 0.01034095  0.00454821 -0.00739463  0.00665863 -0.01190622]...

🔹 Chunk 6

In [11]:
# Step 6: Store in Chroma DB
persist_directory = "chroma_db"

# Step 7: Create Chroma vector store and persist
try:
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_directory
    )
    vectorstore.persist()
    print("✅ Embeddings stored successfully in Chroma DB.")
except Exception as e:
    print(f"Error storing embeddings: {e}")
    import sys
    sys.exit(1)

# Step 8: Explicitly request embeddings, metadata, and documents
results = vectorstore.get(include=["embeddings", "metadatas", "documents"])
print(f"Number of embeddings: {len(results['embeddings'])}")

# Step 9: Print each chunk and its embedding
for i, (embedding, metadata, document) in enumerate(zip(results["embeddings"], results["metadatas"], results["documents"])):
    print(f"\n🔹 Chunk {i+1}:")
    print(f"Text: {document[:100]}...")  # First 100 characters
    print(f"Embedding: {embedding[:5]}...")  # First 5 dimensions

✅ Embeddings stored successfully in Chroma DB.
Number of embeddings: 1880

🔹 Chunk 1:
Text: The invention of the wheel revolutionized human movement. American whole magazine truth stop whose. ...
Embedding: [ 0.00135405 -0.00638477  0.02167933  0.0439051   0.01640437]...

🔹 Chunk 2:
Text: Governments worldwide are investing in infrastructure to support new transport methods. Kind miss ar...
Embedding: [ 0.01892734 -0.01751047  0.02744835  0.00064038  0.00643887]...

🔹 Chunk 3:
Text: Today, electric vehicles and autonomous cars represent a new era of mobility. Attorney quickly candi...
Embedding: [ 0.01401016  0.00997913 -0.00108978  0.02315456  0.00525275]...

🔹 Chunk 4:
Text: Futuristic ideas such as hyperloops, flying cars, and space travel dominate discussions. Decision so...
Embedding: [ 0.02067159  0.00056219 -0.00738473  0.00110476  0.00992023]...

🔹 Chunk 5:
Text: High-speed trains and subways reshaped urban transportation. Exactly drive well good explain grow wa...
Embedding: [

In [12]:
# Step 10: Process user query for keyword and vector search
query = "transportation innovations"  # Example query; replace with user input

# Keyword search on Chroma DB
try:
    keyword_results = vectorstore.similarity_search(query, k=5)
    print(f"\nKeyword search results for '{query}':")
    for i, result in enumerate(keyword_results):
        print(f"Result {i+1}:")
        print(f"Content: {result.page_content[:100]}...")
        print(f"Metadata: {result.metadata}")
except Exception as e:
    print(f"Error in keyword search: {e}")
    keyword_results = []


Keyword search results for 'transportation innovations':
Result 1:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}
Result 2:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}
Result 3:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}
Result 4:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}
Result 5:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}


In [13]:
# Step 11: Embed the query
try:
    embedded_query = embedding_model.embed_query(query)
    print(f"\nEmbedded query (first 5 dimensions): {embedded_query[:5]}...")
except Exception as e:
    print(f"Error embedding query: {e}")
    embedded_query = [0.0] * 1024  # Fallback: zero vector


Embedded query (first 5 dimensions): [0.009404521, 0.009462016, 0.0059296805, 4.9838603e-05, 0.0012932946]...


In [14]:
# Step 12: Vector search using embedded query
try:
    vector_results = vectorstore.similarity_search_by_vector(embedded_query, k=2)
    print(f"\nVector search results for '{query}':")
    for i, result in enumerate(vector_results):
        print(f"Result {i+1}:")
        print(f"Content: {result.page_content[:100]}...")
        print(f"Metadata: {result.metadata}")
except Exception as e:
    print(f"Error in vector search: {e}")
    vector_results = []


Vector search results for 'transportation innovations':
Result 1:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}
Result 2:
Content: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...
Metadata: {}


In [15]:
# Step 13: Combine and deduplicate chunks
combined_chunks = []
chunk_texts = set()  # For deduplication
for result in keyword_results + vector_results:
    chunk_text = result.page_content
    if chunk_text not in chunk_texts:
        combined_chunks.append(chunk_text)
        chunk_texts.add(chunk_text)
print(f"\nCombined unique chunks: {len(combined_chunks)}")


Combined unique chunks: 1


In [16]:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Step 14.1: Deduplicate chunks
unique_chunks = list(set(combined_chunks))

# Step 14.2: TF-IDF re-ranking based on cosine similarity
vectorizer = TfidfVectorizer().fit([query] + combined_chunks)
query_vec = vectorizer.transform([query])
chunk_vecs = vectorizer.transform(combined_chunks)
similarities = cosine_similarity(query_vec, chunk_vecs).flatten()

# Step 14.3: Print similarity scores for all chunks
print("🔎 Cosine Similarity Scores:")
for i, (chunk, score) in enumerate(zip(unique_chunks, similarities)):
    preview = chunk[:100].replace("\n", " ") + ("..." if len(chunk) > 100 else "")
    print(f"[{i}] Score: {score:.4f} | Chunk: {preview}")


# Step 14.4: Rank chunks by similarity and select top-k (e.g., top 5)
top_k = 5
top_indices = np.argsort(similarities)[::-1][:top_k]
reranked_chunks = [combined_chunks[i] for i in top_indices]

# Step 14.4: Print top-k chunks and their scores
print(f"\n📌 Top-{top_k} Most Relevant Chunks:")
for rank, idx in enumerate(top_indices, start=1):
    score = similarities[idx]
    chunk = unique_chunks[idx]
    preview = chunk[:300].replace("\n", " ") + ("..." if len(chunk) > 300 else "")
    print(f"\n#{rank} | Score: {score:.4f}\n{preview}")

# Step 14.5: Optional prompt compression: truncate each chunk to 300 characters
compressed_chunks = [chunk[:300] for chunk in reranked_chunks]

# Step 14.6: Rebuild final context for LLM
context = "\n\n".join(compressed_chunks)
llm_input = f"Query: {query}\n\nContext:\n{context}\n\nAnswer:"
print("Re-ranked and compressed context prepared for LLM.")


🔎 Cosine Similarity Scores:
[0] Score: 0.0000 | Chunk: Governments worldwide are investing in infrastructure to support new transport methods. Face ready i...

📌 Top-5 Most Relevant Chunks:

#1 | Score: 0.0000
Governments worldwide are investing in infrastructure to support new transport methods. Face ready issue that party identify. Subject management interesting idea if finally cold. Business performance city test direction call.  The invention of the wheel revolutionized human movement. Small finish te...
Re-ranked and compressed context prepared for LLM.


In [17]:
#Step 14: Prepare context for LLM
context = "\n\n".join(combined_chunks)
llm_input = f"Query: {query}\n\nContext:\n{context}\n\nAnswer the query based on the provided context."

In [18]:
# Step 15: Call LLM (llama3.2-vision:11b) via Ollama API
llm_url = "http://localhost:11434/v1/chat/completions"
llm_headers = {"Content-Type": "application/json"}
llm_data = {
    "model": "llama3.2-vision:11b",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant. Answer the query using the provided context."},
        {"role": "user", "content": llm_input}
    ],
    "max_tokens": 500
}
try:
    response = requests.post(llm_url, headers=llm_headers, json=llm_data)
    response.raise_for_status()
    llm_response = response.json()["choices"][0]["message"]["content"]
    print(f"\nLLM Response for query '{query}':")
    print(llm_response)
except requests.RequestException as e:
    print(f"Error calling LLM: {e}")
    llm_response = "Failed to generate response."


Error calling LLM: 500 Server Error: Internal Server Error for url: http://localhost:11434/v1/chat/completions
