In [1]:
!pip install langchain langchain-core langchain-openai langchain_community
!pip install docx2txt pypdf unstructured sentence_transformers langchain_chroma

Collecting langchain-openai
  Downloading langchain_openai-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
INFO: pip is looking at multiple versions of langchain-openai to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-openai
  Downloading langchain_openai-1.0.3-py3-none-any.whl.metadata (2.6 kB)
  Downloading langchain_openai-1.0.2-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-0.3.35-py3-none-any.whl.metadata (2.4 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain_community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

print("Loaded:", api_key is not None)

In [4]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

In [None]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_documents(folder_path: str) -> List[Document]:
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif filename.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader.load())
    return documents

folder_path = "/content/docs"
documents = load_documents(folder_path)
print(f"Loaded {len(documents)} documents from the folder.")

Loaded 2 documents from the folder.


In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

splits = text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")

Split the documents into 4 chunks.


In [None]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
document_embeddings = embedding_function.embed_documents([split.page_content for split in splits])
print(document_embeddings[:][:])  # Printing first 5 elements of the first embedding

  embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[0.03779206797480583, 0.044629067182540894, -0.005410768557339907, 0.058677855879068375, -0.0012764562852680683, 0.047622378915548325, -0.023229219019412994, 0.06790649890899658, -0.01884857937693596, 0.031792931258678436, 0.04777897521853447, -0.059038564562797546, 0.028084784746170044, -0.011974037624895573, 0.020779810845851898, -0.0274360328912735, -0.039807386696338654, -0.06913729012012482, -0.018641579896211624, -0.07042217999696732, 0.04751511290669441, 0.0018589104292914271, 0.08070768415927887, 0.036426205188035965, -0.12356121838092804, 0.07355396449565887, -0.006882114801555872, -0.010735769756138325, -0.043967653065919876, -0.046008169651031494, 0.0031541013158857822, -0.021798165515065193, 0.07870964705944061, 0.05358723923563957, -0.05331822484731674, 0.0029734051786363125, -0.037264447659254074, 0.00869439821690321, 0.01878541335463524, 0.0001410669065080583, -0.05955333635210991, -0.07246547937393188, 0.058996155858039856, -0.007093359250575304, 0.05234265327453613, -

In [None]:
from langchain_chroma import Chroma

collection_name = "my_collection"
vectorstore = Chroma.from_documents(
    collection_name=collection_name,
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db"
)
print("Vector store created and persisted to './chroma_db'")

Vector store created and persisted to './chroma_db'


In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
retriever_results = retriever.invoke("what shoule be included in the report?")
print(retriever_results)


[Document(id='158b3599-fd10-47bc-9fa3-930a434ef2a7', metadata={'total_pages': 1, 'moddate': '2024-10-10T16:05:52+05:30', 'author': 'H.M. Samadhi Chathuranga Rathnayake', 'page': 0, 'source': '/content/docs/Report Marking Scheme.pdf', 'creationdate': '2024-10-10T16:05:52+05:30', 'page_label': '1', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365'}, page_content='research problem. \n• Overall Report Quality (Presentation, Formatting, References) (5 marks) \no Well-structured report with proper formatting, grammar, and \ncitations.'), Document(id='cde0b4be-e8cf-42e4-a086-9adbbd28b572', metadata={'creationdate': '2024-10-10T16:05:52+05:30', 'total_pages': 1, 'source': '/content/docs/Report Marking Scheme.pdf', 'author': 'H.M. Samadhi Chathuranga Rathnayake', 'producer': 'Microsoft® Word for Microsoft 365', 'page_label': '1', 'creator': 'Microsoft® Word for Microsoft 365', 'page': 0, 'moddate': '2024-10-10T16:05:52+05:30'}, page_content='Marking

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

template = """Answer the question based only on the following context:
{context}
Question: {question}
Answer: """

prompt = ChatPromptTemplate.from_template(template)

def docs2str(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | docs2str, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [5]:
llm_response = llm.invoke("Tell me a joke")
llm_response

AIMessage(content='Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 11, 'total_tokens': 28, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_50906f2aac', 'id': 'chatcmpl-CgTnUQXINNfX6l7DZLwXJ3mUTjYWn', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--e090dcc6-61af-450b-87ca-3a90029bc3cc-0', usage_metadata={'input_tokens': 11, 'output_tokens': 17, 'total_tokens': 28, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [None]:
chain = llm | output_parser
result = chain.invoke("Tell me a joke")
print(result)

Why did the scarecrow win an award?

Because he was outstanding in his field!


In [None]:
from typing import List
from pydantic import BaseModel, Field

class MobileReview(BaseModel):
    phone_model: str = Field(description="Name and model of the phone")
    rating: float = Field(description="Overall rating out of 5")
    pros: List[str] = Field(description="List of positive aspects")
    cons: List[str] = Field(description="List of negative aspects")
    summary: str = Field(description="Brief summary of the review")

review_text = """
Just got my hands on the new Galaxy S21 and wow, this thing is slick! The screen is gorgeous,
colors pop like crazy. Camera's insane too, especially at night - my Insta game's never been
stronger. Battery life's solid, lasts me all day no problem.
Not gonna lie though, it's pretty pricey. And what's with ditching the charger? C'mon Samsung.
Also, still getting used to the new button layout, keep hitting Bixby by mistake.
Overall, I'd say it's a solid 4 out of 5. Great phone, but a few annoying quirks keep it from
being perfect. If you're due for an upgrade, definitely worth checking out!
"""

structured_llm = llm.with_structured_output(MobileReview)
output = structured_llm.invoke(review_text)
print(output)
print(output.pros)


phone_model='Galaxy S21' rating=4.0 pros=['Gorgeous screen with vibrant colors', 'Insane camera performance, especially at night', 'Solid battery life, lasts all day'] cons=['High price point', 'No charger included in the box', 'New button layout takes time to get used to, accidental Bixby activations'] summary='The Galaxy S21 is a stunning phone with a fantastic display and impressive camera capabilities, making it great for social media enthusiasts. However, the lack of accessories and the price might deter some buyers.'
['Gorgeous screen with vibrant colors', 'Insane camera performance, especially at night', 'Solid battery life, lasts all day']


In [None]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("Tell me a short joke about {topic}")
chain = prompt | llm | output_parser
result = chain.invoke({"topic": "dogs"})
print(result)


Why did the dog sit in the shade?

Because he didn't want to become a hot dog!


In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

messages = [
    SystemMessage(content="You are a helpful assistant that tells jokes."),
    HumanMessage(content="Tell me about programming")
]
response = llm.invoke(messages)
print(response)

template = ChatPromptTemplate([
    ("system", "You are a helpful assistant that tells jokes."),
    ("human", "Tell me about {topic}")
])
chain = template | llm | output_parser
response = chain.invoke({"topic": "programming"})
print(response)


In [None]:
print(documents[1])

In [None]:
print(splits[3])

In [None]:
query = "what shoule be included in the report?"
search_results = vectorstore.similarity_search(query, k=2)
print(f"\nTop 2 most relevant chunks for the query: '{query}'\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()

In [None]:
question = "what are the contents in the report?"
response = rag_chain.invoke(question)
print(f"Question: {question}")
print(f"Answer: {response}")


Question: what are the contents in the report?
Answer: The contents in the report include the following sections:

1. Title Page – Includes the study title, group members, and date.
2. Abstract – A brief summary of the study, objectives, methods, key findings, and conclusions.
3. Introduction
4. The Problem – Explanation of the issue or question.
5. Main Objective – Statement of the main aim of the study.
6. Sub-Objectives – List of specific goals.
7. Methodology
   - Target Population – Description of the population.
   - Sampling Method – Details on how the sample was selected.
   - Data Collection – Explanation of the process of gathering data.
8. Descriptive Statistics – Inclusion of all statistical summaries.
9. Data Visualization – Presentation of key visual representations of findings.
10. Hypothesis Testing – Details of conducted hypothesis tests and their outcomes.
11. Regression Analysis – Discussion of regression results and interpretations.


In [None]:
# Example conversation
from langchain_core.messages import HumanMessage, AIMessage
chat_history = []
chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=response)
])

In [None]:
from langchain_core.prompts import MessagesPlaceholder
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# history_aware_retriever = create_history_aware_retriever(
#     llm, retriever, contextualize_q_prompt
# )
contextualize_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_chain.invoke({"input": "what should be included in it?", "chat_history": chat_history})

'What key components should be included in a report?'

In [None]:
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage

chat_history = []

def ask_question(user_question, chat_history, llm, retriever):
    print(f"\n--- You asked: {user_question} ---")

    # Step 1: Make the question standalone using conversation history
    if chat_history:
        messages = [
            SystemMessage(content="""
Given a chat history and the latest user question
which might reference context in the chat history,
formulate a standalone question which can be understood
without the chat history. Do NOT answer the question,
just reformulate it if needed and otherwise return it as is.
"""),
        ] + chat_history + [
            HumanMessage(content=f"New question: {user_question}")
        ]

        # Use LLM to rewrite question
        rewritten = llm.invoke(messages)  # returns AIMessage
        search_question = rewritten.content.strip()
        print(f"Searching for: {search_question}")
    else:
        search_question = user_question

    # Step 2: Retrieve documents
    docs = retriever.invoke(search_question)

    # Convert list of Document objects into a single text string
    docs_text = "\n\n".join([doc.page_content for doc in docs])

    # Step 3: Generate answer from documents
    answer_msg = llm.invoke([
        SystemMessage(content=f"Answer the question based on the following context:\n{docs_text}"),
        HumanMessage(content=search_question)
    ])
    answer_text = answer_msg.content

    # Step 4: Update chat history
    chat_history.extend([
        HumanMessage(content=user_question),
        AIMessage(content=answer_text)
    ])

    return answer_text

In [None]:
# Example usage
q1 = "what are the contents in the report?"
a1 = ask_question(q1, chat_history, llm, retriever)
print(f"AI: {a1}\n")

q2 = "how are the marks allocated in it?"
a2 = ask_question(q2, chat_history, llm, retriever)
print(f"AI: {a2}")


--- You asked: what are the contents in the report? ---
AI: The contents of the report for IE2024 Probability and Statistics study should include the following sections:

1. **Title Page** – Includes study title, group members, and date.
2. **Abstract** – A brief summary of the study, objectives, methods, key findings, and conclusions.
3. **Introduction**
4. **The Problem** – Explanation of the issue or question being addressed.
5. **Main Objective** – The main aim of the study.
6. **Sub-Objectives** – Specific goals of the study.
7. **Methodology**
   - **Target Population** – Description of the population being studied.
   - **Sampling Method** – Details on how the sample was selected.
   - **Data Collection** – Explanation of the data gathering process.
8. **Descriptive Statistics** – Statistical summaries of the data.
9. **Data Visualization** – Key visual representations of the findings.
10. **Hypothesis Testing** – Details of the hypothesis tests conducted and their outcomes.
11

In [None]:
import sqlite3
from datetime import datetime
import uuid

DB_NAME = "rag_app.db"

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def create_application_logs():
    conn = get_db_connection()
    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    user_query TEXT,
    gpt_response TEXT,
    model TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
    conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    conn = get_db_connection()
    conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                 (session_id, user_query, gpt_response, model))
    conn.commit()
    conn.close()

def get_chat_history(session_id):
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
    messages = []
    for row in cursor.fetchall():
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    conn.close()
    return messages

# Initialize the database
create_application_logs()

In [None]:
session_id = str(uuid.uuid4())
q1 = "what are the contents in the report??"
chat_history = get_chat_history(session_id)
answer = ask_question(q1, chat_history, llm, retriever)
insert_application_logs(session_id, q1, answer, "gpt-4o-mini")
print(f"Human: {q1}")
print(f"AI: {answer}\n")


--- You asked: what are the contents in the report?? ---
Human: what are the contents in the report??
AI: The report consists of the following key sections:

1. **Introduction (Problem, Objectives)**  
   - Clear problem statement  
   - Well-defined main and sub-objectives  

2. **Methodology (Population, Sampling, Data Collection)**  
   - Detailed description of the population  
   - Sampling method  
   - Data collection process  

3. **Descriptive Statistics**  
   - Comprehensive presentation of descriptive statistics  
   - Correct interpretation of statistics  

4. **Data Visualization**  
   - Well-presented visualizations (e.g., charts, graphs)  
   - Relevant explanations of the visualizations  

5. **Hypothesis Testing**  
   - Correct use of statistical tests  
   - Accurate interpretation and explanation of results  

6. **Regression Analysis**  
   - Clear explanation of regression analysis  
   - Relevance of regression analysis to the study  

7. **Conclusion**  
   -

In [None]:
q2 = "how are the marks allocated in it?"
chat_history = get_chat_history(session_id)
answer2 = ask_question(q2, chat_history, llm, retriever)
insert_application_logs(session_id, q2, answer2, "gpt-4o-mini")
print(f"Human: {q2}")
print(f"AI: {answer2}\n")


--- You asked: how are the marks allocated in it? ---
Searching for: How are the marks allocated in the grading of the report?
Human: how are the marks allocated in it?
AI: The marks are allocated in the grading of the report as follows:

- Introduction (Problem, Objectives): 5 marks
- Methodology (Population, Sampling, Data Collection): 5 marks
- Descriptive Statistics: 5 marks
- Data Visualization: 5 marks
- Hypothesis Testing: 5 marks
- Regression Analysis: 5 marks
- Conclusion: 5 marks
- Overall Report Quality (Presentation, Formatting, References): 5 marks

In total, the report is graded out of 40 marks.

