In [2]:
# Load in all the libraries and documents needed for the project
import os 
import bs4
import markdown
import psycopg2
import requests

from datetime import datetime
from dotenv import load_dotenv
from pprint import pprint

from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.document_loaders  import TextLoader
from langchain.text_splitter  import RecursiveCharacterTextSplitter
from langchain.embeddings  import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredPDFLoader, WebBaseLoader, UnstructuredMarkdownLoader, UnstructuredWordDocumentLoader

from langchain.vectorstores.pgvector import PGVector
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.


True

### Load all the API clients needed for this project

In [9]:
# LangSmith tracing settings, read from the environment (None when unset)
trace = os.environ.get("LANGCHAIN_TRACING_V2")
langsmith = os.environ.get("LANGCHAIN_API_KEY")

# Chat model used to answer questions
gpt = ChatOpenAI(model="gpt-4o", temperature=0.7)

# Embedding model used when writing to and querying the vector store
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    model="text-embedding-3-large",
)

# Smoke test: one round trip to confirm the chat model is reachable
response = gpt.invoke("Testing the connection are you able to receive my message?")
print(response)

Python(27649) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


content='Yes, I can receive your message! How can I assist you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 18, 'total_tokens': 33, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_7f6be3efb0', 'finish_reason': 'stop', 'logprobs': None} id='run-c00de63e-53e5-4699-b182-f72739c91f0f-0' usage_metadata={'input_tokens': 18, 'output_tokens': 15, 'total_tokens': 33, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


## Load, split and chunk all of our documents

In [None]:
# Locations of the source material loaded in the following cells:
# the resume as PDF and as Word, the portfolio website, and a Markdown resume.
pdf_filepath = "media/Jun Yeow's Resume _ 18_08_2024.pdf"
word_filepath = "media/Jun Yeow's Resume _ 18_08_2024.docx"
url = "https://johnyeow23.github.io/JunYeow-Website/"
markdown_path = "media/Jun Yeow's Resume.md"

In [None]:
# PyPDFLoader was misbehaving in this environment, so the PDF is parsed with Unstructured instead.
# pdf_loader  = PyPDFLoader(pdf_filepath)
pdf_loader = UnstructuredPDFLoader(pdf_filepath, mode="elements")
print(pdf_loader, "------------------------------", sep="\n")

pdf_documents = pdf_loader.load()
print(pdf_documents, "------------------------------", sep="\n")

# Text of the first element, then how many elements were produced
print(pdf_documents[0].page_content, len(pdf_documents), sep="\n")

In [None]:
# Same exercise for the Word version of the resume
# word_loader = Docx2txtLoader(word_filepath, mode="elements")
word_loader = UnstructuredWordDocumentLoader(word_filepath, mode="elements")
print(word_loader, "------------------------------", sep="\n")

word_doc = word_loader.load()
pprint(word_doc)
print("------------------------------")

# First element, a separator, then the element count
print(word_doc[0], "------------------------------", len(word_doc), sep="\n")


In [None]:
# Website information
# Quick reachability check. The timeout stops the cell from hanging forever
# when the site does not respond (requests has no timeout by default).
response = requests.get(url, timeout=10)
print(response)

# Load the page content through LangChain's web loader
web_loader = WebBaseLoader(web_path=url)

web = web_loader.load()
pprint(web)
print(len(web))

In [None]:
# Markdown resume, parsed element by element
readme_loader = UnstructuredMarkdownLoader(markdown_path, mode="elements")
readme_data = readme_loader.load()

# All elements followed by their count
print(readme_data, len(readme_data), sep="\n")

# Spot-check the text of one element (index 7 is hard-coded)
print(readme_data[7].page_content)

### The documents are loaded; now split them into chunks

In [None]:
# Chunk the PDF elements (chunk_size=2000, chunk_overlap=400)
pdf_splitter = RecursiveCharacterTextSplitter(chunk_overlap=400, chunk_size=2000)
pdf = pdf_splitter.split_documents(pdf_documents)

print(pdf)

# Dump each chunk's text and metadata, then the number of chunks
for chunk in pdf:
    print(chunk.page_content, chunk.metadata, sep="\n")
print(len(pdf))

In [None]:
# Inspect which metadata fields the first PDF chunk carries
pdf[0].metadata.keys()

In [None]:
# Chunk the Word elements (chunk_size=2000, chunk_overlap=400)
word_splitter = RecursiveCharacterTextSplitter(chunk_overlap=400, chunk_size=2000)
word = word_splitter.split_documents(word_doc)

print(word)

# Text of every chunk, then the number of chunks
for chunk in word:
    print(chunk.page_content)
print(len(word))

In [None]:
# Chunk the website content (chunk_size=1000, chunk_overlap=200)
web_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
web_content = web_splitter.split_documents(web)

# Chunks followed by their count
print(web_content, len(web_content), sep="\n")

In [None]:
# Chunk the Markdown elements (chunk_size=2000, chunk_overlap=400)
readme_splitter = RecursiveCharacterTextSplitter(chunk_overlap=400, chunk_size=2000)
readme = readme_splitter.split_documents(readme_data)

# Chunks followed by their count
print(readme, len(readme), sep="\n")

In [None]:
# Gather the chunks from every source into a single list: Word, website, Markdown, then PDF
combined = [*word, *web_content, *readme, *pdf]

# Sanity checks: container type, total chunk count, and the first chunk
print(type(combined), len(combined), combined[0], sep="\n")

### Let's embed this resume first before adding other information into the mix, such as:
    1. My personal website
    2. My readme.md
    3. Maybe a short document describing myself
    4. Recommendation letters from past employers

In [14]:
# Postgres connection string for the vector store (None when the variable is unset)
connect_string = os.environ.get("CONNECTION_STRING")

# One collection name per data source
collect_word = os.environ.get("COLLECTION_NAME_WORD")
collect_readme = os.environ.get("COLLECTION_NAME_README")
collect_web = os.environ.get("COLLECTION_NAME_WEB")

In [None]:
# Peek at the 'source' metadata of the first combined chunk
combined[0].metadata['source']

In [None]:
# Straight forward approach: one PGVector collection holding every chunk
vectorstore = PGVector(
    embedding_function=embeddings,
    collection_name=collect_word,
    connection_string=connect_string,
    use_jsonb=True,
)

# Each chunk needs its own id. Many chunks share one `source`, so using the
# source alone would hand the same id to every chunk of a file; the position
# in `combined` is appended to keep the ids unique.
vectors = vectorstore.add_documents(
    combined,
    ids=[f"{doc.metadata['source']}#{position}" for position, doc in enumerate(combined)],
)

In [None]:
# vectorstore.delete()

In [None]:
# # Update metadata in the database
# for doc in combined:
#     last_modified = doc.metadata.get("last_modified")
#     links = doc.metadata.get("links")
#     if last_modified or links:
#         # Assuming you have a method to update metadata in your PGVector class
#         vectorstore.collection_metadata(doc, last_modified=last_modified, links=links)

In [None]:
# Create information for each of the different datasource
# vectorstore_word=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_word,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_word.add_documents(word)

# vectorstore_readme=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_readme,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_word.add_documents(readme)

# vectorstore_web=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_web,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_word.add_documents(web_content)

### Let's check that the rows exist in our SQL table
### before using similarity search to find information relevant to our query

In [None]:
# Test the db with a question the resume should be able to answer
query = "Did Jun Yeow work in Grab?"

# Five closest chunks, each paired with its score
similar = vectorstore.similarity_search_with_score(query, k=5)

for document, score in similar:
    print('-------------', document.page_content, '-------------', score, sep="\n")

In [None]:
# Expose the vector store as a retriever so it can be plugged into the retrieval chain
retriever = vectorstore.as_retriever()

In [None]:
# System prompt: scope the assistant to Jun Yeow's background and ground it in
# the retrieved context, which is substituted for {context}.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    # Full sentence for the length limit; a bare "maximum of three sentences."
    # fragment is ambiguous to the model.
    "Keep each answer to a maximum of three sentences."
    "\n\n"
    "{context}"
)

# Two-message template: the system prompt above, then the user's question as {input}
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', "{input}")
    ]
)

In [None]:
# Chain that places the retrieved documents into the prompt's {context} and calls the model
question_answer_chain = create_stuff_documents_chain(gpt, prompt)
# Put the retriever in front of it so each question first fetches its documents
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask a batch of typical recruiter questions in a single call
response = rag_chain.batch(
    [
        {"input": question}
        for question in (
            "Hey tell me a little about Jun Yeow",
            "Can you tell me more about Jun Yeow's work in Grab?",
            "Can I have Jun Yeow's Linkedin?",
            "What kind of skills does Jun Yeow have?",
            "Can you tell me Jun Yeow's contribution to DAC",
            "What makes him good as a Data scientist?",
        )
    ]
)

# Show only the generated answer of each result
for result in response:
    print(result["answer"])

### Wah shaggy! As we can see, the RAG system isn't really good at answering anything other than basic questions, so let's tune it and evaluate the model better.

#### There are many ways to approach this:
    1) Better-quality data: more descriptive, well-documented information instead of bits and pieces gathered from everywhere
    2) Evaluating and fine-tuning the RAG system

In [None]:
# Try the reorganised profile document instead of the scattered sources
new_loader = Docx2txtLoader("media/Jun_Yeow_Organized_Profile.docx")
new_data = new_loader.load()

# Chunk it (chunk_size=1000, chunk_overlap=200)
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
data = splitter.split_documents(new_data)

# Chunks followed by their count
print(data, len(data), sep="\n")

In [None]:
# Read the .env file again; presumably so entries added since the first load
# (e.g. the NEW_* variables read in the next cell) are available -- confirm.
load_dotenv()

In [12]:
# Connection string and collection name for the second vector store
string = os.environ.get("NEW_CONNECTION_STRING")
name = os.environ.get("NEW_COLLECTION_NAME")

# Separate PGVector store for the reorganised profile document
new_vectorstore = PGVector(
    connection_string=string,
    collection_name=name,
    embedding_function=embeddings,
    use_jsonb=True,
)

# vectors = new_vectorstore.add_documents(data)

In [None]:
# Retrieve from the store that holds the reorganised profile
test_retriever = new_vectorstore.as_retriever()

# System prompt: scope the assistant to Jun Yeow's background and ground it in
# the retrieved context, which is substituted for {context}.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    # Full sentence for the length limit; a bare "maximum of three sentences."
    # fragment is ambiguous to the model.
    "Keep each answer to a maximum of three sentences."
    "\n\n"
    "{context}"
)

# Two-message template: the system prompt above, then the user's question as {input}
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', "{input}")
    ]
)

question_answer_chain = create_stuff_documents_chain(gpt, prompt)
rag_chain = create_retrieval_chain(test_retriever, question_answer_chain)

# Same six recruiter questions as before, so the answers can be compared
response = rag_chain.batch(
    [
        {"input": "Hey tell me a little about Jun Yeow"},
        {"input": "Can you tell me more about Jun Yeow's work in Grab?"},
        {"input": "Can I have Jun Yeow's Linkedin?"},
        {"input": "What kind of skills does Jun Yeow have?"},
        {"input": "Can you tell me Jun Yeow's contribution to DAC"},
        {"input": "What makes him good as a Data scientist?"}
    ]
)

for answer in response:
    print(answer["answer"])

# The overall quality of the answers does seem to improve when we use better data

In [None]:
from typing import Sequence

from langchain_core.messages import BaseMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict

from langchain_community.document_loaders import (
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPDFLoader,
    WebBaseLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [16]:
# Example usage: the sources passed to load_and_split_document below --
# resume (PDF and Word), portfolio website, Markdown resume, and the organized profile.
pdf_filepath = "media/Jun Yeow's Resume _ 18_08_2024.pdf"
word_filepath = "media/Jun Yeow's Resume _ 18_08_2024.docx"
url = "https://johnyeow23.github.io/JunYeow-Website/"
markdown_path = "media/Jun Yeow's Resume.md"
docx_path = "media/Jun_Yeow_Organized_Profile.docx"

def load_and_split_document(source, source_type, chunk_size=2000, chunk_overlap=200):
    """Load one document and return it split into chunks.

    Args:
        source: File path or URL handed to the loader.
        source_type: One of 'markdown', 'word', 'pdf' or 'url'; selects the loader.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.

    Raises:
        ValueError: If ``source_type`` is not one of the supported values.
    """
    # Factories are lambdas so that only the requested loader is ever constructed.
    loader_factories = (
        ('markdown', lambda: UnstructuredMarkdownLoader(source, mode="elements")),
        ('word', lambda: UnstructuredWordDocumentLoader(source, mode="elements")),
        ('pdf', lambda: UnstructuredPDFLoader(source, mode="elements")),
        ('url', lambda: WebBaseLoader(web_path=source)),
    )
    make_loader = next(
        (factory for kind, factory in loader_factories if source_type == kind),
        None,
    )
    if make_loader is None:
        raise ValueError(f"Unsupported source_type: {source_type}")

    raw_documents = make_loader().load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return chunker.split_documents(raw_documents)

# Load and chunk every source with the helper (default chunk settings)
pdf_docs = load_and_split_document(pdf_filepath, source_type='pdf')
word_docs = load_and_split_document(word_filepath, source_type='word')
url_docs = load_and_split_document(url, source_type='url')
markdown_docs = load_and_split_document(markdown_path, source_type='markdown')
docx_docs = load_and_split_document(docx_path, source_type='word')

# One flat list of chunks, in the same source order as above
combined = [*pdf_docs, *word_docs, *url_docs, *markdown_docs, *docx_docs]

In [None]:
# Show the 'source' metadata of the first website chunk
print(url_docs[0].metadata["source"])

{'source': 'https://johnyeow23.github.io/JunYeow-Website/', 'title': "Jun Yeow's Portfolio", 'language': 'No language found.'}


In [36]:
# Create the database storing the embeddings
db = PGVector(
    embedding_function=embeddings,
    collection_name=collect_word,
    connection_string=connect_string,
    use_jsonb=True,
)

# Each chunk needs its own id. Many chunks share one `source`, so using the
# source alone would hand the same id to every chunk of a file; the position
# in `combined` is appended to keep the ids unique.
db.add_documents(
    combined,
    ids=[f"{chunk.metadata['source']}#{position}" for position, chunk in enumerate(combined)],
)

test_retriever = db.as_retriever()

In [None]:
# System prompt: scope the assistant to Jun Yeow's background and ground it in
# the retrieved context, which is substituted for {context}.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    # Full sentence for the length limit; a bare "maximum of three sentences."
    # fragment is ambiguous to the model.
    "Keep each answer to a maximum of three sentences."
    "\n\n"
    "{context}"
)

# Two-message template: the system prompt above, then the user's question as {input}
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', "{input}")
    ]
)

# Documents chain wrapped with the retriever built in the previous cell
question_answer_chain = create_stuff_documents_chain(gpt, prompt)
rag_chain = create_retrieval_chain(test_retriever, question_answer_chain)


class State(TypedDict):
    """Schema of the state passed to ``StateGraph``.

    The keys match what ``call_model`` reads (``input``) and returns
    (``chat_history``, ``context``, ``answer``).
    """
    # The user's current question
    input: str
    # Message list; ``add_messages`` is attached as the Annotated metadata
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    # Value the RAG chain returns under "context".
    # NOTE(review): the retrieval chain may return a list of documents rather than a str -- confirm.
    context: str
    # Value the RAG chain returns under "answer"
    answer: str

In [None]:
def call_model(state: State):
    """Run the RAG chain on the current state and return the state updates.

    Args:
        state: Current graph state; ``state["input"]`` holds the question and
            the whole state is passed to ``rag_chain.invoke``.

    Returns:
        A dict with the new question/answer message pair under
        ``chat_history``, plus the chain's ``context`` and ``answer``.
    """
    # Imported here because none of the notebook's import cells bring these
    # two names into scope; without this the function raises NameError.
    from langchain_core.messages import AIMessage, HumanMessage

    response = rag_chain.invoke(state)
    return {
        "chat_history": [
            HumanMessage(state["input"]),
            AIMessage(response["answer"]),
        ],
        "context": response["context"],
        "answer": response["answer"],
    }

In [None]:
# Build the graph around the State schema. The imported class is `StateGraph`
# (capital G); `Stategraph` is undefined and raises NameError.
workflow = StateGraph(state_schema=State)
workflow


#### Scratch pad
    1) Evaluation using LangSmith? Not sure about the evaluation methods yet; need to research.
    2) Build out the entire system.
    3) One more for the question and answers.

In [None]:
# Let's create the database to fit our needs a little better

# Open a psycopg2 connection to Postgres; user and password come from the environment
conn = psycopg2.connect(
    host="localhost",  # Or your host address
    port="5432",       # Default PostgreSQL port
    dbname="johnresume_db",
    user=os.environ.get("POSTGRES_USER"),
    password=os.environ.get("POSTGRES_PASSWORD"),
)
cursor = conn.cursor()

In [None]:
# Add the created_datetime column if it doesn't already exist
try:
    # cursor.execute("ALTER TABLE langchain_pg_embedding ADD COLUMN IF NOT EXISTS index INTEGER;")
    cursor.execute("ALTER TABLE langchain_pg_embedding ADD COLUMN IF NOT EXISTS created_datetime TIMESTAMP;")
except psycopg2.Error as e:
    # A failed statement leaves the transaction aborted; roll back so the
    # connection is still usable by the following cells.
    conn.rollback()
    print(f"Error adding columns: {e}")

In [None]:
# Timestamp shared by every row written in this cell
current_time = datetime.now()

# Insert one row per chunk in `word`.
# NOTE(review): the INSERT supplies only created_datetime; confirm the table's
# other columns allow that, or whether existing rows should be UPDATEd instead.
for index in range(len(word)):
    try:
        cursor.execute(
            # "INSERT INTO langchain_pg_embedding (index, created_datetime) VALUES (%s, %s)",
            "INSERT INTO langchain_pg_embedding (created_datetime) VALUES (%s)",
            # Parameters must be a sequence: `(current_time)` is just a
            # parenthesised datetime, the trailing comma makes it a tuple.
            (current_time,),
        )
    except psycopg2.Error as e:
        # After an error the transaction is aborted and every further
        # statement would fail too, so roll back and stop.
        conn.rollback()
        print(f"Error inserting data: {e}")
        break

In [None]:
# Commit, then release the cursor and connection even if the commit fails
try:
    conn.commit()
finally:
    cursor.close()
    conn.close()