In [34]:
# Load in all the libraries and documents needed for the project
import os 
import bs4
import markdown
import psycopg2
import requests

from datetime import datetime
from dotenv import load_dotenv
from pprint import pprint

from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.document_loaders  import TextLoader
from langchain.text_splitter  import RecursiveCharacterTextSplitter
from langchain.embeddings  import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain.vectorstores.pgvector import PGVector
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load settings from a local .env file into the environment (python-dotenv);
# later cells read API keys and database settings with os.getenv.
load_dotenv()

True

### Load all the necessary API keys and clients needed for this project

In [None]:
# LangSmith tracing settings, read from the environment.
# Both values are None when the corresponding variable is not set.
trace = os.environ.get("LANGCHAIN_TRACING_V2")
langsmith = os.environ.get("LANGCHAIN_API_KEY")

In [None]:
# Chat model used to generate the final answers in the RAG chain.
gpt = ChatOpenAI(model="gpt-4o", temperature=0.7)

In [None]:
# Smoke test: send one message to the chat model to confirm that the client
# is configured correctly; the cell output is the model's reply.
gpt.invoke("Testing the connection are you able to receive my message?")

In [None]:
# Embedding model used to vectorise every chunk before it is stored.
# The API key is read from the environment (None if OPENAI_API_KEY is unset).
openai_key = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=openai_key)

## Load, split and chunk all of our documents

In [None]:
# The PDF and Word exports of the resume share a base name and differ only
# by extension.
resume_stem = "media/Jun Yeow's Resume _ 18_08_2024"
pdf_filepath = f"{resume_stem}.pdf"
word_filepath = f"{resume_stem}.docx"

In [None]:
# Facing issues with my PYPDF folder for some reason...
# Load the PDF resume and print the loader, the loaded documents, the text of
# the first document and the document count, separated by divider lines.
divider = "------------------------------"

pdf_loader = PyPDFLoader(pdf_filepath)
print(pdf_loader, divider, sep="\n")

pdf_documents = pdf_loader.load()
print(pdf_documents, divider, sep="\n")

first_pdf_document = pdf_documents[0]
print(first_pdf_document.page_content)
print(len(pdf_documents))

In [None]:
# Let's try word document instead
# Load the .docx resume and print the loader, the loaded documents, the first
# document and the document count, separated by divider lines.
divider = "------------------------------"

word_loader = Docx2txtLoader(word_filepath)
print(word_loader, divider, sep="\n")

word_doc = word_loader.load()
pprint(word_doc)
print(divider)

print(word_doc[0], divider, sep="\n")

print(len(word_doc))

In [None]:
# Website information
url = "https://johnyeow23.github.io/JunYeow-Website/"

# Quick reachability check before handing the URL to the loader. The timeout
# keeps the cell from hanging indefinitely if the site does not respond.
response = requests.get(url, timeout=30)
print(response)

web_loader = WebBaseLoader(web_path=url)

web = web_loader.load()
# `pprint` is imported as a function (`from pprint import pprint`), so it is
# called directly; `pprint.pprint(web)` raises AttributeError on the function.
pprint(web)
print(len(web))

In [None]:
# Markdown information
# Load the markdown resume in "elements" mode, then print the result, the
# element count and the text of the element at index 7.
markdown_path = "media/Jun Yeow's Resume.md"
readme_loader = UnstructuredMarkdownLoader(markdown_path, mode="elements")
readme_data = readme_loader.load()

print(readme_data)
print(len(readme_data))

# NOTE(review): index 7 is hard-coded; this raises IndexError if the file
# yields fewer than eight elements.
eighth_element = readme_data[7]
print(eighth_element.page_content)

### We've loaded the documents; now let's split them into chunks

In [None]:
# Split the Word resume into overlapping chunks.
word_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
)
word = word_splitter.split_documents(word_doc)

print(word)

# Print the text of every chunk, followed by the chunk count.
for chunk in word:
    print(chunk.page_content)
print(len(word))  # 4 Chunks Only

In [None]:
# Split the website content into overlapping chunks; the chunk size here is
# smaller (200) than the one used for the resume documents (400).
web_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=100,
)
web_content = web_splitter.split_documents(web)

print(web_content)

In [None]:
# Split the markdown elements into overlapping chunks.
readme_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
)
readme = readme_splitter.split_documents(readme_data)

print(readme)

In [None]:
# Let's create a combined list instead
# All chunks from the three sources in one list: Word resume first, then the
# website, then the markdown resume.
combined = [*word, *web_content, *readme]

print(type(combined))
print(len(combined))
print(combined[0])

### Let's embed this resume first before adding other information into the mix, like
    1. My personal website
    2. My readme.md
    3. Maybe a short description about myself documentation
    4. Recommendation letter from past employment 

In [None]:
# Database connection string and per-source collection names, read from the
# environment. Each value is None when its variable is not set.
connect_string = os.environ.get("CONNECTION_STRING")
collect_word = os.environ.get("COLLECTION_NAME_WORD")
collect_readme = os.environ.get("COLLECTION_NAME_README")
collect_web = os.environ.get("COLLECTION_NAME_WEB")

In [None]:
# Straight forward approach
# A single PGVector store holds every chunk. Note that the collection name is
# `collect_word` even though `combined` contains chunks from all three sources
# (Word resume, website and markdown resume).
vectorstore=PGVector(
    embedding_function=embeddings,
    collection_name=collect_word,
    connection_string=connect_string,
    use_jsonb=True,
)

# NOTE(review): re-running this cell presumably inserts the same chunks again,
# producing duplicate rows - confirm before re-executing.
# `vectors` holds whatever add_documents returns (presumably the IDs of the
# stored documents - verify against the PGVector docs).
vectors = vectorstore.add_documents(combined)

In [None]:
# Create information for each of the different datasource
# Disabled alternative to the single combined store: one PGVector collection
# per data source (Word resume, markdown resume, website).
# NOTE(review): the disabled code originally called vectorstore_word.add_documents
# for all three sources; the calls below now target the store built just above
# each of them, so uncommenting does not put everything in the Word collection.
# vectorstore_word=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_word,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_word.add_documents(word)

# vectorstore_readme=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_readme,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_readme.add_documents(readme)

# vectorstore_web=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_web,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_web.add_documents(web_content)

### Let's check that the rows exist within our SQL table
### before using similarity search to find information relevant to our query

In [None]:
# Test the db
# Fetch the five chunks closest to the query and print each chunk's text and
# its score, framed by rule lines.
query = "Did Jun Yeow work in Grab?"
similar = vectorstore.similarity_search_with_score(query, k=5)

rule = '-------------'
for document, score in similar:
    print(rule, document.page_content, rule, score, sep="\n")

In [None]:
# Expose the vector store through the retriever interface used by the RAG
# chain; no search arguments are passed, so the library defaults apply.
retriever = vectorstore.as_retriever()

In [46]:
# System instructions for the assistant. `{context}` is filled with the
# retrieved chunks and `{input}` with the user's question.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    # The original text ended in the dangling fragment "maximum of three
    # sentences."; it is now a complete instruction.
    "Use a maximum of three sentences."
    "\n\n"
    "{context}"
)

# from_messages is the documented constructor for (role, template) pairs and
# does not depend on the positional-list constructor of newer releases.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [37]:
# Chain that passes the retrieved documents and the question to the chat
# model through `prompt`.
question_answer_chain = create_stuff_documents_chain(gpt, prompt)
# Full RAG pipeline: retrieve with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [48]:
# Sample recruiter questions used to spot-check the RAG chain.
questions = [
    "Hey tell me a little about Jun Yeow",
    "Can you tell me more about Jun Yeow's work in Grab?",
    "Can I have Jun Yeow's Linkedin?",
    "What kind of skills does Jun Yeow have?",
]

# Run all questions in one batch; each result's "answer" entry is printed below.
response = rag_chain.batch([{"input": question} for question in questions])

for answer in response:
    print(answer["answer"])

Jun Yeow is a Data Science student at the University of London with a strong passion for Machine Learning. He is pursuing a Bachelor of Science (Honours) in Data Science and Business Analytics and is currently in his penultimate year.
Jun Yeow worked as a People Data Analytics Intern at Grab from January 2024 to August 2024, focusing on Human Resource Analytics. During this internship, he worked on projects aimed at improving the employee experience.
I'm sorry, but I don't have that information.
I'm sorry, but I don't have that information.


### Wah shaggy — as we can see, the RAG system isn't really good at answering anything other than basic questions. Let's tune it and evaluate the model better.

In [None]:
# Let's create the database to fit our needs a little better

# Connect to PostgreSQL database
# Credentials come from the environment; database name, host and port are
# fixed here.
connection_settings = {
    "dbname": "johnresume_db",
    "user": os.getenv("POSTGRES_USER"),
    "password": os.getenv("POSTGRES_PASSWORD"),
    "host": "localhost",  # Or your host address
    "port": "5432",       # Default PostgreSQL port
}
conn = psycopg2.connect(**connection_settings)
cursor = conn.cursor()

In [None]:
# Add new columns if they don't already exist
# Only `created_datetime` is added; an integer `index` column was considered
# earlier but is left out.
add_created_datetime_sql = (
    "ALTER TABLE langchain_pg_embedding ADD COLUMN IF NOT EXISTS created_datetime TIMESTAMP;"
)

try:
    cursor.execute(add_created_datetime_sql)
except Exception as e:
    # Report the failure and continue so later cells can still run.
    print(f"Error adding columns: {e}")

In [None]:
# Example data
# One timestamp is shared by every row written in this cell.
current_time = datetime.now()

insert_sql = "INSERT INTO langchain_pg_embedding (created_datetime) VALUES (%s)"

# Insert data into the table: one row per chunk of the Word resume.
# NOTE(review): this inserts new rows that contain only `created_datetime`
# rather than updating the rows created by the vector store - confirm that
# this is intended and that the table's constraints allow it.
for _chunk in word:
    try:
        # Query parameters must be a sequence: `(current_time,)` is a
        # one-element tuple, whereas `(current_time)` is just the datetime and
        # makes execute() fail.
        cursor.execute(insert_sql, (current_time,))
    except Exception as e:
        print(f"Error inserting data: {e}")

In [None]:
# Commit and close connection
# Persist the statements executed on this connection, then release the cursor
# and the connection. `conn` and `cursor` cannot be used after this cell; the
# connection cell has to be re-run first.
conn.commit()
cursor.close()
conn.close()

## Create vector database to store all our items within