In [73]:
# Load in all the libraries and documents needed for the project
import os 
import bs4
import markdown
import psycopg2
import requests

from datetime import datetime
from dotenv import load_dotenv
from pprint import pprint

from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain.document_loaders  import TextLoader
from langchain.text_splitter  import RecursiveCharacterTextSplitter
from langchain.embeddings  import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain.vectorstores.pgvector import PGVector
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Load variables from a local .env file into the process environment; the
# os.getenv(...) lookups in later cells read their settings from there.
load_dotenv()

True

### Load the API keys and model clients needed for this project

In [None]:
# Tracing
# Pull the LangSmith tracing settings out of the environment. Both values are
# None when the corresponding variable is not set.
trace = os.environ.get("LANGCHAIN_TRACING_V2")
langsmith = os.environ.get("LANGCHAIN_API_KEY")

In [None]:
# Chat model that the RAG chains below use to generate answers.
# No API key is passed explicitly here.
# NOTE(review): presumably ChatOpenAI reads OPENAI_API_KEY from the environment — confirm.
gpt = ChatOpenAI(
    model = "gpt-4o",
    temperature=0.7
)

In [None]:
# Smoke test: send a single message to check that the chat model responds.
gpt.invoke("Testing the connection are you able to receive my message?")

In [None]:
# Embedding model shared by every PGVector store created in this notebook.
# Unlike the chat model above, the API key is passed explicitly from the environment.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_key = os.getenv("OPENAI_API_KEY"),
)

## Load, split and chunk all of our documents

In [69]:
# Two exports of the same resume (PDF and Word), relative to the notebook's
# working directory. Each is loaded by its own loader in the cells below.
pdf_filepath = "media/Jun Yeow's Resume _ 18_08_2024.pdf"
word_filepath = "media/Jun Yeow's Resume _ 18_08_2024.docx"

In [70]:
# Facing issues with the PyPDF loader: the extracted text comes back with the
# spaces between words missing (see the printed page_content below).
divider = "------------------------------"

pdf_loader = PyPDFLoader(pdf_filepath)
print(pdf_loader)
print(divider)

# Load the PDF into a list of Document objects.
pdf_documents = pdf_loader.load()
print(pdf_documents)
print(divider)

# Inspect the text of the first document and the total document count.
first_pdf_document = pdf_documents[0]
print(first_pdf_document.page_content)
print(len(pdf_documents))

<langchain_community.document_loaders.pdf.PyPDFLoader object at 0x1480c85c0>
------------------------------
[Document(metadata={'source': "media/Jun Yeow's Resume _ 18_08_2024.pdf", 'page': 0}, page_content='YEOJUNYEOW| PenultimatestudentinDataScienceandBusinessAnalyticsUndergraduate 93721423| Email | LinkedIn| GitHub| WebsiteEDUCATIONUniversityofLondon(UOL)|Aug2022–PresentBachelorofScience(Honours)inDataScienceandBusinessAnalytics● Relevantcoursework:IntroductiontoProgramminginPythonandR,AdvancedStatisticsinDistributionTheory,AdvancedStatisticsinStatisticsInferenceandSQL● CurrentGrade:“A”(OntracktoFirstClassHonours)WORKEXPERIENCEfoodpanda|LogisticsAnalyticsEngineerInternship|Aug2024–Present● OptimisedqueryperformancebybuildingandtestingdatatransformationtablesusingdbtandSQL,resultinginimproveddatareliabilityand10%fasterreportingtimes.● Workingonincreasingdeliverydriversupplyby10%byanalysingriderdataimplementingapredictivemodel forpeakdemandperiodsandreducingordercancellations.● Workin

In [71]:
# Let's try the Word document instead of the PDF.
divider = "------------------------------"

word_loader = Docx2txtLoader(word_filepath)
print(word_loader)
print(divider)

# Load the .docx into a list of Document objects and pretty-print it.
word_doc = word_loader.load()
pprint(word_doc)
print(divider)

# Show the first Document, then how many Documents were produced.
first_word_document = word_doc[0]
print(first_word_document)
print(divider)
print(len(word_doc))

<langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x1480cdaf0>
------------------------------
[Document(metadata={'source': "media/Jun Yeow's Resume _ 18_08_2024.docx"}, page_content='YEO JUN YEOW | Penultimate student in Data Science and Business Analytics Undergraduate\xa0\n93721423 | Email | LinkedIn | GitHub | Website\n\nEDUCATION\n\nUniversity of London (UOL) | Aug 2022 – Present\n\nBachelor of Science (Honours) in Data Science and Business Analytics \n\nRelevant coursework: Introduction to Programming in Python and R, Advanced Statistics in Distribution Theory, Advanced Statistics in Statistics Inference and SQL\n\nCurrent Grade: “A” (On track to First Class Honours)\n\nWORK EXPERIENCE\n\nfoodpanda | Logistics Analytics Engineer Internship | Aug 2024 – Present\n\nOptimised query performance by building and testing data transformation tables using dbt and SQL, resulting in improved data reliability and 10% faster reporting times.\n\nWorking on increasing

In [75]:
# Website information
url = "https://johnyeow23.github.io/JunYeow-Website/"

# Quick reachability check before handing the URL to the loader.
# A timeout is set so an unresponsive host cannot hang the kernel forever;
# requests waits indefinitely when no timeout is given.
response = requests.get(url, timeout=10)
print(response)

# Load the page into a list of Document objects.
web_loader = WebBaseLoader(web_path=url)

web = web_loader.load()
pprint(web)
print(len(web))

<Response [200]>
[Document(metadata={'source': 'https://johnyeow23.github.io/JunYeow-Website/', 'title': "Jun Yeow's Portfolio", 'language': 'No language found.'}, page_content="\n\n\nJun Yeow's Portfolio\n\n\n\n\n\n\n\n\n\n\n\nWelcome\nWork\nProjects\nLeadership\nContact Me!!\n\n\n\n\n\n\n\n\n\nHi there! I am Jun Yeow \nI am a Data Science student at the University of London with a strong passion for Machine Learning \r\n\t\t\t\t\t\t\tand Analytics. Explore my projects and work experiences below, and learn more about me on my personal \r\n\t\t\t\t\t\t\twebsite or from my \r\n\t\t\t\t\t\t\t\tresume and my \r\n\t\t\t\t\t\t\t\tLinkedIn.\n\n\n\nTechnical Skills\n\n\n\n\n\nSQL\r\n\t\t\t\t\t\t\t\t\t\n\n\nPython\r\n\t\t\t\t\t\t\t\t\t\n\n\nGit\r\n\t\t\t\t\t\t\t\t\t\n\n\nPowerBI\r\n\t\t\t\t\t\t\t\t\t\n\n\nTableau\r\n\t\t\t\t\t\t\t\t\t\n\n\nScikit-Learn\r\n\t\t\t\t\t\t\t\t\t\n\n\nSeaborn\r\n\t\t\t\t\t\t\t\t\t\n\n\nPandas\r\n\t\t\t\t\t\t\t\t\t\n\n\nNumPy\r\n\t\t\t\t\t\t\t\t\t\n\n\nMatplotlib\r\n

In [76]:
# Markdown information
markdown_path = "media/Jun Yeow's Resume.md"

# mode="elements" is passed to the loader; the printed output shows one
# Document per markdown element, each carrying a 'category' in its metadata.
readme_loader = UnstructuredMarkdownLoader(
    markdown_path,
    mode="elements",
)
readme_data = readme_loader.load()

print(readme_data)
print(len(readme_data))

# Spot-check one element by position (index 7 is hard-coded).
sample_element = readme_data[7]
print(sample_element.page_content)

[Document(metadata={'source': "media/Jun Yeow's Resume.md", 'category_depth': 0, 'languages': ['eng'], 'file_directory': 'media', 'filename': "Jun Yeow's Resume.md", 'filetype': 'text/markdown', 'last_modified': '2024-11-16T17:03:21', 'category': 'Title', 'element_id': 'a367c5494622850350cbe80b8ee98c1a'}, page_content='# Hello!! I am Jun Yeow 👋'), Document(metadata={'source': "media/Jun Yeow's Resume.md", 'category_depth': 2, 'emphasized_text_contents': ['Aspiring AI Engineer | With a Passion in Data Science and Data Engineering'], 'emphasized_text_tags': ['b'], 'languages': ['eng'], 'file_directory': 'media', 'filename': "Jun Yeow's Resume.md", 'filetype': 'text/markdown', 'last_modified': '2024-11-16T17:03:21', 'parent_id': 'a367c5494622850350cbe80b8ee98c1a', 'category': 'Title', 'element_id': 'cbf8940cb129844d4e74d6ebdfa4be15'}, page_content='Aspiring AI Engineer | With a Passion in Data Science and Data Engineering'), Document(metadata={'source': "media/Jun Yeow's Resume.md", 'lang

### Now that the documents are loaded, split them into chunks

In [90]:
# Chunk the Word resume: 1000-character chunks with a 400-character overlap.
word_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=400,
)
word = word_splitter.split_documents(word_doc)

print(word)

# Print the text of every chunk, then the number of chunks.
for chunk in word:
    print(chunk.page_content)
print(len(word))  # 4 Chunks Only

[Document(metadata={'source': "media/Jun Yeow's Resume _ 18_08_2024.docx"}, page_content='YEO JUN YEOW | Penultimate student in Data Science and Business Analytics Undergraduate\xa0\n93721423 | Email | LinkedIn | GitHub | Website\n\nEDUCATION\n\nUniversity of London (UOL) | Aug 2022 – Present\n\nBachelor of Science (Honours) in Data Science and Business Analytics \n\nRelevant coursework: Introduction to Programming in Python and R, Advanced Statistics in Distribution Theory, Advanced Statistics in Statistics Inference and SQL\n\nCurrent Grade: “A” (On track to First Class Honours)\n\nWORK EXPERIENCE\n\nfoodpanda | Logistics Analytics Engineer Internship | Aug 2024 – Present\n\nOptimised query performance by building and testing data transformation tables using dbt and SQL, resulting in improved data reliability and 10% faster reporting times.\n\nWorking on increasing delivery driver supply by 10% by analysing rider data implementing a predictive model for peak demand periods and reduci

In [91]:
# Chunk the website content: 1000-character chunks with a 200-character overlap.
web_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
web_content = web_splitter.split_documents(web)

print(web_content)
print(len(web_content))

[Document(metadata={'source': 'https://johnyeow23.github.io/JunYeow-Website/', 'title': "Jun Yeow's Portfolio", 'language': 'No language found.'}, page_content="Jun Yeow's Portfolio\n\n\n\n\n\n\n\n\n\n\n\nWelcome\nWork\nProjects\nLeadership\nContact Me!!\n\n\n\n\n\n\n\n\n\nHi there! I am Jun Yeow \nI am a Data Science student at the University of London with a strong passion for Machine Learning \r\n\t\t\t\t\t\t\tand Analytics. Explore my projects and work experiences below, and learn more about me on my personal \r\n\t\t\t\t\t\t\twebsite or from my \r\n\t\t\t\t\t\t\t\tresume and my \r\n\t\t\t\t\t\t\t\tLinkedIn.\n\n\n\nTechnical Skills\n\n\n\n\n\nSQL\r\n\t\t\t\t\t\t\t\t\t\n\n\nPython\r\n\t\t\t\t\t\t\t\t\t\n\n\nGit\r\n\t\t\t\t\t\t\t\t\t\n\n\nPowerBI\r\n\t\t\t\t\t\t\t\t\t\n\n\nTableau\r\n\t\t\t\t\t\t\t\t\t\n\n\nScikit-Learn\r\n\t\t\t\t\t\t\t\t\t\n\n\nSeaborn\r\n\t\t\t\t\t\t\t\t\t\n\n\nPandas\r\n\t\t\t\t\t\t\t\t\t\n\n\nNumPy\r\n\t\t\t\t\t\t\t\t\t\n\n\nMatplotlib\r\n\t\t\t\t\t\t\t\t\t\n\n\

In [92]:
# Chunk the markdown elements: 2000-character chunks with a 400-character overlap.
readme_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=400,
)
readme = readme_splitter.split_documents(readme_data)

print(readme)
print(len(readme))

[Document(metadata={'source': "media/Jun Yeow's Resume.md", 'category_depth': 0, 'languages': ['eng'], 'file_directory': 'media', 'filename': "Jun Yeow's Resume.md", 'filetype': 'text/markdown', 'last_modified': '2024-11-16T17:03:21', 'category': 'Title', 'element_id': 'a367c5494622850350cbe80b8ee98c1a'}, page_content='# Hello!! I am Jun Yeow 👋'), Document(metadata={'source': "media/Jun Yeow's Resume.md", 'category_depth': 2, 'emphasized_text_contents': ['Aspiring AI Engineer | With a Passion in Data Science and Data Engineering'], 'emphasized_text_tags': ['b'], 'languages': ['eng'], 'file_directory': 'media', 'filename': "Jun Yeow's Resume.md", 'filetype': 'text/markdown', 'last_modified': '2024-11-16T17:03:21', 'parent_id': 'a367c5494622850350cbe80b8ee98c1a', 'category': 'Title', 'element_id': 'cbf8940cb129844d4e74d6ebdfa4be15'}, page_content='Aspiring AI Engineer | With a Passion in Data Science and Data Engineering'), Document(metadata={'source': "media/Jun Yeow's Resume.md", 'lang

In [93]:
# Merge the chunks from all three sources into one flat list:
# Word resume first, then the website, then the markdown readme.
combined = [*word, *web_content, *readme]

print(type(combined))
print(len(combined))
print(combined[0])

<class 'list'>
26
page_content='YEO JUN YEOW | Penultimate student in Data Science and Business Analytics Undergraduate 
93721423 | Email | LinkedIn | GitHub | Website

EDUCATION

University of London (UOL) | Aug 2022 – Present

Bachelor of Science (Honours) in Data Science and Business Analytics 

Relevant coursework: Introduction to Programming in Python and R, Advanced Statistics in Distribution Theory, Advanced Statistics in Statistics Inference and SQL

Current Grade: “A” (On track to First Class Honours)

WORK EXPERIENCE

foodpanda | Logistics Analytics Engineer Internship | Aug 2024 – Present

Optimised query performance by building and testing data transformation tables using dbt and SQL, resulting in improved data reliability and 10% faster reporting times.

Working on increasing delivery driver supply by 10% by analysing rider data implementing a predictive model for peak demand periods and reducing order cancellations.' metadata={'source': "media/Jun Yeow's Resume _ 18_08_20

### Let's embed this resume first before adding other information into the mix, like:
    1. My personal website
    2. My readme.md
    3. Maybe a short description about myself documentation
    4. Recommendation letter from past employment 

In [94]:
# Database connection string for the vector store (None if unset).
connect_string = os.environ.get("CONNECTION_STRING")

# Collection names, one per data source (None if unset).
collect_word = os.environ.get("COLLECTION_NAME_WORD")
collect_readme = os.environ.get("COLLECTION_NAME_README")
collect_web = os.environ.get("COLLECTION_NAME_WEB")

In [95]:
# Straight forward approach
# One vector store for everything: the combined list (Word resume + website +
# readme chunks) is stored under the collection named by COLLECTION_NAME_WORD.
vectorstore=PGVector(
    embedding_function=embeddings,
    collection_name=collect_word,
    connection_string=connect_string,
    use_jsonb=True,
)

# NOTE(review): re-running this cell calls add_documents again with the same
# chunks — presumably this appends duplicates to the collection; confirm.
vectors = vectorstore.add_documents(combined)

In [None]:
# Alternative (disabled): create a separate collection for each data source.
# Each store must add documents to itself — the readme and web chunks must not
# be added to the Word resume store.
# vectorstore_word=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_word,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_word.add_documents(word)

# vectorstore_readme=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_readme,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_readme.add_documents(readme)

# vectorstore_web=PGVector(
#     embedding_function=embeddings,
#     collection_name=collect_web,
#     connection_string=connect_string,
#     use_jsonb=True,
# )

# vectorstore_web.add_documents(web_content)

### Let's check that the rows exist in our SQL table
### before using similarity search to find information relevant to our query

In [96]:
# Test the db
query = "Did Jun Yeow work in Grab?"

# Fetch the 5 closest chunks; each result is a (document, score) pair.
similar = vectorstore.similarity_search_with_score(query, k=5)

rule = '-------------'
for document, score in similar:
    print(rule)
    print(document.page_content)
    print(rule)
    print(score)

-------------
# Hello!! I am Jun Yeow 👋
-------------
0.4432735873242737
-------------
YEO JUN YEOW | Penultimate student in Data Science and Business Analytics Undergraduate 
93721423 | Email | LinkedIn | GitHub | Website

EDUCATION

University of London (UOL) | Aug 2022 – Present

Bachelor of Science (Honours) in Data Science and Business Analytics 

Relevant coursework: Introduction to Programming in Python and R, Advanced Statistics in Distribution Theory, Advanced Statistics in Statistics Inference and SQL

Current Grade: “A” (On track to First Class Honours)

WORK EXPERIENCE

foodpanda | Logistics Analytics Engineer Internship | Aug 2024 – Present

Optimised query performance by building and testing data transformation tables using dbt and SQL, resulting in improved data reliability and 10% faster reporting times.

Working on increasing delivery driver supply by 10% by analysing rider data implementing a predictive model for peak demand periods and reducing order cancellations.
-

In [97]:
# Wrap the combined vector store as a retriever (default settings) for the RAG chain.
retriever = vectorstore.as_retriever()

In [98]:
# System instructions for the resume assistant.
# "{context}" is the slot for the retrieved document text and "{input}" (below)
# is the slot for the user's question; both are filled in when the chain runs.
# The final instruction is a full sentence: the previous wording ended in the
# dangling fragment "maximum of three sentences.", which left the length limit
# ambiguous to the model.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    "Keep every answer to a maximum of three sentences."
    "\n\n"
    "{context}"
)

# Build the chat prompt through the documented from_messages constructor.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [99]:
# Step 1: a chain that passes the retrieved documents and the prompt to the chat model.
question_answer_chain = create_stuff_documents_chain(gpt, prompt)
# Step 2: put the retriever in front of it, so each call retrieves first and answers second.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [102]:
# Sample recruiter questions used to eyeball the quality of the RAG answers.
questions = [
    "Hey tell me a little about Jun Yeow",
    "Can you tell me more about Jun Yeow's work in Grab?",
    "Can I have Jun Yeow's Linkedin?",
    "What kind of skills does Jun Yeow have?",
    "Can you tell me Jun Yeow's contribution to DAC",
    "What makes him good as a Data scientist?",
]

# Run all questions in one batch; each result is a dict with an "answer" key.
response = rag_chain.batch([{"input": question} for question in questions])

for result in response:
    print(result["answer"])

Jun Yeow is a penultimate student pursuing a Bachelor of Science (Honours) in Data Science and Business Analytics at the University of London, currently on track for First Class Honours. He has experience as a Logistics Analytics Engineer Intern at Foodpanda and previously worked as a People Data Analytics Intern at Grab. His skills include programming in Python, R, and SQL, as well as proficiency in data tools like PowerBI and Tableau.
At Grab, Jun Yeow served as a People Data Analytics Intern from January 2024 to August 2024. He led a Data Literacy Program across six countries, improving proficiency for over 80 employees by 40%. He also enhanced employee performance by 15% through optimized meeting cadences, developed PowerBI dashboards that increased space efficiency by 20% resulting in $3.51M in cost savings, and built an LLM-based RAG system for workforce reports, reducing time-to-hire by two weeks across markets.
Yes, Jun Yeow's LinkedIn profile can be found at [Profile](https://

### Wah shaggy — as we can see, the RAG system isn't really good at answering anything other than basic questions. Let's tune it and evaluate the model better.

#### There are many ways to approach this:
    1) Better-quality data: more descriptive, well-documented information instead of bits and pieces of information from everywhere
    2) Evaluating / fine-tuning the RAG system

In [65]:
# Trying out the newly formatted profile document instead.
new_loader = Docx2txtLoader("media/Jun_Yeow_Organized_Profile.docx")
new_data = new_loader.load()

# Chunk it: 1000-character chunks with a 200-character overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
data = splitter.split_documents(new_data)

print(data)
print(len(data))

[Document(metadata={'source': 'media/Jun_Yeow_Organized_Profile.docx'}, page_content="Jun Yeow's Profile\n\nContact Information\n\n- Name: Yeo Jun Yeow\n- Current Role: Penultimate student in Data Science and Business Analytics Undergraduate\n- Phone: 93721423\n- Email: [Contact Email](mailto:junyeow27@gmail.com)\n- LinkedIn: [Profile]( https://www.linkedin.com/in/junyeow/)\n- GitHub: [JohnYeow23](https://github.com/JohnYeow23)\n- Website: [JunYeow-Website](https://johnyeow23.github.io/JunYeow-Website/)\n\nEducation\n\nUniversity of London (UOL) | Aug 2022 – Present\n- Degree: Bachelor of Science (Honours) in Data Science and Business Analytics\n- Relevant Coursework:\n  - Introduction to Programming in Python and R\n  - Advanced Statistics (Distribution Theory and Statistical Inference)\n  - SQL\n- Current Grade: A (On track to First Class Honours)\n\nWork Experience\n\nFoodpanda | Logistics Analytics Engineer Intern | Aug 2024 – Present"), Document(metadata={'source': 'media/Jun_Yeow

In [66]:
# Load the .env file again before reading the NEW_* settings in the next cell.
# NOTE(review): presumably those entries were added to .env after the first load — confirm.
load_dotenv()

True

In [67]:
# Connection string and collection name for the reorganised profile document.
string = os.getenv("NEW_CONNECTION_STRING")
name = os.getenv("NEW_COLLECTION_NAME")

# Separate vector store for the new document; it uses the same embedding model
# as the first store.
new_vectorstore=PGVector(
    embedding_function=embeddings,
    collection_name=name,
    connection_string=string,
    use_jsonb=True,
)

# Embed and store the new chunks. This rebinds `vectors` from the earlier cell.
vectors = new_vectorstore.add_documents(data)

In [None]:
# Re-run the same RAG experiment against the store built from the reorganised
# profile document, to compare answer quality with the first attempt.
test_retriever = new_vectorstore.as_retriever()

# System instructions for the resume assistant.
# "{context}" is the slot for the retrieved document text and "{input}" (below)
# is the slot for the user's question; both are filled in when the chain runs.
# The final instruction is a full sentence: the previous wording ended in the
# dangling fragment "maximum of three sentences.", which left the length limit
# ambiguous to the model.
system_prompt = (
    "You are an AI assistant designed to answer questions from hiring managers and recruiters "
    "regarding Jun Yeow's professional background, skills, and experiences. Utilize the provided "
    "context to deliver accurate and concise responses. If the information is not available in the "
    "context, respond with 'I'm sorry, but I don't have that information.' "
    "Keep every answer to a maximum of three sentences."
    "\n\n"
    "{context}"
)

# Build the chat prompt through the documented from_messages constructor.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Document-stuffing answer chain, with the new retriever placed in front of it.
question_answer_chain = create_stuff_documents_chain(gpt, prompt)
rag_chain = create_retrieval_chain(test_retriever, question_answer_chain)

# Same sample questions as the first experiment, so the answers are comparable.
response = rag_chain.batch(
    [
        {"input": "Hey tell me a little about Jun Yeow"},
        {"input": "Can you tell me more about Jun Yeow's work in Grab?"},
        {"input": "Can I have Jun Yeow's Linkedin?"},
        {"input": "What kind of skills does Jun Yeow have?"},
        {"input": "Can you tell me Jun Yeow's contribution to DAC"},
        {"input": "What makes him good as a Data scientist?"},
    ]
)

for answer in response:
    print(answer["answer"])

# Does seem to have an improvement to the entire quality of answers when we use better data

Jun Yeow is a penultimate student pursuing a Bachelor of Science (Honours) in Data Science and Business Analytics at the University of London, currently on track for First Class Honours. He has interned as a Logistics Analytics Engineer at Foodpanda and as a People Data Analytics Intern at Grab, where he led a Data Literacy Program and developed PowerBI dashboards. Jun Yeow also has experience as a Python Coding Instructor at Empire Code and has led projects such as the Uber Global X Barcelona Hackathon and the FDM X DAC Project.
Jun Yeow worked as a People Data Analytics Intern at Grab from January 2024 to August 2024. During this time, he led a Data Literacy Program across 6 countries, improving proficiency by 40% for over 80 employees. He also developed PowerBI dashboards to increase space efficiency by 20%, saving $3.51M in costs, and built an LLM-based RAG system for workforce reports, reducing time-to-hire by 2 weeks across markets.
You can view Jun Yeow's LinkedIn profile at [th

In [None]:
# Let's create the database to fit our needs a little better

# Connect to PostgreSQL database
# Credentials come from the environment; database name, host and port are
# hard-coded for a local instance.
conn = psycopg2.connect(
    dbname="johnresume_db",
    user=os.getenv("POSTGRES_USER"),
    password=os.getenv("POSTGRES_PASSWORD"),
    host="localhost",  # Or your host address
    port="5432"        # Default PostgreSQL port
)
# Cursor shared by the schema and insert cells below; it is closed in the final cell.
cursor = conn.cursor()

In [None]:
# Add new columns if they don't already exist
try:
    # cursor.execute("ALTER TABLE langchain_pg_embedding ADD COLUMN IF NOT EXISTS index INTEGER;")
    cursor.execute("ALTER TABLE langchain_pg_embedding ADD COLUMN IF NOT EXISTS created_datetime TIMESTAMP;")
except psycopg2.Error as e:
    # A failed statement leaves the psycopg2 transaction in an aborted state,
    # and every later execute() on this connection fails until it is rolled
    # back. Roll back here so the following cells start from a clean transaction.
    conn.rollback()
    print(f"Error adding columns: {e}")

In [None]:
# Example data
current_time = datetime.now()

# Insert one row per Word-resume chunk, stamped with the same timestamp.
# (Disabled variant that also wrote an `index` column:
#  "INSERT INTO langchain_pg_embedding (index, created_datetime) VALUES (%s, %s)"
#  with parameters (index, current_time).)
#
# NOTE(review): this INSERT creates new rows containing only created_datetime;
# it does not stamp the embedding rows that already exist. If the goal is to
# timestamp existing rows, an UPDATE is presumably what is wanted — confirm.
try:
    for _ in word:
        cursor.execute(
            "INSERT INTO langchain_pg_embedding (created_datetime) VALUES (%s)",
            # Query parameters must be a sequence: the trailing comma makes this
            # a one-element tuple. A bare (current_time) is just the datetime
            # itself and makes execute() raise a TypeError.
            (current_time,),
        )
except psycopg2.Error as e:
    # After a failed statement the transaction is aborted, so the remaining
    # inserts could not succeed anyway: stop, roll back, and report the error.
    conn.rollback()
    print(f"Error inserting data: {e}")

In [None]:
# Commit and close connection
# Commit the work from the cells above, then release the cursor and the
# connection. After this cell, `cursor` and `conn` can no longer be used.
conn.commit()
cursor.close()
conn.close()

## Create vector database to store all our items within