In [121]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from utils.logger import logger
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from vector_store import VectorStore
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from config.config import SYSTEM_PROMPT
from langchain.document_loaders import DirectoryLoader, TextLoader
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain_core.callbacks import StdOutCallbackHandler

In [122]:
load_dotenv()
MODEL = os.getenv("OPENAI_MODEL")
VECTOR_DB_NAME = os.getenv("VECTOR_DB_NAME", "faiss_index")
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
folders = glob.glob("knowledge-base/*")



In [123]:
folders = glob.glob("knowledge-base/*")

def add_metadata(doc, doc_type):
    doc.metadata["doc_type"] = doc_type
    return doc

# With thanks to CG and Jon R, students on the course, for this fix needed for some users 
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")



Total number of chunks: 17
Document types found: {'portfolio-projects', 'certifications', 'general', 'portfolio', 'technical-skills'}


In [124]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLLite

embeddings = OpenAIEmbeddings()

# Delete if already exists

if os.path.exists(VECTOR_DB_NAME):
    Chroma(persist_directory=VECTOR_DB_NAME, embedding_function=embeddings).delete_collection()

# Create vectorstore

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=VECTOR_DB_NAME)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

2025-05-02 14:33:45,991 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Vectorstore created with 17 documents


In [125]:
# Let's investigate the vectors

collection = vectorstore._collection
count = collection.count()

sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 17 vectors with 1,536 dimensions in the vector store


In [126]:
# Prework (with thanks to Jon R for identifying and fixing a bug in this!)

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]
colors = [['blue', 'green', 'red', 'orange',"purple"][['certifications', 'general', 'portfolio', 'portfolio-projects','technical-skills'].index(t)] for t in doc_types]

In [127]:
# We humans find it easier to visalize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, perplexity=(count-1) ,random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## LangChain setup

In [128]:
system_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
    MessagesPlaceholder(variable_name="chat_history"),
    HumanMessagePromptTemplate.from_template("{question}")
])

In [129]:

llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

retriever = vectorstore.as_retriever()

conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

query = "What programming languages does Adam know?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)



[1m> Entering new ConversationalRetrievalChain chain...[0m


2025-05-02 14:33:46,982 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
## Roles Adam Is Open For
Here is a list of what roles Adam is open for (The roles can be focusing on AI or more general software development): 
1. Software Engineer
2. Software Developer
3. Backend Developer
4. Frontend Developer

## Locations Adam Is Open For
Adam is open to work on site if the workplace is within 1 hour public transport from Västerås, Sweden. Distances longer than that he is open to work remote.

### Working on-site:
- Enköping
- Stockholm County
- Uppsala County
- Västmanland County
- Örebro County

### Working remote:
- Sweden
- Norway
- Finland
- Denmark
- Island
- Europe

## Work Experience

## Professional Background
1. Currently seeking new oppor

2025-05-02 14:33:47,595 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

Answer: I don't know.


In [130]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory,callbacks=[StdOutCallbackHandler()])

In [131]:
query = "What programming languages does Adam know?"
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print("\nAnswer:", answer)



[1m> Entering new ConversationalRetrievalChain chain...[0m


2025-05-02 14:33:48,463 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
## Roles Adam Is Open For
Here is a list of what roles Adam is open for (The roles can be focusing on AI or more general software development): 
1. Software Engineer
2. Software Developer
3. Backend Developer
4. Frontend Developer

## Locations Adam Is Open For
Adam is open to work on site if the workplace is within 1 hour public transport from Västerås, Sweden. Distances longer than that he is open to work remote.

### Working on-site:
- Enköping
- Stockholm County
- Uppsala County
- Västmanland County
- Örebro County

### Working remote:
- Sweden
- Norway
- Finland
- Denmark
- Island
- Europe

## Work Experience

## Professional Background
1. Currently seeking new oppor

2025-05-02 14:33:50,456 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

Answer: Adam knows the following programming languages:

1. Python
2. Golang
3. JavaScript/TypeScript (basic knowledge)

He is also familiar with web development technologies such as Flask and Django, and has experience with SQL, MySQL, and Postgres for database management.


In [132]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [133]:
view = gr.ChatInterface(chat).launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.

2025-05-02 14:33:50,700 - httpx - INFO - HTTP Request: GET http://127.0.0.1:7874/gradio_api/startup-events "HTTP/1.1 200 OK"
2025-05-02 14:33:50,703 - httpx - INFO - HTTP Request: HEAD http://127.0.0.1:7874/ "HTTP/1.1 200 OK"


* Running on local URL:  http://127.0.0.1:7874

To create a public link, set `share=True` in `launch()`.


2025-05-02 14:33:51,542 - httpx - INFO - HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"
