In [24]:
#must have python 3.10 installed
!pip install pandas langchain-openai langchain-core langchain-community langchain pypdf chromadb rank-bm25 cohere

Collecting cohere
  Downloading cohere-4.52-py3-none-any.whl.metadata (6.0 kB)
Collecting fastavro<2.0,>=1.8 (from cohere)
  Using cached fastavro-1.9.4-cp310-cp310-win_amd64.whl.metadata (5.7 kB)
Downloading cohere-4.52-py3-none-any.whl (52 kB)
   ---------------------------------------- 0.0/52.0 kB ? eta -:--:--
   ---------------------------------------  51.2/52.0 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 52.0/52.0 kB 889.0 kB/s eta 0:00:00
Using cached fastavro-1.9.4-cp310-cp310-win_amd64.whl (497 kB)
Installing collected packages: fastavro, cohere
Successfully installed cohere-4.52 fastavro-1.9.4


In [10]:
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers import BM25Retriever
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import MessagesPlaceholder

In [26]:
import os

my_openai_api_key = "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"
os.environ["COHERE_API_KEY"] = "fgNPrLh4cGje0EBy7GL6P4pHR6NbNkjobUgsFcIF"

# Data Loading and Vectorization

In [12]:
file_path = r"Compilers Overview.txt"
data = []
with open(file_path, 'r') as file:
    for line in file:
        if line != '\n':
            data.append(line)


In [13]:
from langchain_community.document_loaders.pdf import PyPDFLoader

file_paths = [
    "./CSC 4351 Midterm Notes.pdf",
    "./overview.pdf",
    "./lexical.pdf",
    "./parsing.pdf"
]
page_contents = []
for file_path in file_paths:
    pdf_loader = PyPDFLoader(file_path)
    docs = pdf_loader.load()
    page_contents += [doc.page_content for doc in docs]


In [15]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100, separators=['.', '\n'])

# Function to prepare documents with metadata
def prepare_documents_with_metadata(data, page_contents):
    """
    Prepare a list of documents with metadata from a list of articles.
    :param articles: A list of articles in JSON format.
    :return: A list of Document objects.
    """
    
    prepared_docs = []
    title = ""
    for line in data:
        # Extract necessary fields from each article
        if ":" in line:
            title = line
        else:
            content = line
            # Create a Document object with metadata
            doc = Document(page_content=content, metadata={"title": title})
            prepared_docs.append(doc)
    for page in page_contents:
        doc = Document(page_content=page, metadata={"title": "PDF Page"})
        prepared_docs.append(doc)
    
    return prepared_docs

#json_data = json.loads(data_text)
#print(len(json_data['rows']))
docs = prepare_documents_with_metadata(data, page_contents) 

#docs = text_splitter.split_documents(docs)
print(docs[0])

page_content="as an introduction to compilers let's have a look at the compiler architecture in general so if you look at a tool like say gcc the new c compiler it is not just a compiler it's it cons it drives multiple different tools so first of all the source code is not actual c source code instead it is c source code together with preprocessor macros like pound include pound define etc which could be called maybe skeletal source code and this is fed to the c preprocessor which then produces the actual source code now comes uh comes the compiler which translates the source code not to an object file not to an executable but to assembly text why because we already have an assembler in our system we already have uh other tools that can uh take it from the assembly text file all the way to an executable it is much easier to to generate assembly text than to generate an executable the assembler then assembles the code and produces a relocatable machine code relocatable means that the as

In [16]:
print(docs[10].metadata)

{'title': 'NFA to DFA Example: https://youtu.be/lbAY6uqq9fs\n'}


## Defining Semantic & Lexical Retrievers

In [22]:
# Using the OpenAI embedding model to create vector embeddings for each chunk
embeddings = OpenAIEmbeddings(openai_api_key=my_openai_api_key)
# Storing chunks along with their vector embeddings into a Chroma database
db = Chroma.from_documents(docs, embeddings)
# Defining our semantic retriever, which will return the top-7 most semantically relevant chunks
semantic_retriever = db.as_retriever(k=7)
# Defining our lexical retriever, which uses the BM25 algorithm, to retrieve the top-7 most
# lexically similar chunks
bm25_retriever = BM25Retriever.from_documents(docs, k=7)
# Merge retrievers together into a single retriever, which will return up to 10 chunks
merged_retriever = MergerRetriever(retrievers=[semantic_retriever, bm25_retriever])

## Reranker and Final Retriever

In [27]:
# We are using Cohere Rerank as our compression algorithm
compressor = CohereRerank( top_n=5)
# We define a new retriever than first uses the base_retriever to retrieve documents and then the
# base_compressor to filter them
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=merged_retriever)

# LLM & Chain Setup

In [29]:
# ChatGPT
llm = ChatOpenAI(openai_api_key=my_openai_api_key, model="gpt-3.5-turbo", temperature=0.0)

## Defining First Chain
### This chain's job is to take a question and a chat history and create a version of the question that is contextualized with the chat history

In [30]:
# Define a prompt to contextualize the user's question

contextualize_system_prompt = "Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is."
contextualize_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
contextualize_chain = contextualize_prompt | llm | StrOutputParser()

## Let's Test the Contextualization Chain

## Creating the RAG Template

In [31]:
# Create our prompt
rag_template = """
{context}
You are an advisor for a student learning about compilers specifically lexical analysis and parsing. 
Use any information from the context to anwser the question. Site where you find the knowledge. If you can't find information from the document, use your own knowledge to answer the question.
When needed use ascii art to help explain concepts and draw things like parse trees or tables.
Question: {question}
"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)

## Creating Main Chain
### This chain needs to be able to dynamically determine if the question needs to be contextualized (which is not the case when there is no chat history)

In [32]:
# itemgetter is built into Python and allows you to create a function that
# returns the value of a key
from operator import itemgetter
# Create the chain
from langchain_core.output_parsers import StrOutputParser

chain = (
    RunnablePassthrough()
    | RunnablePassthrough().assign(context = itemgetter("question") | compression_retriever)
    | rag_prompt
    | llm
    | StrOutputParser()
)

In [33]:
import tkinter as tk
from tkinter import messagebox
from tkinter import scrolledtext

# Create the main window
window = tk.Tk()
window.title("Question Popup")

# Improved color scheme with an accent color
dark_background = '#333333'  # Slightly lighter dark background for variety
text_color = 'white'
accent_color = '#4CAF50'  # A green accent color for interactive elements

# Use modern fonts
font_family = 'Arial'
base_font_size = 12

# Set the color scheme to dark mode with improved aesthetics
window.configure(bg=dark_background)

# Create a scrolled text widget for the chat history with improved visuals
chat_history = scrolledtext.ScrolledText(window, width=70, height=10, bg=dark_background, fg=text_color, font=(font_family, base_font_size))
chat_history.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)  # Added padding

# Create a label and entry for the question with refined styling
question_label = tk.Label(window, text="Enter your question:", bg=dark_background, fg=text_color, font=(font_family, base_font_size))
question_label.pack(pady=(10, 0))  # Added vertical padding for spacing
question_entry = tk.Entry(window, width=100, bg=dark_background, fg=text_color, insertbackground=text_color)  # Ensure cursor is visible
question_entry.pack(pady=(0, 10))  # Added vertical padding for spacing

def get_question(event=None):
    question = question_entry.get()
    if question.strip() == "":
        messagebox.showerror("Error", "Please enter a question.")
        return
    chat_history.insert(tk.END, f"\nUser: {question}\n")
    # Simulate response for demonstration
    response = chain.invoke({"question": question})
    chat_history.insert(tk.END, f"AI: {response}\n")
    chat_history.see(tk.END)
    question_entry.delete(0, tk.END)

# Create a button to submit the question with improved design
submit_button = tk.Button(window, text="Submit", command=get_question, bg=accent_color, fg='white', font=(font_family, base_font_size), bd=0, padx=10, pady=5)
submit_button.pack()

# Bind the Enter key to submit the question
window.bind('<Return>', get_question)

# Run the main event loop
window.mainloop()
