### 1: Dependencies

In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader  # Importing PDF loader from Langchain
from langchain.document_loaders import CSVLoader  # Importing CSV loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings  # Importing OpenAI embeddings from Langchain
from langchain.schema import Document  # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma  # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI


import os  # Importing os module for operating system functionalities
import shutil  # Importing shutil module for high-level file operations

### 2: Read PDF

In [2]:
# Directory to your pdf files:
DATA_PATH = r"docs"

def load_documents():
    """
    Load PDF documents from the specified directory using PyPDFDirectoryLoader.

    Returns:
        List of Document objects: Loaded PDF documents represented as Langchain Document objects.
    """
    document_loader = CSVLoader(DATA_PATH + "/goodreads_reviews.csv")  # Initialize PDF loader with specified directory
    return document_loader.load()  # Load PDF documents and return them as a list of Document objects

In [3]:
documents = load_documents()
print(documents)

[Document(metadata={'source': 'docs/goodreads_reviews.csv', 'row': 0}, page_content='\ufeffreview_text: FROM HAN KANG, WINNER OF THE 2024 NOBEL PRIZE IN LITERATURE\n\n“[Han Kang writes in] intense poetic prose that . . . exposes the fragility of human life.”—from the Nobel Prize citation\n\nWINNER OF THE INTERNATIONAL BOOKER PRIZE • “Kang viscerally explores the limits of what a human brain and body can endure, and the strange beauty that can be found in even the most extreme forms of renunciation.”—Entertainment Weekly\n\nOne of the ’s 100 Best Books of the 21st Century\n\n“Ferocious.”— (Ten Best Books of the Year)\n“Both terrifying and terrific.”—Lauren Groff\n“Provocative [and] shocking.”—\n\nBefore the nightmares began, Yeong-hye and her husband lived an ordinary, controlled life. But the dreams—invasive images of blood and brutality—torture her, driving Yeong-hye to purge her mind and renounce eating meat altogether. It’s a small act of independence, but it interrupts her marriage

In [4]:
type(documents)

list

### 3: Split into chunks of text

Is this step necessary or useful for my application?

In [5]:
def split_text(documents: list[Document]):
    """
    Split the text content of the given list of Document objects into smaller chunks.

    Args:
        documents (list[Document]): List of Document objects containing text content to split.

    Returns:
        list[Document]: List of Document objects representing the split text chunks.
    """
    # Initialize text splitter with specified parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,  # Size of each chunk in characters
        chunk_overlap=100,  # Overlap between consecutive chunks
        length_function=len,  # Function to compute the length of the text
        add_start_index=True,  # Flag to add start index to each chunk
    )
    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print example of page content and metadata for a chunk
    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks  # Return the list of split text chunks

In [6]:
chunks = split_text(documents)

Split 32 documents into 287 chunks.
As a vegan I’ve experienced some of the things that I witnessed here. I can relate to it. I’ve lived it. I’ve been called a heretic. I’ve lost friends over it, and experienced much social awkwardness just for my lifestyle choice. I’m not pushy with my beliefs. Sometimes all it takes is a mention of the word “vegetarian” to be received with utter hostility or blankness. This review isn’t about the
{'source': 'docs/goodreads_reviews.csv', 'row': 2, 'start_index': 187}


In [7]:
for chunk in chunks:
    print(chunk)
    print("\n")

page_content='﻿review_text: FROM HAN KANG, WINNER OF THE 2024 NOBEL PRIZE IN LITERATURE

“[Han Kang writes in] intense poetic prose that . . . exposes the fragility of human life.”—from the Nobel Prize citation' metadata={'source': 'docs/goodreads_reviews.csv', 'row': 0, 'start_index': 0}


page_content='WINNER OF THE INTERNATIONAL BOOKER PRIZE • “Kang viscerally explores the limits of what a human brain and body can endure, and the strange beauty that can be found in even the most extreme forms of renunciation.”—Entertainment Weekly

One of the ’s 100 Best Books of the 21st Century' metadata={'source': 'docs/goodreads_reviews.csv', 'row': 0, 'start_index': 198}


page_content='One of the ’s 100 Best Books of the 21st Century

“Ferocious.”— (Ten Best Books of the Year)
“Both terrifying and terrific.”—Lauren Groff
“Provocative [and] shocking.”—' metadata={'source': 'docs/goodreads_reviews.csv', 'row': 0, 'start_index': 433}


page_content='Before the nightmares began, Yeong-hye and her 

### 4: Save to a RDB using Chroma

In [8]:
CHROMA_PATH = "chroma"

In [9]:
def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    
    # print(chunks)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

### 5: Create a Chroma Database

In [10]:
def generate_data_store():
    """
    Function to generate vector database in chroma from documents.
    """
    documents = load_documents()  # Load documents from a source
    chunks = split_text(documents)  # Split documents into manageable chunks
    save_to_chroma(chunks)  # Save the processed data to a data store


In [11]:
# Load environment variables from a .env file
load_dotenv()
# Generate the data store
generate_data_store()

Split 32 documents into 287 chunks.
As a vegan I’ve experienced some of the things that I witnessed here. I can relate to it. I’ve lived it. I’ve been called a heretic. I’ve lost friends over it, and experienced much social awkwardness just for my lifestyle choice. I’m not pushy with my beliefs. Sometimes all it takes is a mention of the word “vegetarian” to be received with utter hostility or blankness. This review isn’t about the
{'source': 'docs/goodreads_reviews.csv', 'row': 2, 'start_index': 187}


  chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH


Saved 287 chunks to chroma.


  db.persist()


#### Embedding example

In [12]:
ex = "apple"
ex_1 = "orange"
ex_2 = "iphone"

In [13]:
embedding_function = OpenAIEmbeddings()
vector = embedding_function.embed_query(ex)
vector_1 = embedding_function.embed_query(ex_1)
vector_2 = embedding_function.embed_query(ex_2)

In [14]:
vector, len(vector)

([0.007730894059324105,
  -0.02313804706855683,
  -0.007587476431447153,
  -0.027809365687387712,
  -0.004650829521477468,
  0.013010029201724864,
  -0.02196338849083348,
  -0.008393346623632073,
  0.018958446297110524,
  -0.02955769477661887,
  -0.0029264030597849337,
  0.020078469765418393,
  -0.004415214680800583,
  0.009158240949416682,
  -0.021649234749049225,
  0.002014676411784905,
  0.030732353354342223,
  0.00010212104452982253,
  0.0020266278419695427,
  -0.025460046669295792,
  -0.02106190546018755,
  -0.008195294352000941,
  0.0213760592019718,
  -0.012552459229047571,
  0.001133682362077065,
  0.005043520767385176,
  0.010196311753601323,
  7.81647495259728e-05,
  0.016062776184863757,
  -0.013023687979078734,
  0.020460917393972003,
  -0.016158387626340857,
  -0.01838477578560272,
  0.00544304140196982,
  -0.019381870257725746,
  -0.009171899726770554,
  -0.01203342382695525,
  -0.008707500365416326,
  -0.005702558637354677,
  -0.006166958930031512,
  0.010524123341416839

In [15]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("pairwise_embedding_distance")

In [16]:
# run an evaluation

x = evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex_1)

In [17]:
x

{'score': 0.13554126333631622}

In [18]:
evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex_2)

{'score': 0.09709082173706307}

In [19]:
evaluator.evaluate_string_pairs(prediction=ex, prediction_b=ex)

{'score': -2.220446049250313e-16}

Bigger distance = strings are more different

### 6: Query vector database for relevant data

In [41]:
import pandas as pd

df = pd.read_csv("docs/test_data.csv")
titles = list(df['title'].values)
titles = ['채식주의자']

In [50]:
query_text =  "이 소설에 대한 논점"
print(titles[0])

채식주의자


In [73]:
PROMPT_TEMPLATE = """
{context}를 기반으로 답변하되 다양한 관점으로 한글로 답변하십시오.

---

질문: {question}
답변:
"""


In [82]:
# Use same embedding function as before
embedding_function = OpenAIEmbeddings()
 
# Prepare the database
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

# Search the DB.
results = db.similarity_search_with_relevance_scores(query_text, k=20)
if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")

In [83]:
from langchain.prompts import ChatPromptTemplate

In [84]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

Human: 
﻿review_text: Librarian Note: There is more than one author by this name in the Goodreads database.

소설가 한강

---

As a professional writer and a professor of creative writing at Seoul Arts University, Han Kang, however, takes her writing seriously: her psychological descriptions are extremely delicate, showcasing the strengths of a female writer; and her writing perspective is unique.

---

***
What a better moment to read this than in Seoul while eating lots of Korean meat.
page: 1

---

﻿review_text: In my country, compared to the vast expanse dedicated to Japanese literature in bookstores, the space allocated to Korean literature is pitifully small. Moreover, translations of serious literary works are scarce, making it difficult to understand Korean society through such works. So, my impression of Korea is largely shaped by the various exaggerations and melodramatic plots of

---

소설가 한강

Han Kang was born in 1970 in South Korea. She is the author of The Vegetarian, winner o

In [85]:
model = ChatOpenAI()
response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Response: 이 소설에 대한 논점은 여러 가지가 있을 수 있습니다. 먼저, 이 소설은 한국 사회의 여성에 대한 억압과 불평등 문제를 다루고 있습니다. 또한, 주인공인 영혜의 선택이 주변 사람들에게 미치는 영향과 그녀의 인간적 삶의 취약성을 드러내고 있습니다. 또한, 작가 한강은 현대 사람들이 느끼는 깊은 절망에 대해 다루고 있으며, 이를 통해 현대 사회의 압박, 직장의 강한 제한, 가족 갈등의 고통 등을 다루고 있습니다. 이러한 논점들을 통해 이 소설은 한국 사회뿐만 아니라 세계적으로도 사회적 문제와 개인의 삶에 대한 비판을 제기하고 있습니다.
Sources: ['docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv', 'docs/goodreads_reviews.csv']


In [86]:
response_text

'이 소설에 대한 논점은 여러 가지가 있을 수 있습니다. 먼저, 이 소설은 한국 사회의 여성에 대한 억압과 불평등 문제를 다루고 있습니다. 또한, 주인공인 영혜의 선택이 주변 사람들에게 미치는 영향과 그녀의 인간적 삶의 취약성을 드러내고 있습니다. 또한, 작가 한강은 현대 사람들이 느끼는 깊은 절망에 대해 다루고 있으며, 이를 통해 현대 사회의 압박, 직장의 강한 제한, 가족 갈등의 고통 등을 다루고 있습니다. 이러한 논점들을 통해 이 소설은 한국 사회뿐만 아니라 세계적으로도 사회적 문제와 개인의 삶에 대한 비판을 제기하고 있습니다.'

In [79]:
def query_rag(query_text):
    """
    Query a Retrieval-Augmented Generation (RAG) system using Chroma database and OpenAI.

    Args:
    - query_text (str): The text to query the RAG system with.

    Returns:
    - formatted_response (str): Formatted response including the generated text and sources.
    - response_text (str): The generated response text.
    """
    # Use same embedding function as before
    embedding_function = OpenAIEmbeddings()
    
    # Prepare the database
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    
    # Check if there are any matching results or if the relevance score is too low
    if len(results) == 0 or results[0][1] < 0.7:
        print(f"Unable to find matching results.")

    # Combine context from matching documents
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    
    # Create prompt template using context and query text
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Initialize OpenAI chat model
    model = ChatOpenAI()
    
    # Generate response text based on the prompt
    response_text = model.predict(prompt)

    # Get sources of the matching documents
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    
    # Format and return response including generated text and sources
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    return formatted_response, response_text

In [80]:
formatted_response, response_text = query_rag(query_text)

In [81]:
response_text

'이 소설은 한강의 섬세한 심리 묘사와 독특한 시각을 보여주며 여성 작가의 강점을 잘 드러내고 있습니다. 또한 서울 예술대학교 창작문학 교수로서 전문 작가로서 그녀는 자신의 글쓰기를 진지하게 대합니다. 이 소설은 서울에서 한국 고기를 많이 먹으며 읽기에 더할 나위 없는 순간입니다.'