### Cosine Similarity


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def calculate_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    A new ``TfidfVectorizer`` is fitted on just the two inputs, so the
    vocabulary and IDF weights are derived from this pair alone.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    tfidf_options = {
        "stop_words": 'english',
        "use_idf": True,
        "norm": 'l2',
        "ngram_range": (1, 2),  # unigrams and bigrams
        "sublinear_tf": True,
        "analyzer": 'word',
    }
    pair_matrix = TfidfVectorizer(**tfidf_options).fit_transform([text1, text2])
    # Slices (not integer indexing) keep both operands two-dimensional.
    first_row, second_row = pair_matrix[0:1], pair_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]

### Enhanced Similarity


In [4]:
import nltk
import spacy
import numpy as np

from nltk.corpus import wordnet
from collections import Counter

# Make sure the WordNet corpus queried by get_synonyms is available locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load the spaCy model once at module level; preprocess_text reuses it.
nlp = spacy.load("en_core_web_sm")

def get_synonyms(word):
    """Collect the WordNet lemma names found across every synset of ``word``.

    Returns:
        set: Lemma names; empty when ``wordnet.synsets`` yields nothing.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def preprocess_text(text):
    """Lower-case ``text``, run it through the spaCy pipeline and return lemmas.

    Tokens flagged as stop words or punctuation are dropped; the remaining
    lemmas keep their original order.
    """
    return [
        token.lemma_
        for token in nlp(text.lower())
        if not (token.is_stop or token.is_punct)
    ]

def expand_with_synonyms(words):
    """Return a new list: ``words`` followed by the synonyms of each word.

    The input list is left untouched. Synonyms are appended per word in input
    order, and duplicates are kept.
    """
    expanded = words.copy()
    for term in words:
        expanded += get_synonyms(term)
    return expanded

def calculate_enhanced_similarity(text1, text2):
    """Cosine similarity of lemmatized, synonym-expanded bag-of-words vectors.

    Each text is run through ``preprocess_text`` and ``expand_with_synonyms``;
    the resulting terms are counted and the two count vectors are compared
    over the union of their vocabularies.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        float: Cosine similarity of the two count vectors, or ``0.0`` when
        either text yields no terms (previously this divided by zero and
        produced ``nan`` with a RuntimeWarning).
    """
    freq1 = Counter(expand_with_synonyms(preprocess_text(text1)))
    freq2 = Counter(expand_with_synonyms(preprocess_text(text2)))

    # Both vectors are built from the same iteration over the same set, so
    # their axes line up; Counter returns 0 for terms missing on one side.
    unique_words = set(freq1) | set(freq2)
    vector1 = np.array([freq1[word] for word in unique_words])
    vector2 = np.array([freq2[word] for word in unique_words])

    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        # At least one vector is all zeros: nothing to compare.
        return 0.0

    # Named `similarity` so the imported sklearn `cosine_similarity` is not shadowed.
    similarity = np.dot(vector1, vector2) / denominator
    return similarity

## Naive RAG

In [13]:
import textwrap

def print_formatted_response(response):
    """Print ``response`` wrapped at 150 columns between two dashed rules.

    Output is a "Response:" header, a 20-dash rule, the wrapped text, a
    closing 20-dash rule and one blank line.
    """
    rule = "-" * 20
    body = textwrap.fill(response, width=150)
    for line in ("Response:", rule, body, rule + "\n"):
        print(line)

In [7]:
def find_best_match_keyword_search(query, db_records):
    """Pick the record sharing the most whitespace-delimited tokens with ``query``.

    Both sides are lower-cased and split on whitespace; punctuation is not
    stripped, so "(rag)" and "rag" count as different tokens.

    Returns:
        tuple: ``(score, record)`` for the first record with the highest
        overlap count, or ``(0, None)`` when no record shares a token.
    """
    wanted = set(query.lower().split())
    top_score, top_record = 0, None
    for candidate in db_records:
        overlap = len(wanted & set(candidate.lower().split()))
        # Strict comparison: on ties the earliest record wins.
        if overlap > top_score:
            top_score, top_record = overlap, candidate
    return top_score, top_record

In [8]:
# In-memory corpus used by every retrieval demo in this notebook: one sentence per record.
db_records = [
    "Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach in the field of artificial intelligence, particularly within the realm of natural language processing (NLP).",
    "It innovatively combines the capabilities of neural network-based language models with retrieval systems to enhance the generation of text, making it more accurate, informative, and contextually relevant.",
    "This methodology leverages the strengths of both generative and retrieval architectures to tackle complex tasks that require not only linguistic fluency but also factual correctness and depth of knowledge.",
    "At the core of Retrieval Augmented Generation (RAG) is a generative model, typically a transformer-based neural network, similar to those used in models like GPT (Generative Pre-trained Transformer) or BERT (Bidirectional Encoder Representations from Transformers).",
    "This component is responsible for producing coherent and contextually appropriate language outputs based on a mixture of input prompts and additional information fetched by the retrieval component.",
    "Complementing the language model is the retrieval system, which is usually built on a database of documents or a corpus of texts.",
    "This system uses techniques from information retrieval to find and fetch documents that are relevant to the input query or prompt.",
    "The mechanism of relevance determination can range from simple keyword matching to more complex semantic search algorithms which interpret the meaning behind the query to find the best matches.",
    "This component merges the outputs from the language model and the retrieval system.",
    "It effectively synthesizes the raw data fetched by the retrieval system into the generative process of the language model.",
    "The integrator ensures that the information from the retrieval system is seamlessly incorporated into the final text output, enhancing the model's ability to generate responses that are not only fluent and grammatically correct but also rich in factual details and context-specific nuances.",
    "When a query or prompt is received, the system first processes it to understand the requirement or the context.",
    "Based on the processed query, the retrieval system searches through its database to find relevant documents or information snippets.",
    "This retrieval is guided by the similarity of content in the documents to the query, which can be determined through various techniques like vector embeddings or semantic similarity measures.",
    "The retrieved documents are then fed into the language model.",
    "In some implementations, this integration happens at the token level, where the model can access and incorporate specific pieces of information from the retrieved texts dynamically as it generates each part of the response.",
    "The language model, now augmented with direct access to retrieved information, generates a response.",
    "This response is not only influenced by the training of the model but also by the specific facts and details contained in the retrieved documents, making it more tailored and accurate.",
    "By directly incorporating information from external sources, Retrieval Augmented Generation (RAG) models can produce responses that are more factual and relevant to the given query.",
    "This is particularly useful in domains like medical advice, technical support, and other areas where precision and up-to-date knowledge are crucial.",
    "Retrieval Augmented Generation (RAG) systems can dynamically adapt to new information since they retrieve data in real-time from their databases.",
    "This allows them to remain current with the latest knowledge and trends without needing frequent retraining.",
    "With access to a wide range of documents, Retrieval Augmented Generation (RAG) systems can provide detailed and nuanced answers that a standalone language model might not be capable of generating based solely on its pre-trained knowledge.",
    "While Retrieval Augmented Generation (RAG) offers substantial benefits, it also comes with its challenges.",
    "These include the complexity of integrating retrieval and generation systems, the computational overhead associated with real-time data retrieval, and the need for maintaining a large, up-to-date, and high-quality database of retrievable texts.",
    "Furthermore, ensuring the relevance and accuracy of the retrieved information remains a significant challenge, as does managing the potential for introducing biases or errors from the external sources.",
    "In summary, Retrieval Augmented Generation represents a significant advancement in the field of artificial intelligence, merging the best of retrieval-based and generative technologies to create systems that not only understand and generate natural language but also deeply comprehend and utilize the vast amounts of information available in textual form.",
    "A RAG vector store is a database or dataset that contains vectorized data points."
]

In [9]:
# The user query reused by the retrieval cells that follow.
query = "Define a RAG store"

# Naive retrieval: rank records by the number of tokens shared with the query.
best_keyword_score, best_keyword_record = find_best_match_keyword_search(query, db_records)
print(f"Best keyword Score: {best_keyword_score}")
print_formatted_response(best_keyword_record)

Best keyword Score: 3
Response:
--------------------
A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



In [10]:
# Score the keyword-search hit against the query with the TF-IDF cosine metric.
score = calculate_cosine_similarity(query, best_keyword_record)
print(f"Best Cosine Similarity Score: {score:.3f}")

Best Cosine Similarity Score: 0.126


In [11]:
# Re-score the same record with the lemma- and synonym-based similarity for comparison.
response = best_keyword_record
print(query, ": ", response)
similarity_score = calculate_enhanced_similarity(query, response)
print(f"Enhanced Similarity Score: {similarity_score:.3f}")

Define a RAG store :  A RAG vector store is a database or dataset that contains vectorized data points.
Enhanced Similarity Score: 0.642


In [14]:
# Augmentation step: prepend the query to the retrieved record to form the LLM input.
augmented_input = query + ": " + best_keyword_record
print_formatted_response(augmented_input)

Response:
--------------------
Define a RAG store: A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



In [17]:
import os
import openai

# Read the key from the environment instead of hard-coding it; os.getenv returns None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [18]:
import time
from openai import OpenAI

# Module-level client and model name; call_llm_with_full_text reads both as globals.
client = OpenAI()
gptmodel = "gpt-4o"

# NOTE(review): start_time is not read anywhere in the visible notebook - confirm it is still needed.
start_time = time.time()

In [19]:
def call_llm_with_full_text(itext):
    """Ask the chat model to elaborate on ``itext`` and translate the result to Korean.

    Args:
        itext: Either a single string or an iterable of strings. An iterable
            is joined with newlines; a plain string is sent unchanged.

    Returns:
        str: The stripped content of the first completion choice. If the
        request raises, the exception message is returned instead
        (best-effort behaviour kept from the original; callers print it).
    """
    # A bare str is an iterable of characters, so joining it would insert a
    # newline between every character of the prompt. The notebook passes a
    # plain string, so it must be used as-is.
    if isinstance(itext, str):
        text_input = itext
    else:
        text_input = '\n'.join(itext)

    prompt = f"""
    Please elaborate on the following content and translate the result in "KOREAN":\n{text_input}
    """
    # NOTE(review): the second message carries an instruction but is sent with
    # the "assistant" role - confirm that this is intended.
    messages = [
        {
            "role": "system",
            "content": "You are an expert Natural Language Processing exercise expert. "
        },
        {
            "role": "assistant",
            "content": "1. You can explain read the input and answer in detail"
        },
        {
            "role": "user",
            "content": prompt
        },
    ]
    try:
        response = client.chat.completions.create(
            model=gptmodel,
            messages=messages,
            temperature=0.1
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Errors are surfaced as the returned text rather than raised.
        return str(e)

In [20]:
# Generation step: send the augmented input (query + keyword-retrieved record) to the LLM.
llm_response = call_llm_with_full_text(augmented_input)
print_formatted_response(llm_response)

Response:
--------------------
A RAG (Retrieval-Augmented Generation) vector store is a specialized database or dataset designed to store and manage vectorized data points. These
data points are typically numerical representations of information, such as text, images, or other types of data, that have been transformed into
vectors. The purpose of a RAG vector store is to facilitate efficient retrieval and manipulation of these vectorized data points, often used in
machine learning and artificial intelligence applications to enhance the performance of models by providing relevant context or information during the
generation process.  Korean Translation: RAG 벡터 저장소는 벡터화된 데이터 포인트를 저장하고 관리하기 위해 설계된 특수한 데이터베이스 또는 데이터셋입니다. 이러한 데이터 포인트는 일반적으로 텍스트, 이미지 또는 기타 유형의 데이터를
벡터로 변환한 수치적 표현입니다. RAG 벡터 저장소의 목적은 이러한 벡터화된 데이터 포인트의 효율적인 검색과 조작을 용이하게 하는 것으로, 주로 기계 학습 및 인공지능 응용 프로그램에서 모델의 성능을 향상시키기 위해 관련된 컨텍스트나 정보를 제공하는 데 사용됩니다.
--------------------



## Advanced RAG

In [21]:
def find_best_match(text_input, records):
    """Return the record most similar to ``text_input`` by TF-IDF cosine score.

    Every record is scored individually with ``calculate_cosine_similarity``.

    NOTE: a later cell in this notebook redefines ``find_best_match`` with a
    ``(query, vectorizer, tfidf_matrix)`` signature; after that cell runs,
    this version is no longer reachable under this name.

    Returns:
        tuple: ``(score, record)`` for the first record with the highest
        score, or ``(0, None)`` when no record scores above zero.
    """
    top_score, top_record = 0, None
    for candidate in records:
        candidate_score = calculate_cosine_similarity(text_input, candidate)
        # Strict comparison: on ties the earliest record wins.
        if candidate_score > top_score:
            top_score, top_record = candidate_score, candidate
    return top_score, top_record

In [22]:
# Retrieve again, this time ranking every record by TF-IDF cosine similarity to the query.
best_similarity_score, best_similarity_record = find_best_match(query, db_records)
print_formatted_response(best_similarity_record)

Response:
--------------------
A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



In [23]:
# Report the score of the record selected by cosine similarity.
print(f"Best Cosine Similarity Score: {best_similarity_score:.3f}")

Best Cosine Similarity Score: 0.126


In [24]:
# Cross-check the cosine-selected record with the synonym-expanded similarity.
response = best_similarity_record
print(query, ": ", response)
similarity_score = calculate_enhanced_similarity(query, best_similarity_record)
print(f"Enhanced Similarity: {similarity_score:.3f}")

Define a RAG store :  A RAG vector store is a database or dataset that contains vectorized data points.
Enhanced Similarity: 0.642


In [25]:
# Rebuild the augmented input from the cosine-selected record (overwrites the earlier value).
augmented_input = query + ": " + best_similarity_record
print_formatted_response(augmented_input)

Response:
--------------------
Define a RAG store: A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



In [26]:
# Generation step for the cosine-similarity retrieval path.
llm_response = call_llm_with_full_text(augmented_input)
print_formatted_response(llm_response)

Response:
--------------------
A RAG (Retrieval-Augmented Generation) vector store is a specialized database or dataset designed to store and manage vectorized data points. These
data points are typically numerical representations of information, often derived from text or other forms of data, that have been transformed into
vectors. The purpose of a RAG vector store is to facilitate efficient retrieval and manipulation of these vectors, which can be used in various
applications such as machine learning, natural language processing, and information retrieval. By storing data in vector form, it becomes easier to
perform operations like similarity searches, clustering, and classification, which are essential for tasks that involve understanding and generating
human-like text or other complex data interactions.  Korean Translation: RAG(검색 증강 생성) 벡터 저장소는 벡터화된 데이터 포인트를 저장하고 관리하기 위해 설계된 특수한 데이터베이스 또는 데이터셋입니다.
이러한 데이터 포인트는 일반적으로 텍스트 또는 기타 형태의 데이터에서 파생된 정보를 수치적으로 표현한 것으로, 벡터로 변환된 것입니다. RAG 벡터 

## Index-based search

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def setup_vectorizer(records):
    """Fit a default ``TfidfVectorizer`` on ``records``.

    Returns:
        tuple: ``(fitted_vectorizer, tfidf_matrix)`` where the matrix is the
        result of ``fit_transform(records)``.
    """
    fitted = TfidfVectorizer()
    return fitted, fitted.fit_transform(records)

def find_best_match(query, vectorizer, tfidf_matrix):
    """Find the row of ``tfidf_matrix`` most similar to ``query``.

    The query is projected with the already-fitted ``vectorizer`` and compared
    against every row at once.

    Returns:
        tuple: ``(best_score, best_index)``; the index refers to a row of
        ``tfidf_matrix``.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix)
    # Only one query row is passed, so the flat argmax is the column index.
    winner = scores.argmax()
    return scores[0, winner], winner

In [30]:
# Fit TF-IDF once over the whole corpus, then look the best record up by its row index.
vectorizer, tfidf_matrix = setup_vectorizer(db_records)
best_similarity_score, best_index = find_best_match(query, vectorizer, tfidf_matrix)
best_matching_record = db_records[best_index]

print_formatted_response(best_matching_record)

Response:
--------------------
A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



In [36]:
import pandas as pd

# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term, for inspection only.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,ability,access,accuracy,accurate,adapt,additional,advancement,advice,algorithms,allows,...,vector,vectorized,when,where,which,while,wide,with,within,without
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260582,0.0
1,0.0,0.0,0.0,0.216364,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160278,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.236479,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Augmented input for the index-based retrieval path (overwrites the earlier value).
augmented_input = query + ": " + best_matching_record
print_formatted_response(augmented_input)

Response:
--------------------
Define a RAG store: A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



## Modular RAG

In [39]:
class RetrievalComponent:
    """Pluggable retriever over a list of text records.

    ``method`` selects the strategy used by :meth:`retrieve`:

    * ``'keyword'`` - number of shared lower-cased, whitespace-split tokens.
    * ``'vector'`` / ``'indexed'`` - TF-IDF cosine similarity against the
      matrix built in :meth:`fit`.

    Results always come from the records passed to :meth:`fit`, never from a
    notebook-level global.
    """

    # Strategies that need a fitted TF-IDF matrix.
    _TFIDF_METHODS = ('vector', 'indexed')

    def __init__(self, method='vector'):
        """Remember the strategy and create a vectorizer when it needs one."""
        self.method = method
        # Defined up front so the attribute exists even before fit() is called.
        self.documents = []
        if self.method in self._TFIDF_METHODS:
            self.vectorizer = TfidfVectorizer()
            self.tfidf_matrix = None

    def fit(self, records):
        """Store ``records`` and, for TF-IDF strategies, build their matrix."""
        self.documents = records
        if self.method in self._TFIDF_METHODS:
            self.tfidf_matrix = self.vectorizer.fit_transform(records)

    def retrieve(self, query):
        """Return the best-matching record for ``query`` using ``self.method``.

        Raises:
            ValueError: If ``self.method`` is not a known strategy (this used
                to fall through and silently return ``None``).
        """
        if self.method == 'keyword':
            return self.keyword_search(query)
        elif self.method == 'vector':
            return self.vector_search(query)
        elif self.method == 'indexed':
            return self.indexed_search(query)
        raise ValueError(
            f"Unknown retrieval method {self.method!r}; "
            "expected 'keyword', 'vector' or 'indexed'"
        )

    def keyword_search(self, query):
        """Return the first record sharing the most tokens with ``query``.

        Returns ``None`` when no record shares any token.
        """
        best_score = 0
        best_record = None
        query_keywords = set(query.lower().split())
        for doc in self.documents:
            score = len(query_keywords & set(doc.lower().split()))
            # Strict comparison: on ties the earliest record wins.
            if score > best_score:
                best_score = score
                best_record = doc
        return best_record

    def _tfidf_search(self, query):
        """Return the fitted record with the highest TF-IDF cosine similarity."""
        query_tfidf = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_tfidf, self.tfidf_matrix)
        # One query row, so the flat argmax is the index of the best record.
        best_index = int(similarities.argmax())
        # Index the fitted records, not the notebook-level `db_records`.
        return self.documents[best_index]

    def vector_search(self, query):
        """TF-IDF cosine search over the fitted records."""
        return self._tfidf_search(query)

    def indexed_search(self, query):
        """Same lookup as :meth:`vector_search`, kept as a separate entry point."""
        return self._tfidf_search(query)

In [40]:
# Modular usage: pick a strategy, fit it on the corpus, then retrieve for the query.
retrieval = RetrievalComponent(method='vector')
retrieval.fit(db_records)

best_matching_record = retrieval.retrieve(query)
print_formatted_response(best_matching_record)

Response:
--------------------
A RAG vector store is a database or dataset that contains vectorized data points.
--------------------



### 