In [3]:
import os
import openai
from openai import OpenAI
import time
import textwrap

In [8]:
#!pip install openai==1.40.3

In [4]:
with open('api_key.txt','r') as key:
    API_KEY = key.readline().strip()

os.environ['OPENAI_API_KEY'] = API_KEY # we define the env varibale to be accessible for all scripts not just the currently one.
openai.api_key = os.getenv('OPENAI_API_KEY')

In [5]:
client = OpenAI()
gptmodel='gpt-4o'
start_time = time.time()

In [6]:
# function which creates a prompt
def call_llm_with_full_txt(txt):
    txt_input= textwrap.fill(txt,width=30)
    prompt = f"Please elaborate on the following content:\n{txt_input}"
    try:
        response = client.chat.completions.create(
            model= gptmodel,
            messages=[{'role':'system', 'content':"You are an expert Natural Language Processing exercise expert."},
                  {"role": "assistant", "content": "1.You can explain read the input and answer in detail"},
                  {"role": "user", "content": prompt}],
            temperature=0.1)
        return response.choices[0].message.content.strip()
    except Exception as e :
        return str(e)

In [7]:
def print_formatted_response(response):
    wrapper = textwrap.TextWrapper(width=80)
    wrap_res = wrapper.fill(text=response)
    print("Response:")
    print("---------")
    print(wrap_res)
    print("-------\n")

In [8]:
db_records = [
    "Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach in the field of artificial intelligence, particularly within the realm of natural language processing (NLP).",
    "It innovatively combines the capabilities of neural network-based language models with retrieval systems to enhance the generation of text, making it more accurate, informative, and contextually relevant.",
    "This methodology leverages the strengths of both generative and retrieval architectures to tackle complex tasks that require not only linguistic fluency but also factual correctness and depth of knowledge.",
    "At the core of Retrieval Augmented Generation (RAG) is a generative model, typically a transformer-based neural network, similar to those used in models like GPT (Generative Pre-trained Transformer) or BERT (Bidirectional Encoder Representations from Transformers).",
    "This component is responsible for producing coherent and contextually appropriate language outputs based on a mixture of input prompts and additional information fetched by the retrieval component.",
    "Complementing the language model is the retrieval system, which is usually built on a database of documents or a corpus of texts.",
    "This system uses techniques from information retrieval to find and fetch documents that are relevant to the input query or prompt.",
    "The mechanism of relevance determination can range from simple keyword matching to more complex semantic search algorithms which interpret the meaning behind the query to find the best matches.",
    "This component merges the outputs from the language model and the retrieval system.",
    "It effectively synthesizes the raw data fetched by the retrieval system into the generative process of the language model.",
    "The integrator ensures that the information from the retrieval system is seamlessly incorporated into the final text output, enhancing the model's ability to generate responses that are not only fluent and grammatically correct but also rich in factual details and context-specific nuances.",
    "When a query or prompt is received, the system first processes it to understand the requirement or the context.",
    "Based on the processed query, the retrieval system searches through its database to find relevant documents or information snippets.",
    "This retrieval is guided by the similarity of content in the documents to the query, which can be determined through various techniques like vector embeddings or semantic similarity measures.",
    "The retrieved documents are then fed into the language model.",
    "In some implementations, this integration happens at the token level, where the model can access and incorporate specific pieces of information from the retrieved texts dynamically as it generates each part of the response.",
    "The language model, now augmented with direct access to retrieved information, generates a response.",
    "This response is not only influenced by the training of the model but also by the specific facts and details contained in the retrieved documents, making it more tailored and accurate.",
    "By directly incorporating information from external sources, Retrieval Augmented Generation (RAG) models can produce responses that are more factual and relevant to the given query.",
    "This is particularly useful in domains like medical advice, technical support, and other areas where precision and up-to-date knowledge are crucial.",
    "Retrieval Augmented Generation (RAG) systems can dynamically adapt to new information since they retrieve data in real-time from their databases.",
    "This allows them to remain current with the latest knowledge and trends without needing frequent retraining.",
    "With access to a wide range of documents, Retrieval Augmented Generation (RAG) systems can provide detailed and nuanced answers that a standalone language model might not be capable of generating based solely on its pre-trained knowledge.",
    "While Retrieval Augmented Generation (RAG) offers substantial benefits, it also comes with its challenges.",
    "These include the complexity of integrating retrieval and generation systems, the computational overhead associated with real-time data retrieval, and the need for maintaining a large, up-to-date, and high-quality database of retrievable texts.",
    "Furthermore, ensuring the relevance and accuracy of the retrieved information remains a significant challenge, as does managing the potential for introducing biases or errors from the external sources.",
    "In summary, Retrieval Augmented Generation represents a significant advancement in the field of artificial intelligence, merging the best of retrieval-based and generative technologies to create systems that not only understand and generate natural language but also deeply comprehend and utilize the vast amounts of information available in textual form.",
    "A RAG vector store is a database or dataset that contains vectorized data points."
]

In [9]:
db_records_join = ''.join(db_records)

In [10]:
db_records_join

"Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach in the field of artificial intelligence, particularly within the realm of natural language processing (NLP).It innovatively combines the capabilities of neural network-based language models with retrieval systems to enhance the generation of text, making it more accurate, informative, and contextually relevant.This methodology leverages the strengths of both generative and retrieval architectures to tackle complex tasks that require not only linguistic fluency but also factual correctness and depth of knowledge.At the core of Retrieval Augmented Generation (RAG) is a generative model, typically a transformer-based neural network, similar to those used in models like GPT (Generative Pre-trained Transformer) or BERT (Bidirectional Encoder Representations from Transformers).This component is responsible for producing coherent and contextually appropriate language outputs based on a mixture of input prompts an

In [11]:
wraped_db_records = textwrap.fill(db_records_join, width=30)

In [12]:
wraped_db_records

"Retrieval Augmented Generation\n(RAG) represents a\nsophisticated hybrid approach\nin the field of artificial\nintelligence, particularly\nwithin the realm of natural\nlanguage processing (NLP).It\ninnovatively combines the\ncapabilities of neural\nnetwork-based language models\nwith retrieval systems to\nenhance the generation of\ntext, making it more accurate,\ninformative, and contextually\nrelevant.This methodology\nleverages the strengths of\nboth generative and retrieval\narchitectures to tackle\ncomplex tasks that require not\nonly linguistic fluency but\nalso factual correctness and\ndepth of knowledge.At the core\nof Retrieval Augmented\nGeneration (RAG) is a\ngenerative model, typically a\ntransformer-based neural\nnetwork, similar to those used\nin models like GPT (Generative\nPre-trained Transformer) or\nBERT (Bidirectional Encoder\nRepresentations from\nTransformers).This component\nis responsible for producing\ncoherent and contextually\nappropriate language outputs\nbas

In [13]:
query = 'define a rag store'

In [14]:
llm_response = call_llm_with_full_txt(query)
print_formatted_response(llm_response)

Response:
---------
A "rag store" typically refers to a shop or business that deals in the buying
and selling of rags or used clothing. Historically, rag stores were places where
people could sell their old clothes or textiles, which would then be sorted,
cleaned, and either resold as second-hand clothing or repurposed for other uses.
These stores played a significant role in the recycling and reuse of textiles
before the advent of modern recycling systems.  In more detail, a rag store
might operate in the following ways:  1. **Collection**: The store collects used
clothing and textiles from individuals, donations, or other sources. This could
include everything from everyday clothing to linens and other fabric items.  2.
**Sorting**: Once collected, the items are sorted based on their condition,
type, and potential for resale or repurposing. Clothing in good condition might
be cleaned and sold as second-hand apparel, while items that are too worn might
be categorized as rags.  3. **Re

### Retrieval metrics:

## Cosine Similarity:

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def calculate_cosine_similarity(text1,text2):
    vectorizer = TfidfVectorizer(stop_words='english', # ignores common english words
                                 use_idf=True, # enables idf weighting
                                 norm = 'l2', # applies L2 normalization to each output vector
                                 ngram_range=(1,2), # consider both single and two-word combinations
                                 sublinear_tf= True, # applies log term frequency scaling (ex. replace tf with 1 + log(tf).)
                                 analyzer='word' # analyzes text at the word level
                                )
    tfidf = vectorizer.fit_transform([text1,text2]) # converts the query and record to vectors.
    similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])
    return similarity[0][0]

## Enhanced Similarity:

In [15]:
pip install "numpy<2" --no-build-isolation

Note: you may need to restart the kernel to use updated packages.


In [16]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:

import spacy
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from collections import Counter
import numpy as np

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def preprocess_text(text):
    doc = nlp(text.lower())
    lemmatized_words = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemmatized_words.append(token.lemma_)
    return lemmatized_words

def expand_with_synonyms(words):
    expanded_words = words.copy()
    for word in words:
        expanded_words.extend(get_synonyms(word))
    return expanded_words

def calculate_enhanced_similarity(text1, text2):
    # Preprocess and tokenize texts
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)

    # Expand with synonyms
    words1_expanded = expand_with_synonyms(words1)
    words2_expanded = expand_with_synonyms(words2)

    # Count word frequencies
    freq1 = Counter(words1_expanded)
    freq2 = Counter(words2_expanded)

    # Create a set of all unique words
    unique_words = set(freq1.keys()).union(set(freq2.keys()))

    # Create frequency vectors
    vector1 = [freq1[word] for word in unique_words]
    vector2 = [freq2[word] for word in unique_words]

    # Convert lists to numpy arrays
    vector1 = np.array(vector1)
    vector2 = np.array(vector2)

    # Calculate cosine similarity
    cosine_similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

    return cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hivagheisari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
def find_best_keymatch(query,records):
    best_score= 0
    best_record= None
    query_kw= set(query.lower().split())
    for record in records:
        record_kw= set(record.lower().split())
        common_kw= query_kw.intersection(record_kw)
        current_score= len(common_kw)
        if current_score > best_score:
            best_score= current_score
            best_record= record
    return best_score, best_record

In [18]:
best_keyword_score, best_matching_record = find_best_keymatch(query, db_records)

print(f"Best Keyword Score: {best_keyword_score}")
print_formatted_response(best_matching_record)

Best Keyword Score: 3
Response:
---------
A RAG vector store is a database or dataset that contains vectorized data
points.
-------



In [19]:
w= "A RAG vector store is a database or dataset that contains vectorized data points."

In [20]:
w in db_records

True

### Cosine Similarity metric:

In [21]:
calculate_cosine_similarity(query, best_matching_record)

0.12631460871586422

### Enhanced Similarity metric:

In [22]:
calculate_enhanced_similarity(query, best_matching_record)

0.641582812483307

### Augmentation:

In [23]:
augmented_input = query+ ": "+ best_matching_record
augmented_input

'define a rag store: A RAG vector store is a database or dataset that contains vectorized data points.'

In [24]:
print_formatted_response(augmented_input)

Response:
---------
define a rag store: A RAG vector store is a database or dataset that contains
vectorized data points.
-------



### Generation:

In [25]:
llm_response = call_llm_with_full_txt(augmented_input)
print_formatted_response(llm_response)

Response:
---------
Certainly! Let's break down the concept of a RAG vector store and understand its
components and purpose.  ### RAG Vector Store  **RAG** stands for **Retrieval-
Augmented Generation**. It is a framework used in natural language processing
(NLP) that combines the strengths of information retrieval and text generation.
The idea is to enhance the generation of text by retrieving relevant information
from a large dataset, which can then be used to produce more accurate and
contextually relevant responses.  ### Vector Store  A **vector store** is a
specialized database or dataset designed to store and manage vectorized data
points. In the context of machine learning and NLP, data is often represented as
vectors, which are numerical representations of information. These vectors can
capture semantic meanings, relationships, and other features of the data.  ####
Key Characteristics of a Vector Store:  1. **Vectorization**: Data points are
transformed into vectors using techn

### Advanced RAG:

## Vector Search:

The metrics are the same for both similarity methods as for naïve RAG.

In [26]:
def find_best_match(text_input, records):
    best_score =0
    best_record=None
    for record in records:
        current_score = calculate_cosine_similarity(text_input,record)
        if current_score > best_score:
            best_score= current_score
            best_record= record

        return best_score, best_record

In [27]:
best_similarity_score, best_matching_record = find_best_match(query, db_records)

In [28]:
print_formatted_response(best_matching_record)

Response:
---------
Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach
in the field of artificial intelligence, particularly within the realm of
natural language processing (NLP).
-------



In [29]:
best_similarity_score

0.04182819438170016

In [30]:
calculate_enhanced_similarity(query, best_matching_record)

0.3785786642252672

### Augmented input:
Let’s now augment the user query with this information retrieved.

In [31]:
augmented_input_= query+ ":"+ best_matching_record

In [32]:
print_formatted_response(augmented_input_)

Response:
---------
define a rag store:Retrieval Augmented Generation (RAG) represents a
sophisticated hybrid approach in the field of artificial intelligence,
particularly within the realm of natural language processing (NLP).
-------



### Generation

In [35]:
call_llm_with_full_txt(augmented_input_)

'Retrieval-Augmented Generation (RAG) is an advanced technique in artificial intelligence, specifically within the domain of natural language processing (NLP). It combines two key components: retrieval and generation, to enhance the performance of language models.\n\n1. **Retrieval Component**: This part of RAG involves searching for and retrieving relevant information from a large corpus or database. When a query or prompt is given, the retrieval system scans through a vast amount of data to find documents or pieces of information that are most relevant to the query. This is akin to how search engines work, where they pull up the most pertinent web pages based on a user\'s search terms.\n\n2. **Augmented Generation Component**: Once the relevant information is retrieved, the generation component comes into play. This involves using a language model to generate a response or output that is informed by the retrieved information. The generation process is "augmented" because it is not so

In [36]:
print_formatted_response(call_llm_with_full_txt(augmented_input_))

Response:
---------
Retrieval-Augmented Generation (RAG) is an advanced technique in artificial
intelligence, specifically within the domain of natural language processing
(NLP). It combines two key components: retrieval and generation, to enhance the
performance of language models.  1. **Retrieval Component**: This part of RAG
involves searching through a large corpus of documents or data to find relevant
information that can aid in generating a more accurate and contextually
appropriate response. The retrieval process is akin to how search engines work,
where the system identifies and extracts pertinent information from a vast
database based on the input query.  2. **Generation Component**: Once the
relevant information is retrieved, the generation component takes over. This
involves using a language model to generate a coherent and contextually relevant
response or output. The generation process leverages the retrieved information
to produce more informed and precise responses than 

## Index-Based Search:

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def set_up_vectorizer(records):
    vectorizer=TfidfVectorizer()
    tfidf_matrix= vectorizer.fit_transform(records)
    return vectorizer, tfidf_matrix

def find_best_match(query, vectorizer, tfidf_matrix):
    query_tfidf= vectorizer.transform([query])
    similarities= cosine_similarity(query_tfidf, tfidf_matrix)
    best_index= similarities.argmax()
    best_score= similarities[0, best_index]
    return best_score, best_index



vectorizer, tfidf_matrix= set_up_vectorizer(db_records)
best_similarity_score, best_index = find_best_match(query, vectorizer, tfidf_matrix)
best_matching_record = db_records[best_index]

In [51]:
print_formatted_response(best_matching_record)
find_best_match(query, vectorizer, tfidf_matrix)

Response:
---------
A RAG vector store is a database or dataset that contains vectorized data
points.
-------



(0.40746671865845496, 27)

### Augmented input:

In [46]:
augmented_input=query+": "+best_matching_record
print_formatted_response(augmented_input)

Response:
---------
define a rag store: A RAG vector store is a database or dataset that contains
vectorized data points.
-------



In [48]:
print_formatted_response(call_llm_with_full_txt(augmented_input))

Response:
---------
Certainly! Let's break down the concept of a RAG vector store and understand its
components in detail.  ### RAG (Retrieval-Augmented Generation)  RAG stands for
Retrieval-Augmented Generation, which is a framework used in natural language
processing (NLP) to enhance the generation of text by incorporating external
information retrieval. The idea is to improve the quality and relevance of
generated content by retrieving relevant information from a large dataset or
database before generating the final output. This approach is particularly
useful in tasks where the model needs to generate responses based on a vast
amount of information, such as question answering or conversational agents.  ###
Vector Store  A vector store is a specialized type of database or dataset that
stores data in the form of vectors. In the context of machine learning and NLP,
vectors are numerical representations of data points. These vectors are often
generated through a process called "embeddi

In [60]:
# Feature extraction

set_up_vectorizer(db_records)

(TfidfVectorizer(),
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 630 stored elements and shape (28, 297)>)

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(db_records)
vectorizer.get_feature_names_out()

array(['ability', 'access', 'accuracy', 'accurate', 'adapt', 'additional',
       'advancement', 'advice', 'algorithms', 'allows', 'also', 'amounts',
       'and', 'answers', 'approach', 'appropriate', 'architectures',
       'are', 'areas', 'artificial', 'as', 'associated', 'at',
       'augmented', 'available', 'based', 'be', 'behind', 'benefits',
       'bert', 'best', 'biases', 'bidirectional', 'both', 'built', 'but',
       'by', 'can', 'capabilities', 'capable', 'challenge', 'challenges',
       'coherent', 'combines', 'comes', 'complementing', 'complex',
       'complexity', 'component', 'comprehend', 'computational',
       'contained', 'contains', 'content', 'context', 'contextually',
       'core', 'corpus', 'correct', 'correctness', 'create', 'crucial',
       'current', 'data', 'database', 'databases', 'dataset', 'date',
       'deeply', 'depth', 'detailed', 'details', 'determination',
       'determined', 'direct', 'directly', 'documents', 'does', 'domains',
       'dynami

In [70]:
len(vectorizer.get_feature_names_out())

297

In [72]:
len(db_records)

28

In [73]:
tfidf_matrix.shape

(28, 297)

In [75]:
db_records[0]

'Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach in the field of artificial intelligence, particularly within the realm of natural language processing (NLP).'

In [74]:
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 630 stored elements and shape (28, 297)>
  Coords	Values
  (0, 222)	0.10091348627954093
  (0, 23)	0.1539082835028258
  (0, 109)	0.1464357841118198
  (0, 203)	0.16226182711181428
  (0, 214)	0.23182532487993252
  (0, 238)	0.26058218654196424
  (0, 117)	0.26058218654196424
  (0, 14)	0.26058218654196424
  (0, 119)	0.1464357841118198
  (0, 260)	0.16868957146941504
  (0, 95)	0.23182532487993252
  (0, 176)	0.2262032947934785
  (0, 19)	0.23182532487993252
  (0, 132)	0.23182532487993252
  (0, 186)	0.23182532487993252
  (0, 295)	0.26058218654196424
  (0, 207)	0.26058218654196424
  (0, 165)	0.23182532487993252
  (0, 141)	0.13967608617281693
  (0, 195)	0.26058218654196424
  (0, 171)	0.26058218654196424
  (1, 222)	0.09418296782117092
  (1, 109)	0.136669113825548
  (1, 260)	0.15743866421838287
  (1, 176)	0.21111645648195462
  :	:
  (26, 245)	0.17332710410898458
  (26, 6)	0.17332710410898458
  (26, 158)	0.17332710410898458
  (26, 255)	0.17

In [68]:
set_up_vectorizer(db_records)


TF-IDF Matrix:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 630 stored elements and shape (28, 297)>
  Coords	Values
  (0, 222)	0.10091348627954093
  (0, 23)	0.1539082835028258
  (0, 109)	0.1464357841118198
  (0, 203)	0.16226182711181428
  (0, 214)	0.23182532487993252
  (0, 238)	0.26058218654196424
  (0, 117)	0.26058218654196424
  (0, 14)	0.26058218654196424
  (0, 119)	0.1464357841118198
  (0, 260)	0.16868957146941504
  (0, 95)	0.23182532487993252
  (0, 176)	0.2262032947934785
  (0, 19)	0.23182532487993252
  (0, 132)	0.23182532487993252
  (0, 186)	0.23182532487993252
  (0, 295)	0.26058218654196424
  (0, 207)	0.26058218654196424
  (0, 165)	0.23182532487993252
  (0, 141)	0.13967608617281693
  (0, 195)	0.26058218654196424
  (0, 171)	0.26058218654196424
  (1, 222)	0.09418296782117092
  (1, 109)	0.136669113825548
  (1, 260)	0.15743866421838287
  (1, 176)	0.21111645648195462
  :	:
  (26, 245)	0.17332710410898458
  (26, 6)	0.17332710410898458
  (26, 158)	0.17332710410898458
