# RAG Based Llamas

## Data Retrieval

In [3]:
import json

In [4]:
def load_json_for_retrieval(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

data = load_json_for_retrieval('Dataset/output_data.json')


In [5]:
data

[{'code': 'def Make Suite From Dict d label None suite Suite label label suite Set Dict d suite Normalize return suite',
  'question': 'What does the code make ?',
  'answer': 'a suite'},
 {'code': 'def Make Suite From Dict d label None suite Suite label label suite Set Dict d suite Normalize return suite',
  'question': 'Does the code make a suite ?',
  'answer': 'Yes'},
 {'code': "def receive message topic name subscription name pubsub client pubsub Client topic pubsub client topic topic name subscription topic subscription subscription name results subscription pull return immediately True print ' Received{}messages ' format len results for ack id message in results print '*{} {} {}' format message message id message data message attributes if results subscription acknowledge [ack id for ack id message in results]",
  'question': 'Does the code receive a message from a pull subscription ?',
  'answer': 'Yes'},
 {'code': "def receive message topic name subscription name pubsub client

In [6]:
from dotenv import load_dotenv

load_dotenv()

True

In [7]:
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

NameError: name 'Groq' is not defined

In [8]:
import os
import json
from groq import Groq  # Assuming Groq supports custom context or has a mechanism to include it

# Load the JSON data into memory
def load_json_data(json_path):
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Simple retrieval based on substring matching in questions
def retrieve_documents(query, data):
    # Find documents where the query matches part of the question
    matches = [entry for entry in data if query.lower() in entry['question'].lower()]
    # Extract text for each match to form the context
    documents = ["Code: {}\nQuestion: {}\nAnswer: {}".format(entry['code'], entry['question'], entry['answer']) for entry in matches]
    return documents

# Generate a response using the Groq API
def generate_response(query, documents):
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    combined_input = " ".join(documents) + " " + query  # Combine documents and query
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": combined_input,
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return response.choices[0].message.content

# Main execution
if __name__ == "__main__":
    # Define the paths and load data
    json_path = 'Dataset/output_data.json'
    data = load_json_data(json_path)

    # Example usage
    user_query = "What be an explicit budget used only?"
    relevant_documents = retrieve_documents(user_query, data)
    response = generate_response(user_query, relevant_documents)
    print(relevant_documents)


[]


## Cosine Similarity

In [9]:
!pip install numpy scikit-learn


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_tfidf_vectors(data):
    documents = ["{} {}".format(entry['question'], entry['code']) for entry in data]  # Combining code and question for context
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer

def retrieve_documents(query, data, tfidf_matrix, vectorizer):
    # Vectorize the query using the existing vectorizer
    query_vec = vectorizer.transform([query])
    # Calculate cosine similarity between the query vector and all document vectors
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Get the indices of the documents sorted by similarity (descending)
    indices = np.argsort(-similarities)  # Negating similarities to sort in descending order
    # Collect top N similar documents
    top_n = 5  # You can adjust the number of documents you want to retrieve
    matched_documents = [(data[i], similarities[i]) for i in indices[:top_n] if similarities[i] > 0]
    documents = ["Code: {}\nQuestion: {}\nAnswer: {}\nSimilarity: {:.2f}".format(doc[0]['code'], doc[0]['question'], doc[0]['answer'], doc[1]) for doc in matched_documents]
    return documents

# Assuming you have loaded your data into `data` list
tfidf_matrix, vectorizer = create_tfidf_vectors(data)
query = "What does the code make?"
documents = retrieve_documents(query, data, tfidf_matrix, vectorizer)
for doc in documents:
    print(doc)


Code: def all index generator k 10 all make index funcs [make Int Index make Float Index make String Index make Unicode Index make Date Index make Period Index make Timedelta Index make Bool Index make Categorical Index]for make index func in all make index funcs yield make index func k k
Question: For what purpose can generator be iterated ?
Answer: to get instances of all the various index classes
Similarity: 0.51
Code: def plugin return Select Attr
Question: What does the code make ?
Answer: plugin available
Similarity: 0.47
Code: def plugin return Tag Name Select
Question: What does the code make ?
Answer: plugin available
Similarity: 0.47
Code: def all timeseries index generator k 10 make index funcs [make Date Index make Period Index make Timedelta Index]for make index func in make index funcs yield make index func k k
Question: What do all the classes represent ?
Answer: time - seires
Similarity: 0.47
Code: def all timeseries index generator k 10 make index funcs [make Date Inde

In [11]:
def load_json_data(json_path):
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def create_tfidf_vectors(data):
    # Combining code and question for more contextual vectorization
    documents = [entry['question'] + " " + entry['code'] for entry in data]
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    return vectorizer, tfidf_matrix


In [12]:
def retrieve_documents(query, data, vectorizer, tfidf_matrix):
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_n_indices = np.argsort(-similarities)[:5]  # Retrieves the top 5 documents
    return [data[i] for i in top_n_indices if similarities[i] > 0]
