## Data Loading and Vector Store

In [1]:
import pandas as pd
import os

# Load the summarised code chunks (line range, path and inferred summary per row)
# and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index — confirm whether it should be read with `index_col=0`.
data = pd.read_csv('./output_dir/summarized_output.csv')
data.head()
# Next: pass these rows as chunks to an LLM to identify the files and lines that would be useful for system testing.


Unnamed: 0,line_numbers,path,main_function_inferred
0,1-91,vram\EmulatorPkg\Library\PlatformBmLib\Platfor...,Here is a one-line summary of the code snippet...
1,1-67,vram\EmulatorPkg\ThunkPpiToProtocolPei\ThunkPp...,Here is a one-line summary of the code snippet...
2,1-10,vram\EmulatorPkg\Library\SecPpiListLib\PpiList...,Here is a one-line summary of the code snippet...
3,1-137,vram\EmulatorPkg\Unix\Host\BerkeleyPacketFilter.c,Here is a one-line summary of the main functio...
4,138-290,vram\EmulatorPkg\Unix\Host\BerkeleyPacketFilter.c,Here is a one-line summary of the main functio...


In [2]:
# Remove every row whose path contains 'UnitTestFrameworkPkg'.
# A vectorised boolean mask replaces the row-by-row `iterrows` loop.
# `regex=False` keeps the literal-substring semantics of the `in` check, and
# `na=False` treats a missing path as "not a unit-test row" instead of raising.
is_unittest_row = data['path'].str.contains('UnitTestFrameworkPkg', regex=False, na=False)

# Kept for any later cell that inspects which rows were removed.
unittest_indices = data.index[is_unittest_row].tolist()

# Rebind instead of mutating in place; `.copy()` gives an independent frame.
data = data.loc[~is_unittest_row].copy()

In [3]:
# Persist the filtered frame without the row index; this file is what the CSV
# loader reads afterwards.
data.to_csv('./output_dir/summarized_output_filtered.csv', index=False)

# The shape must be the last expression of the cell to be displayed; a bare
# expression anywhere else in a cell produces no output.
data.shape

Let's turn this data into chunks (one document per CSV row).

In [4]:
# `langchain.document_loaders` is a deprecated import path; take the loader from
# `langchain_community`, like the other LangChain integrations in this notebook.
from langchain_community.document_loaders.csv_loader import CSVLoader

# Read the filtered CSV through the LangChain CSV loader.
# NOTE: `data` is rebound here from the pandas DataFrame to whatever
# `loader.load()` returns; the following cells use it in that form.
loader = CSVLoader(
    file_path="./output_dir/summarized_output_filtered.csv",
    encoding="utf-8",
    csv_args={'delimiter': ','},
)
data = loader.load()

In [5]:
# Number of items returned by `loader.load()`.
len(data)

12357

In [6]:
from tqdm.autonotebook import tqdm, trange
from langchain_community.embeddings import HuggingFaceEmbeddings
# Name of the model handed to HuggingFaceEmbeddings.
embedding_model = "all-MiniLM-L6-v2"
# Embedding object; it is passed to FAISS when the vector store is built.
embedding = HuggingFaceEmbeddings(model_name =embedding_model)

  from tqdm.autonotebook import tqdm, trange


In [7]:
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
# Build the FAISS vector store from the loaded documents using `embedding`.
vector_store = FAISS.from_documents(data, embedding)


In [8]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import networkx as nx
from scipy.spatial.distance import pdist, squareform

def girvan_newman_clustering(embeddings, k):
    """Split points into communities with the Girvan-Newman algorithm.

    A graph is built from the pairwise-distance matrix of ``embeddings`` and
    the Girvan-Newman generator is advanced ``k - 1`` times.

    Args:
        embeddings: 2-D array-like of points, one row per point (passed to
            ``scipy.spatial.distance.pdist``).
        k: Requested number of communities; must be at least 1.

    Returns:
        A tuple of sets of node indices (row positions in ``embeddings``).
        For ``k == 1``, or when the generator cannot split any further, the
        result may contain a different number of communities than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    communities = None
    with tqdm(total=3, desc="Clustering") as pbar:
        # Create a graph from the embeddings: the distance matrix is used as a
        # weighted adjacency matrix (zero entries produce no edge).
        # NOTE(review): this is a dense graph with roughly n^2 / 2 edges and
        # Girvan-Newman recomputes edge betweenness after every removal, so it
        # is only practical for small inputs. The default edge selection also
        # appears to ignore the distance weights — confirm against the
        # networkx documentation before relying on the clusters.
        dist_matrix = squareform(pdist(embeddings))
        G = nx.from_numpy_array(dist_matrix)
        pbar.update(1)

        # Perform Girvan-Newman clustering (lazy generator of partitions)
        comp = nx.community.girvan_newman(G)
        pbar.update(1)

        # Advance the generator k-1 times; stop early instead of leaking
        # StopIteration when no further split exists.
        with tqdm(total=k - 1, desc="Finding communities", leave=False) as inner:
            for _ in range(k - 1):
                level = next(comp, None)
                if level is None:
                    break
                communities = level
                inner.update(1)

        # k == 1 (or an unsplittable graph): fall back to the connected
        # components rather than returning an unbound name.
        if communities is None:
            communities = tuple(nx.connected_components(G))
        pbar.update(1)

    return communities

def plot_faiss_vector_store_clustered(vector_store, n_components=2, perplexity=30, n_iter=1000, n_clusters=5):
    """Project a FAISS vector store with t-SNE, cluster it and draw a scatter plot.

    Args:
        vector_store: LangChain FAISS vector store; its ``index``, ``docstore``
            and ``index_to_docstore_id`` attributes are read.
        n_components: Number of t-SNE output dimensions. Only the first two
            columns of the projection are plotted.
        perplexity: t-SNE perplexity.
        n_iter: Passed to ``TSNE`` as ``max_iter``.
        n_clusters: Number of communities requested from
            ``girvan_newman_clustering``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Retrieve embeddings and documents
    with tqdm(total=2, desc="Retrieving data") as pbar:
        n_vectors = vector_store.index.ntotal
        embeddings = vector_store.index.reconstruct_n(0, n_vectors)
        pbar.update(1)
        # Resolve documents through the index -> docstore-id mapping so that
        # docs[i] belongs to embeddings[i], instead of relying on the
        # insertion order of the docstore's private dict.
        docs = [
            vector_store.docstore.search(vector_store.index_to_docstore_id[i])
            for i in range(n_vectors)
        ]
        pbar.update(1)

    def _short_label(doc):
        """Return the last 10 characters of the document's second line (first line if there is only one)."""
        lines = doc.page_content.split('\n')
        chosen = lines[1] if len(lines) > 1 else lines[0]
        return chosen[-10:]

    # Extract short text snippets for labels
    texts = [_short_label(doc) for doc in tqdm(docs, desc="Extracting labels")]

    # Perform t-SNE
    with tqdm(total=1, desc="Performing t-SNE") as pbar:
        tsne = TSNE(n_components=n_components, perplexity=perplexity, max_iter=n_iter, random_state=42)
        embeddings_2d = tsne.fit_transform(embeddings)
        pbar.update(1)

    # Perform Girvan-Newman clustering on the projected points
    communities = girvan_newman_clustering(embeddings_2d, n_clusters)

    # Assign colors to communities. `plt.get_cmap` replaces `plt.cm.get_cmap`,
    # which was removed from recent Matplotlib releases. The palette is sized
    # for whichever is larger, the requested or the returned community count,
    # so indexing cannot go out of range.
    color_map = plt.get_cmap('viridis')
    n_colors = max(n_clusters, len(communities), 1)
    colors = [color_map(i / n_colors) for i in range(n_colors)]

    # Plot
    with tqdm(total=1, desc="Plotting") as pbar:
        fig, ax = plt.subplots(figsize=(12, 8))
        for i, community in enumerate(communities):
            members = list(community)
            ax.scatter(embeddings_2d[members, 0], embeddings_2d[members, 1],
                       c=[colors[i]], alpha=0.7, label=f'Cluster {i+1}')

        # Add labels for a few random points; a seeded generator keeps the
        # annotated points the same across runs, like the seeded t-SNE above.
        n_labels = min(10, len(texts))
        rng = np.random.default_rng(42)
        for i in rng.choice(len(texts), n_labels, replace=False):
            ax.annotate(texts[i], (embeddings_2d[i, 0], embeddings_2d[i, 1]), fontsize=8)

        ax.set_title(f"t-SNE visualization of FAISS vector store with Girvan-Newman clustering\n(perplexity={perplexity}, n_iter={n_iter}, n_clusters={n_clusters})")
        ax.set_xlabel("t-SNE feature 0")
        ax.set_ylabel("t-SNE feature 1")
        ax.legend()

        fig.tight_layout()
        pbar.update(1)

    plt.show()

# Usage: plot the vector store, requesting 20 clusters.
plot_faiss_vector_store_clustered(vector_store, n_clusters=20)

Retrieving data:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting labels:   0%|          | 0/12357 [00:00<?, ?it/s]

Performing t-SNE:   0%|          | 0/1 [00:00<?, ?it/s]

Clustering:   0%|          | 0/3 [00:00<?, ?it/s]

Finding communities:   0%|          | 0/4 [00:00<?, ?it/s]

## DDC

In [34]:
# Model identifiers to choose from; a later cell selects one by list position.
available_models = [
    "mixtral-8x7b-instruct-v01", 
    "gemma-7b-it", 
    "mistral-7b-instruct-v02", 
    "llama-2-70b-chat", 
    "phi-3-mini-128k-instruct", 
    "llama-3-8b-instruct"]

In [None]:
from langchain_openai import ChatOpenAI, OpenAI
import httpx
from dotenv import load_dotenv
# Load variables from a .env file, then read the API key and endpoint URL.
load_dotenv()
api_key = os.getenv('API_KEY')
base_url = os.getenv('API_URL')
# NOTE(review): verify=False turns off TLS certificate verification for this
# client — confirm this is intentional for the target endpoint.
http_client = httpx.Client(verify=False)
# Position 5 in `available_models` is "llama-3-8b-instruct".
model_selected = available_models[5]
# NOTE(review): not referenced in the visible part of the notebook — presumably
# a placeholder; confirm before removing.
global_texts = None
# Chat client configured with the endpoint, model, HTTP client and key above.
langchain_llm = ChatOpenAI(
    base_url=base_url,
    model=model_selected,
    http_client=http_client,
    api_key=api_key
)

## GROQ

In [None]:
from langchain_groq import ChatGroq
# Fail fast when the key is missing. Writing `str(os.getenv("GROQ_API_KEY"))`
# back into the environment would store the literal string "None" for an unset
# variable, hiding the misconfiguration until the first API call fails.
if not os.getenv("GROQ_API_KEY"):
    raise EnvironmentError(
        "GROQ_API_KEY is not set; export it or add it to the .env file."
    )

# NOTE(review): verify=False turns off TLS certificate verification for this
# client — confirm this is intentional.
http_client = httpx.Client(verify=False)

# Groq chat model used by the retrieval chain; the key is read from the
# GROQ_API_KEY environment variable checked above.
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature=0.3,
    http_client=http_client,
    max_tokens=5000,
)

In [None]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

## Running the Chain

In [None]:
# Prompt for the retrieval chain: asks for a list of JSON objects with the keys
# "coverage", "path" and "line-numbers", and tells the model not to use files
# with 'UnitTest' or 'Sample' in their name.
query = """Your task is to provide me with line-numbers and paths of sections that would be useful for system-level testing you can decide what type of system-level testing you can perform avoid unittesting at all costs.
            The output should be in a list of JSON format with keys as:
            1) coverage: this represents what all file_names would be covered in the test.
            2) path: the path of the all important files.
            3) line-numbers: the line number range that I should visit to utilize to conduct these tests.
            Note: DO NOT USE THE FILES WHICH HAVE 'UnitTest' OR 'Sample' AS A PART OF THEIR NAME.
            Give me as many JSON outputs as you can to cover the whole content."""

In [None]:
# Assemble a "stuff"-type RetrievalQA chain over the vector store's default
# retriever, run the query through it and print the answer text.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)
result = qa_chain.invoke(query)
print(result['result'])


Based on the provided context, I'll provide you with some suggestions for system-level testing. Since the context only provides information about the MtrrLib library, I'll focus on that. Here are some JSON outputs that might be useful for system-level testing:

**Test 1: MTRR Configuration**
```json
{
  "coverage": ["MtrrLib"],
  "path": ["vram/UefiCpuPkg/Library/MtrrLib/"],
  "line-numbers": ["10-20"]
}
```
This test would cover the MtrrLib library's configuration and setup, focusing on the lines that initialize and set up the MTRR settings.

**Test 2: Memory Attribute Management**
```json
{
  "coverage": ["MtrrLib"],
  "path": ["vram/UefiCpuPkg/Library/MtrrLib/"],
  "line-numbers": ["30-40"]
}
```
This test would cover the MtrrLib library's memory attribute management, focusing on the lines that set and get memory attributes in the MTRR settings.

**Test 3: MTRR Register Access**
```json
{
  "coverage": ["MtrrLib"],
  "path": ["vram/UefiCpuPkg/Library/MtrrLib/"],
  "line-numbers": ["