# Setup

In [None]:
import pandas as pd
import numpy as np
import os 
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random
import time
import logging
from tenacity import retry, wait_exponential, stop_after_attempt
from langchain_groq import ChatGroq

In [None]:
# Text dump that gets parsed, and the directory that receives the generated CSV files.
data_file = "output2.txt"
output_dir = Path("./output_dir/")

In [None]:
import re
def parse_output_file(filepath):
    """Split a concatenated dump into per-file names and contents.

    Every occurrence of ``C:/`` up to the end of its line is treated as a
    file header. The name kept for a file is the last ``/``-separated part
    of that header; its content is the text between the end of the header
    and the start of the next header (or the end of the dump for the last
    file).

    Args:
        filepath: Path of the UTF-8 text dump to read.

    Returns:
        A ``(paths, contents)`` tuple of two lists of equal length. Both are
        empty when the dump contains no header.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    headers = list(re.finditer(r'C:/.*', text))
    if not headers:
        return [], []

    paths = [header.group(0).split('/')[-1] for header in headers]

    # Content of every file except the last: from the end of its header to
    # the start of the following header.
    contents = [
        text[current.end():following.start()]
        for current, following in zip(headers, headers[1:])
    ]
    # The last file has no following header; it runs to the end of the dump.
    # Without this the final file is dropped once paths and contents are zipped.
    contents.append(text[headers[-1].end():])

    return paths, contents
# Smoke check: parse the dump once and print the first file name together
# with the first 10 characters of its content.
paths,contents = parse_output_file(data_file)
print(paths[0],contents[0][:10])

In [None]:
from langchain.schema import Document
class CustomTextLoader(TextLoader):
    """TextLoader variant that yields one Document per file found in the dump.

    The dump is split with ``parse_output_file``; each Document carries the
    stripped file content and the file name under the ``"source"`` metadata key.
    """

    def __init__(self, file_path, **kwargs):
        super().__init__(file_path, **kwargs)
        self.file_path = file_path

    def load(self):
        """Return the list of Documents parsed from ``self.file_path``."""
        names, bodies = parse_output_file(self.file_path)
        return [
            Document(page_content=body.strip(), metadata={"source": name})
            for name, body in zip(names, bodies)
        ]
# Load one Document per file from the dump.
loader = CustomTextLoader(data_file, encoding='utf-8')
documents = loader.load()

# Split the documents into chunks of at most 5000 characters (measured with
# len) with a 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap = 100,
    length_function = len,
    is_separator_regex=False
)
pages = splitter.split_documents(documents)
print(len(pages))
# NOTE(review): only chunks 10000..10099 are kept. When the split produces
# fewer than 10004 chunks, the pages[3] access below raises IndexError.
pages = pages[10000:10100]  
print(pages[3].page_content[:10])

In [None]:
import httpx
import os

# Fail fast when the Groq credential is missing. Writing
# str(os.getenv("GROQ_API_KEY")) back into the environment would store the
# literal string "None" for an unset key and hide the problem until the
# first request is rejected.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this notebook.")

# NOTE(review): verify=False switches off TLS certificate verification for
# every request made through this client. Keep it only if the network setup
# truly requires it.
http_client = httpx.Client(verify=False)
# NOTE(review): max_tokens=100 caps every completion; JSON answers longer
# than that will arrive cut off — presumably why trim_incomplete_json exists.
llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.3,
    http_client=http_client,
    max_tokens=100,
    # other params...
)
logging.basicConfig(level=logging.INFO)

In [None]:
import sys
import json
from yachalk import chalk
from langchain_openai import ChatOpenAI,OpenAI


# Creating all the utility functions

In [None]:


# Make modules that live in the parent directory importable from this notebook.
sys.path.append("..")

# NOTE(review): no ChatOpenAI client is created in this cell; the helpers
# below call the module-level `llm` object.

def trim_incomplete_json(json_string):
    """Salvage the complete objects from a cut-off JSON array.

    The text is cut right after the last ``}`` that is followed by ``]`` or
    ``,`` and a closing ``]`` is appended. Anything in front of the first
    ``[`` (an introductory sentence, a code fence) is removed as well, since
    it would keep the result from parsing.

    Args:
        json_string: Raw model output that failed to parse as JSON.

    Returns:
        A string holding a JSON array, or ``'[]'`` when no complete object
        is found.
    """
    # Position of the last '}' that closes a complete array element.
    last_complete = max(json_string.rfind('}]'), json_string.rfind('},'))
    if last_complete == -1:
        return '[]'

    # Skip the preamble only when the opening bracket really precedes the
    # last complete object; otherwise keep the text from its beginning.
    first_bracket = json_string.find('[')
    start = first_bracket if 0 <= first_bracket < last_complete else 0

    # The slice always ends on '}', so the closing bracket is always needed.
    return json_string[start:last_complete + 1] + ']'

def extract_concepts(prompt: str, metadata: dict = None) -> list:
    """Ask the LLM for the key concepts in ``prompt`` and tag them with metadata.

    Args:
        prompt: Text chunk sent as the user message.
        metadata: Extra key/value pairs merged into every returned concept
            (its keys win over keys produced by the model). Defaults to no
            extra keys.

    Returns:
        A list of concept dicts, or ``None`` when the answer cannot be parsed
        into JSON objects even after trimming.
    """
    # None instead of a shared mutable default.
    metadata = {} if metadata is None else metadata

    SYS_PROMPT = (
        "Your task is to extract the key concepts (and non-personal entities) mentioned in the given context. "
        "Extract only the most important and atomistic concepts, breaking them down into simpler concepts if needed. "
        "Categorize the concepts into one of the following categories: "
        "[import statement, concept, function definition, object-calling, document, class-definition, condition, misc].\n"
        "Format your output as a list of JSON objects in the following format:\n"
        "[\n"
        "   {\n"
        '       "entity": "The Concept",\n'
        '       "importance": "The contextual importance of the concept on a scale of 1 to 5 (5 being the highest)",\n'
        '       "category": "The Type of Concept"\n'
        "   },\n"
        "   {...}\n"
        "]"
    )

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": prompt}
    ]
    
    response = llm.invoke(input=messages).content
    print("Extract Prompt ", response)
    try:
        result = json.loads(response)
    except json.JSONDecodeError:
        print("\n\nWARNING ### Incomplete JSON detected. Attempting to trim...")
        trimmed_response = trim_incomplete_json(response)
        print(trimmed_response+"\n#####################################################################################################")
        try:
            result = json.loads(trimmed_response)
        except json.JSONDecodeError:
            print("\n\nERROR ### Failed to parse even after trimming. Here is the buggy response: ", response, "\n\n")
            return None

    # The model sometimes answers with one object instead of a list.
    if isinstance(result, dict):
        result = [result]
    # Any other JSON value (null, string, number) carries no concepts.
    if not isinstance(result, list):
        return None

    # Skip list elements that are not objects; they cannot take metadata.
    return [dict(item, **metadata) for item in result if isinstance(item, dict)]

def graph_prompt(input_text: str, metadata: dict = None) -> list:
    """Ask the LLM for term pairs and their relation found in ``input_text``.

    Args:
        input_text: Text chunk; it is wrapped in triple backticks inside the
            user message.
        metadata: Extra key/value pairs merged into every returned edge (its
            keys win over keys produced by the model). Defaults to no extra
            keys.

    Returns:
        A list of edge dicts (the prompt asks for ``node_1``, ``node_2`` and
        ``edge`` keys), or ``None`` when the answer cannot be parsed into
        JSON objects even after trimming.
    """
    # None instead of a shared mutable default.
    metadata = {} if metadata is None else metadata

    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts according to the context.\n"
        "Thought 1: While traversing through each sentence, think about whether Data is being passed to it\n"
        "\tTerms may include object creation, entity, class definition, import file, function signature, \n"
        "\tcondition, parameters, documents, service, concept, etc.\n"
        "\tTerms should be as concise as possible but ignore vague definitions\n\n"
        "Thought 2: Think about how these terms can have one-on-one relations with other terms.\n"
        "\tTerms mentioned in the same code or file are typically related to each other.\n"
        "\tTerms can be related to many other terms.\n\n"
        "Thought 3: Determine the relation between each related pair of terms.\n\n"
        "Format your output as a list of JSON objects. Each element of the list contains a pair of terms do not provide an explanation, JUST THE JSON OUTPUT "
        "and the relationship between them, as follows:\n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from the extracted ontology",\n'
        '       "node_2": "A related concept from the extracted ontology",\n'
        '       "edge": "The relationship between node_1 and node_2 in one or two sentences"\n'
        "   },\n"
        "   {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input_text}``` \n\n output: "

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": USER_PROMPT}
    ]

    response = llm.invoke(input=messages).content
    try:
        result = json.loads(response)
    except json.JSONDecodeError:
        # The answer may be cut off or wrapped in extra text; try to salvage it.
        trimmed_response = trim_incomplete_json(response)
        try:
            result = json.loads(trimmed_response)
        except json.JSONDecodeError:
            print("\n\nERROR ### Failed to parse even after trimming. Here is the buggy response: ")
            return None

    # The model sometimes answers with one object instead of a list.
    if isinstance(result, dict):
        result = [result]
    # Any other JSON value (null, string, number) carries no edges.
    if not isinstance(result, list):
        return None

    # Skip list elements that are not objects; they cannot take metadata.
    return [dict(item, **metadata) for item in result if isinstance(item, dict)]

## Dataframe and graph manipulation

In [None]:
import uuid
import pandas as pd
import numpy as np


def documents2Dataframe(documents) -> pd.DataFrame:
    """Build one dataframe row per document chunk.

    Args:
        documents: Iterable of objects exposing ``page_content`` and a
            ``metadata`` dict that contains a ``"source"`` key.

    Returns:
        A dataframe with ``text``, ``path``, every metadata key, and a fresh
        random ``chunk_id`` (uuid4 hex) per row.

    Raises:
        KeyError: If a chunk's metadata has no ``"source"`` entry.
    """
    # Built in one pass; growing the list with `rows = rows + [row]` copies
    # it on every iteration.
    rows = [
        {
            "text": chunk.page_content,
            "path": chunk.metadata["source"],
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)


def df2ConceptsList(dataframe: pd.DataFrame) -> list:
    """Run concept extraction on every row and flatten the results.

    Args:
        dataframe: Frame with ``text`` and ``chunk_id`` columns.

    Returns:
        One flat list with the concept dicts of all rows. Rows whose
        extraction returned ``None`` are skipped; the list is empty when no
        row produced concepts.
    """
    # Nothing to send to the LLM.
    if dataframe.empty:
        return []

    results = dataframe.apply(
        lambda row: extract_concepts(
            row.text, {"chunk_id": row.chunk_id, "type": "concept"}
        ),
        axis=1,
    )
    # Unparseable answers come back as None and are dropped here.
    results = results.dropna()

    # Flatten the list of lists. Unlike np.concatenate this also works when
    # every row failed and there is nothing left to join.
    return [concept for concepts in results for concept in concepts]


def concepts2Df(concepts_list) -> pd.DataFrame:
    """Turn a list of concept dicts into a dataframe with lower-cased entities.

    Cells equal to a single space are treated as missing, and rows without
    an ``entity`` value are removed.
    """
    def _lowercase(value):
        return value.lower()

    frame = pd.DataFrame(concepts_list)
    frame = frame.replace(" ", np.nan)
    frame = frame.dropna(subset=["entity"])
    frame["entity"] = frame["entity"].apply(_lowercase)
    return frame


def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Extract graph edges for every row and return them as one flat list.

    Each row's ``text`` goes to ``graph_prompt`` together with its
    ``chunk_id`` and ``path``. A progress line with elapsed and estimated
    remaining time is rewritten in place after every row. ``model`` is
    accepted but not used.
    """
    row_count = len(dataframe)
    started_at = time.time()
    done = 0

    def _extract(row):
        nonlocal done
        edges = graph_prompt(row.text, {"chunk_id": row.chunk_id,"path":row.path})
        done += 1
        elapsed = time.time() - started_at
        per_row = elapsed / done
        remaining = (row_count - done) * per_row
        progress = (
            f"\rProcessing: {done}/{row_count} rows | "
            f"Elapsed: {elapsed:.2f}s | "
            f"Estimated time remaining: {remaining:.2f}s"
        )
        print(progress, end="", flush=True)
        return edges

    per_row_edges = dataframe.apply(_extract, axis=1)

    print("\nProcessing complete!")

    # Rows whose answer could not be parsed yield None and are skipped.
    flattened = []
    for edges in per_row_edges:
        if edges is None:
            continue
        flattened.extend(edges)
    return flattened

def graph2Df(nodes_list) -> pd.DataFrame:
    """Turn a list of edge dicts into a dataframe with lower-cased node names.

    Cells equal to a single space are treated as missing, and rows lacking
    ``node_1`` or ``node_2`` are removed.
    """
    def _lowercase(value):
        return value.lower()

    frame = pd.DataFrame(nodes_list)
    frame = frame.replace(" ", np.nan)
    frame = frame.dropna(subset=["node_1", "node_2"])
    for column in ("node_1", "node_2"):
        frame[column] = frame[column].apply(_lowercase)
    return frame

In [None]:

# One row per retained chunk, each with its text, source path and a generated chunk_id.
df = documents2Dataframe(pages)

# Actual processing

In [None]:
## True: rebuild the edge list with the LLM and save it.
## False: reload the edge list saved by an earlier run.
regenerate = True

if not regenerate:
    dfg1 = pd.read_csv(output_dir/"graph.csv", sep="|")
else:
    concepts_list = df2Graph(df)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dfg1.to_csv(output_dir/"graph.csv", sep="|", index=False)
    df.to_csv(output_dir/"chunks.csv", sep="|", index=False)

## Empty strings count as missing; edges need both nodes and a relation text.
dfg1 = dfg1.replace("", np.nan)
dfg1 = dfg1.dropna(subset=["node_1", "node_2", 'edge'])
## LLM-extracted relations get a weight of 4.
## Relations found later through contextual proximity are counted per co-occurrence instead.
dfg1['count'] = 4


## Connecting nodes that share contextual proximity

Adding a count to each edge to represent the strength of the relationship.

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link terms that occur in the same text chunk.

    Args:
        df: Edge frame with ``chunk_id``, ``path``, ``node_1`` and ``node_2``
            columns.

    Returns:
        A new frame with columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined), ``count``, ``path`` (first seen) and ``edge`` (always
        ``"contextual proximity"``). Pairs seen only once are left out. The
        input frame is not modified.
    """
    # Melt the dataframe into a list of (chunk, path, node) rows.
    dfg_long = pd.melt(
        df, id_vars=["chunk_id", "path"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join on chunk id and path pairs every term with every term of the same chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on=["chunk_id", "path"], suffixes=("_1", "_2"))

    # Drop self loops.
    dfg2 = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)

    # Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"], "path": "first"})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count", "path"]
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with a count of 1. The explicit copy makes the column
    # assignment below act on an independent frame instead of a slice.
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"
    return dfg2


# Proximity edges derived from the LLM-extracted edge list.
dfg2 = contextual_proximity(dfg1)

In [None]:
# Stack the LLM-extracted edges and the proximity edges, then collapse
# duplicate (node_1, node_2) pairs: ids and relation texts are comma-joined
# and the counts are summed.
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({
        "chunk_id": ",".join,
        "edge": ",".join,
        "count": "sum",
        # Unique paths, sorted so the joined string is the same on every run
        # (iterating a bare set gives no stable order).
        "path": lambda x: ",".join(sorted(set(x)))
    })
    .reset_index()
)


In [None]:
# Distinct node labels that appear on either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
# Notebook cell output: the number of distinct nodes.
nodes.shape

# Drawing the graph

In [None]:
import networkx as nx
G = nx.Graph()

## Register every node label, converted to a string
for label in nodes:
    G.add_node(str(label))

## One edge per aggregated relation: the relation text becomes the title,
## a quarter of the summed count becomes the weight
for _, relation in dfg.iterrows():
    first_node = str(relation["node_1"])
    second_node = str(relation["node_2"])
    G.add_edge(
        first_node,
        second_node,
        title=relation["edge"],
        weight=relation['count']/4,
    )

In [None]:
# Community detection with Girvan-Newman. The generator is advanced twice and
# the second result is the one used below; top_level_communities is not read
# again in the visible code.
# NOTE(review): each next() raises StopIteration if the generator has nothing
# more to yield — confirm the graph is large enough for two levels.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Sort the members inside each community, then sort the communities themselves.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

In [None]:
import seaborn as sns
# Name of the seaborn palette that colors2Community passes to sns.color_palette.
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community a color and a 1-based group number.

    One hex color per community is taken from the module-level ``palette``
    and the colors are shuffled before being handed out. The returned frame
    has one row per node with ``node``, ``color`` and ``group`` columns.
    """
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    # Colors are handed out from the end of the shuffled list.
    assignments = zip(communities, reversed(shades))
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (community, shade) in enumerate(assignments, start=1)
        for member in community
    ]
    return pd.DataFrame(records)


# Per-node color and group table for the detected communities.
colors = colors2Community(communities)


In [None]:
# Copy group and color onto each graph node and size the node by its degree.
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [None]:
import networkx as nx
from pyvis.network import Network


k_value = 2  # Adjust this value to change node spacing
# Initial node positions from the NetworkX spring layout.
pos = nx.spring_layout(G, k=k_value, iterations=50)


net = Network(notebook=False, cdn_resources="remote", height="1200px", width="1200px")


# Add every node at its layout position (scaled by 200) together with the
# attributes stored on the graph node.
for node, (x, y) in pos.items():
    net.add_node(
        node,
        x=x * 200, 
        y=y * 200,
        physics=True,  
        **G.nodes[node] 
    )

# Add edges to the Pyvis network
for source, target, edge_attrs in G.edges(data=True):
    edge_data = edge_attrs.copy()
    # The edge weight is also passed as 'value'.
    edge_data['value'] = edge_data['weight']
    net.add_edge(
        source,
        target,
        **edge_data
    )

# Physics is switched ON here (toggle_physics(True)), and the nodes above are
# added with physics=True as well.
# NOTE(review): an earlier comment said physics was disabled to keep the
# NetworkX layout — confirm which behaviour is wanted.
net.toggle_physics(True)

# Save the network
# NOTE(review): presumably requires the ./docs directory to exist — confirm.
net.show("./docs/index.html", notebook=False)

# Making a Final Dataframe

In [None]:
# Lookup from node label to its community group number.
community_map = colors.set_index('node')['group'].to_dict()


# Add the community of each edge endpoint; a node missing from the lookup
# gets NaN from Series.map.
dfg['community_node_1'] = dfg['node_1'].map(community_map)
dfg['community_node_2'] = dfg['node_2'].map(community_map)


In [None]:
# Number of edge rows per community of the first endpoint (NaN counted too).
community_count_node_1 = dfg['community_node_1'].value_counts(dropna=False)

# Number of edge rows per community of the second endpoint (NaN counted too).
community_count_node_2 = dfg['community_node_2'].value_counts(dropna=False)

# Output the results
# print("Community counts for 'community_node_1':")
# print(community_count_node_1)
# print("\nCommunity counts for 'community_node_2':")
# print(community_count_node_2)

In [None]:
# Save the edge list, including the community columns, as a comma-separated
# file in the working directory.
dfg.to_csv('updated_graph.csv', index=False)

# Creating an Agent

## Testing groq

In [None]:
import os
# SECURITY: the Groq API key must come from the environment, never from the
# notebook source. A key that was ever stored in this file should be treated
# as leaked and revoked.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")

In [None]:
from langchain_groq import ChatGroq
import httpx
# NOTE(review): verify=False switches off TLS certificate verification for
# every request made through this client.
http_client = httpx.Client(verify=False)
# Rebinds the module-level `llm`; code executed after this cell uses this
# client (different model, 1000-token cap, 30 s timeout, 2 retries).
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0.1,
    http_client=http_client,
    max_tokens=1000,
    timeout=30,
    max_retries=2,
    # other params...
)

In [None]:
from langchain.agents.agent_types import AgentType
from langchain.agents.initialize import initialize_agent
from langchain.tools import Tool
from langchain_core.prompts import PromptTemplate
import pandas as pd
import os
import ast
import re
import time
import logging
from tenacity import retry, wait_exponential, stop_after_attempt

logging.basicConfig(level=logging.INFO)

# Load the CSV file
# This rebinds `df`, which earlier held the chunk dataframe.
try:
    df = pd.read_csv('summary_output_filtered.csv')
    # 'line_numbers' is split at '-' into start_line / end_line; the pieces
    # stay text, they are not converted to integers here.
    df[['start_line', 'end_line']] = df['line_numbers'].str.split('-', expand=True)
except Exception as e:
    # Any failure (missing file, missing column, ...) is only printed.
    print(f"Error loading CSV file: {e}")
    df = pd.DataFrame()  # Create an empty DataFrame if file loading fails

# Function to get a section of the CSV
def get_csv_section(df, start_row, end_row):
    """Return rows ``start_row`` up to (not including) ``end_row`` as dicts.

    The ``chunk_id`` column is left out of the records when it is present.

    Args:
        df: Source dataframe.
        start_row: Positional index of the first row.
        end_row: Positional index one past the last row.

    Returns:
        A list with one dict per row; an empty list if the section cannot
        be produced (the error is printed).
    """
    try:
        # errors='ignore': a frame without chunk_id still yields its rows
        # instead of turning every section into an empty list.
        df_filtered = df.drop(columns=['chunk_id'], errors='ignore')
        return df_filtered.iloc[start_row:end_row].to_dict(orient='records')
    except Exception as e:
        print(f"Error getting CSV section: {e}")
        return []

# Custom tool to read file contents
def read_file_contents(file_path, start_line=1, end_line=None, max_chars=5000):
    """Read a range of lines from a file below ``./TestRepositories``.

    Args:
        file_path: Path relative to ``<cwd>/TestRepositories``; surrounding
            quote characters are stripped.
        start_line: 1-based first line to return (int or numeric string).
            Defaults to the first line, so the function can be called with
            the path alone.
        end_line: 1-based last line to return, inclusive (int or numeric
            string). ``None`` reads to the end of the file.
        max_chars: Upper bound on the length of the returned text; ``None``
            disables the limit.

    Returns:
        The requested text, or a message starting with ``"File not found:"``
        or ``"Error reading file:"``. Errors are reported in the return
        value, never raised.
    """
    try:
        # NOTE(review): file_path may come from LLM output; an absolute path
        # or '..' segments can point outside TestRepositories.
        file_path = file_path.strip("'\"")
        full_path = os.path.join(os.getcwd(), 'TestRepositories', file_path)
        full_path = os.path.normpath(full_path)

        if not os.path.exists(full_path):
            return f"File not found: {full_path}"

        # Line numbers may arrive as text (e.g. split out of a CSV column).
        first = max(int(start_line), 1)
        last = None if end_line is None else int(end_line)

        with open(full_path, 'r', encoding='utf-8', errors='ignore') as file:
            logging.info(f"Reading file: {full_path}")
            lines = file.readlines()

        snippet = "".join(lines[first - 1:last])
        return snippet[:max_chars]  # Truncate to max_chars characters
    except Exception as e:
        return f"Error reading file: {e}"

# Expose read_file_contents to the agent under the name "FileReader".
# NOTE(review): presumably a Tool hands the agent's action input to `func` as
# one string — confirm read_file_contents can be called with the path alone.
file_reader_tool = Tool(
    name="FileReader",
    func=read_file_contents,
    description="Reads the contents of a file given its path"
)

def write_markdown_section(identified_nodes, test_scripts, output_file="output1.md", append=False):
    """Write identified node pairs and generated scripts to a Markdown file.

    Args:
        identified_nodes: Iterable of dicts with ``node_1``, ``node_2`` and
            ``edge`` keys.
        test_scripts: Iterable of script texts; each goes into its own
            fenced ``python`` block.
        output_file: Target file path.
        append: When true the section is added to the existing file;
            otherwise the file is rewritten and starts with the report title.
    """
    mode = "a" if append else "w"
    # Explicit UTF-8: generated scripts may hold characters that the
    # platform default encoding cannot represent.
    with open(output_file, mode, encoding="utf-8") as f:
        if not append:
            f.write("# LangChain Test Automation Results\n\n")

        f.write("## Identified Chunks\n\n")
        for node in identified_nodes:
            f.write(f"- Coverage_file 1: {node['node_1']}\n")
            f.write(f"  Coverage_file 2: {node['node_2']}\n")
            f.write(f"  Task: {node['edge']}\n")

        f.write("## Generated Test Scripts\n\n")
        for i, script in enumerate(test_scripts, 1):
            f.write(f"### Script {i}\n\n")
            f.write("```python\n")
            f.write(script)
            f.write("\n```\n\n")
            f.write("---\n\n")

# Custom tool to generate test automation scripts
@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3), reraise=True)
def _invoke_test_script_llm(prompt_text):
    """Send the prompt to the LLM and return the reply text; failures propagate so they are retried."""
    return llm.invoke(prompt_text).content


def generate_test_script(inputs):
    """Generate a test automation script for one graph edge.

    Args:
        inputs: String form of a dict literal with ``node_1``, ``node_2``,
            ``edge`` and ``path`` keys and an optional ``file_contents`` key.

    Returns:
        The generated script text, cut to 2000 characters, or a message
        starting with ``"Error generating test script:"``. Errors are
        reported in the return value, never raised.
    """
    try:
        # literal_eval accepts Python literals only; it does not execute code.
        dictionary = ast.literal_eval(inputs)
        node_1 = dictionary['node_1']
        node_2 = dictionary['node_2']
        edge = dictionary['edge']
        path = dictionary['path']
        file_contents = dictionary.get('file_contents', 'File contents not provided')[:2000]  # Truncate to 2000 characters

        template = """
        Create a Python test automation script for the following scenario:
        - Node 1: {node_1}
        - Node 2: {node_2}
        - Edge: {edge}
        - Path: {path}
        - File Contents: {file_contents}

        Note: contextual proximity implies that while creating the graph they appeared in the same chunk of text.
        The script should:
        1. Set up the test environment.
        2. Read the file contents from the given path if you need more information.
        3. Provide detailed test cases for testing the file and its impact on a system level.
        4. Write comprehensive tests for the edge with respect to the file contents or assume data if necessary.
        5. Clean up the test environment.

        Please provide the complete Python script (assume data if necessary).
        """
        prompt = PromptTemplate(
            input_variables=["node_1", "node_2", "edge", "path", 'file_contents'],
            template=template
        )
        prompt_text = prompt.format(node_1=node_1, node_2=node_2, edge=edge, path=path, file_contents=file_contents)
        # Only the LLM call is retried; the reply text (not the message
        # object) is what gets truncated.
        return _invoke_test_script_llm(prompt_text)[:2000]  # Truncate to 2000 characters
    except Exception as e:
        return f"Error generating test script: {e}"

# Expose generate_test_script to the agent under the name "TestGenerator".
test_generator_tool = Tool(
    name="TestGenerator",
    func=generate_test_script,
    description="Generates a test automation script given inputs in dictionary form with the keys 'node_1', 'node_2', 'edge', and 'path'"
)

# Create the main agent
tools = [file_reader_tool, test_generator_tool]

# If initialization fails, the error is printed and `agent` is set to None;
# the processing loop checks `if agent` before using it.
try:
    agent = initialize_agent(
        tools,
        llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=False,
        handle_parsing_errors=True
    )
except Exception as e:
    print(f"Error initializing agent: {e}")
    agent = None

# Function to extract list of dictionaries from LLM output
def extract_dicts_from_llm_output(output):
    """Pull the plan dictionaries out of free-form LLM text.

    Every flat ``{...}`` fragment (no nested braces) is evaluated as a
    Python literal. A fragment is kept only if it is a dict holding all of
    ``node_1``, ``node_2``, ``edge``, ``priority`` and ``path``.

    Args:
        output: Text returned by the LLM.

    Returns:
        The accepted dicts in order of appearance; an empty list when
        nothing matches or ``output`` is not text (the error is printed).
    """
    required_keys = ('node_1', 'node_2', 'edge', 'priority', 'path')

    try:
        dict_strings = re.findall(r'\{[^{}]*\}', output)
    except TypeError as e:
        print(f"Error extracting dictionaries from LLM output: {e}")
        return []

    dicts = []
    for d_str in dict_strings:
        try:
            d = ast.literal_eval(d_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: skip this fragment on purpose. Listing
            # the exceptions keeps KeyboardInterrupt/SystemExit propagating.
            continue
        if isinstance(d, dict) and all(key in d for key in required_keys):
            dicts.append(d)

    return dicts

# Process the CSV in sections
# Process the CSV in sections
section_size = 5  # Reduced from 10 to 5
total_rows = len(df)
test_scripts = []
identified_nodes = []
# Markdown report; same name as the default output_file of write_markdown_section.
output_markdown = "output1.md"
# Number of sections whose results reached the report. The first successful
# write creates the file with its title, later ones append — even when an
# earlier section failed.
sections_written = 0

start_time = time.time()
total_sections = (total_rows + section_size - 1) // section_size

for section, start_row in enumerate(range(0, total_rows, section_size), 1):
    section_start_time = time.time()
    end_row = min(start_row + section_size, total_rows)
    # Start each section with empty lists so entries collected by a section
    # that failed half-way do not show up in the next one.
    identified_nodes = []
    test_scripts = []
    try:
        csv_section = get_csv_section(df, start_row, end_row)

        elapsed_time = time.time() - start_time
        estimated_total_time = (elapsed_time / section) * total_sections
        estimated_time_left = estimated_total_time - elapsed_time

        print(f"\nProcessing rows {start_row} to {end_row - 1} (Section {section}/{total_sections}):")
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        print(f"Estimated time left: {estimated_time_left:.2f} seconds")

        llm_context = f"""
        Here is a section of the CSV file (rows {start_row} to {end_row - 1}):

        {csv_section}

        Select which values of nodes, edges, paths are most relevant for SYSTEM LEVEL test automation scripts. Do not repeat nodes and edges.
        Assign them high or low priority for testing.
        Write them in dictionary format with keys: 'node_1','node_2','edge','priority','path'.
        Provide your selections as a list of dictionaries.
        """[:2000]  # Truncate to 2000 characters

        llm_plan = llm.invoke(llm_context)
        time.sleep(10)  # Increased delay between API calls

        plan_list = extract_dicts_from_llm_output(llm_plan.content)

        for item in plan_list:
            node_1 = item['node_1']
            node_2 = item['node_2']
            edge = item['edge']
            path = item['path']
            priority = item['priority']

            identified_nodes.append({
                'node_1': node_1,
                'node_2': node_2,
                'edge': edge,
                'priority': priority
            })

            script_input = str({
                'node_1': node_1,
                'node_2': node_2,
                'edge': edge,
                'path': path
            })[:2000]  # Truncate to 2000 characters
            if agent:
                result = agent.run(f"Use the TestGenerator tool to create a test script for: {script_input}")
                print("The Script input for the following nodes are : ", script_input)
                print("-----------------------------------------------------------------------------------------------------------------")
                print("The result for the following nodes are : ", result)
                time.sleep(10)  # Increased delay between API calls
                test_scripts.append(result[:2000])  # Truncate to 2000 characters
            else:
                print("Agent not initialized. Skipping test script generation.")

        write_markdown_section(
            identified_nodes,
            test_scripts,
            output_file=output_markdown,
            append=(sections_written > 0),
        )
        sections_written += 1

        print("##############################################################################################################")
        section_time = time.time() - section_start_time
        print(f"Time taken for this section: {section_time:.2f} seconds")
        time.sleep(10)  # Increased delay between sections

    except Exception as e:
        # One failing section must not stop the remaining ones.
        print(f"Error processing rows {start_row} to {end_row - 1}: {e}")

total_time = time.time() - start_time
print(f"\nTotal processing time: {total_time:.2f} seconds")
if sections_written:
    print(f"Results have been written to {output_markdown}")
else:
    print("No results were written.")