# Setup

In [19]:
import pandas as pd
import numpy as np
import os 
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random
import time

In [20]:
data_file = "output2.txt"
output_dir = Path(f"./output_dir/")

In [21]:
loader = TextLoader(data_file, encoding='utf-8')
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap = 100,
    length_function = len,
    is_separator_regex=False
)
pages = splitter.split_documents(documents)
print(len(pages))
pages = pages[10000:10300]  
print(pages[3].page_content[:10])

12257
/**
 * Tes


In [22]:
import httpx
api_key = os.getenv('API_KEY')
base_url = os.getenv('API_URL')
max_output_tokens = 300
streaming = False
http_client = httpx.Client(verify=False)
available_models = [
    "mixtral-8x7b-instruct-v01", 
    "gemma-7b-it", 
    "mistral-7b-instruct-v02", 
    "llama-2-70b-chat", 
    "phi-3-mini-128k-instruct", 
    "llama-3-8b-instruct"]

In [23]:
import sys
import json
from yachalk import chalk
from langchain_openai import ChatOpenAI,OpenAI
llm = OpenAI(
    base_url=base_url,
    model=available_models[0],
    http_client=http_client,
    api_key=api_key
)

# Creating all the utility functions

In [24]:


# Append the parent directory to the system path
sys.path.append("..")

# Initialize the ChatOpenAI client

def trim_incomplete_json(json_string):
    # Find the last occurrence of '}]' or '},' in the string
    last_complete = max(json_string.rfind('}]'), json_string.rfind('},'))
    
    if last_complete != -1:
        # If found, trim the string to that point and add closing bracket if needed
        trimmed = json_string[:last_complete+1]
        if not trimmed.endswith(']'):
            trimmed += ']'
        return trimmed
    else:
        # If no complete object found, return empty list
        return '[]'

def extract_concepts(prompt: str, metadata: dict = {}) -> list:
    SYS_PROMPT = (
        "Your task is to extract the key concepts (and non-personal entities) mentioned in the given context. "
        "Extract only the most important and atomistic concepts, breaking them down into simpler concepts if needed. "
        "Categorize the concepts into one of the following categories: "
        "[import statement, concept, function definition, object-calling, document, class-definition, condition, misc].\n"
        "Format your output as a list of JSON objects in the following format:\n"
        "[\n"
        "   {\n"
        '       "entity": "The Concept",\n'
        '       "importance": "The contextual importance of the concept on a scale of 1 to 5 (5 being the highest)",\n'
        '       "category": "The Type of Concept"\n'
        "   },\n"
        "   {...}\n"
        "]"
    )

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": prompt}
    ]
    
    response = llm.invoke(input=messages)
    print("Extract Prompt ", response)
    try:
        result = json.loads(response)
    except json.JSONDecodeError:
        print("\n\nWARNING ### Incomplete JSON detected. Attempting to trim...")
        trimmed_response = trim_incomplete_json(response)
        print(trimmed_response+"\n#####################################################################################################")
        try:
            result = json.loads(trimmed_response)
        except json.JSONDecodeError:
            print("\n\nERROR ### Failed to parse even after trimming. Here is the buggy response: ", response, "\n\n")
            return None

    if result is not None:
        result = [dict(item, **metadata) for item in result]

    return result

def graph_prompt(input_text: str, metadata: dict = {}) -> list:
    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts according to the context.\n"
        "Thought 1: While traversing through each sentence, think about whether Data is being passed to it\n"
        "\tTerms may include object creation, entity, class definition, import file, function signature, \n"
        "\tcondition, parameters, documents, service, concept, etc.\n"
        "\tTerms should be as concise as possible.\n\n"
        "Thought 2: Think about how these terms can have one-on-one relations with other terms.\n"
        "\tTerms mentioned in the same code or file are typically related to each other.\n"
        "\tTerms can be related to many other terms.\n\n"
        "Thought 3: Determine the relation between each related pair of terms.\n\n"
        "Format your output as a list of JSON objects. Each element of the list contains a pair of terms do not provide an explanation, JUST THE JSON OUTPUT "
        "and the relationship between them, as follows:\n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from the extracted ontology",\n'
        '       "node_2": "A related concept from the extracted ontology",\n'
        '       "edge": "The relationship between node_1 and node_2 in one or two sentences"\n'
        "   },\n"
        "   {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input_text}``` \n\n output: "

    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": USER_PROMPT}
    ]

    response = llm.invoke(input=messages)
    # print("Graph Prompt ", response)
    try:
        result = json.loads(response)
    except json.JSONDecodeError:
        # print("\n\nWARNING ### Incomplete JSON detected. Attempting to trim...")
        trimmed_response = trim_incomplete_json(response)
        # print(trimmed_response)
        # print("################################################################################################################")
        try:
            result = json.loads(trimmed_response)
        except json.JSONDecodeError:
            print("\n\nERROR ### Failed to parse even after trimming. Here is the buggy response: ")
            return None

    if result is not None:
        result = [dict(item, **metadata) for item in result]

    return result

## Dataframe and graph manipulation

In [25]:
import uuid
import pandas as pd
import numpy as np


def documents2Dataframe(documents) -> pd.DataFrame:
    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df


def df2ConceptsList(dataframe: pd.DataFrame) -> list:
    # dataframe.reset_index(inplace=True)
    results = dataframe.apply(
        lambda row: extract_concepts(
            row.text, {"chunk_id": row.chunk_id, "type": "concept"}
        ),
        axis=1,
    )
    # invalid json results in NaN
    results = results.dropna()
    results = results.reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    concept_list = np.concatenate(results).ravel().tolist()
    return concept_list


def concepts2Df(concepts_list) -> pd.DataFrame:
    ## Remove all NaN entities
    concepts_dataframe = pd.DataFrame(concepts_list).replace(" ", np.nan)
    concepts_dataframe = concepts_dataframe.dropna(subset=["entity"])
    concepts_dataframe["entity"] = concepts_dataframe["entity"].apply(
        lambda x: x.lower()
    )

    return concepts_dataframe


def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    total_rows = len(dataframe)
    processed_rows = 0
    start_time = time.time()

    def process_row(row):
        nonlocal processed_rows
        result = graph_prompt(row.text, {"chunk_id": row.chunk_id})
        processed_rows += 1
        elapsed_time = time.time() - start_time
        avg_time_per_row = elapsed_time / processed_rows
        estimated_time_remaining = (total_rows - processed_rows) * avg_time_per_row

        print(f"\rProcessing: {processed_rows}/{total_rows} rows | "
              f"Elapsed: {elapsed_time:.2f}s | "
              f"Estimated time remaining: {estimated_time_remaining:.2f}s", 
              end="", flush=True)
        return result

    results = dataframe.apply(process_row, axis=1)

    print("\nProcessing complete!")
    print(results)

    # Filter out None values and flatten the list of lists to one single list of entities.
    concept_list = [item for sublist in results if sublist is not None for item in sublist]
    return concept_list

def graph2Df(nodes_list) -> pd.DataFrame:
    ## Remove all NaN entities
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())

    return graph_dataframe

In [26]:

df = documents2Dataframe(pages)
print(len(df))

300


# Actual processing

In [27]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    dfg1.to_csv(output_dir/"graph.csv", sep="|", index=False)
    df.to_csv(output_dir/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(output_dir/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)

Processing: 31/300 rows | Elapsed: 565.73s | Estimated time remaining: 4909.09s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 41/300 rows | Elapsed: 736.18s | Estimated time remaining: 4650.53s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 44/300 rows | Elapsed: 805.86s | Estimated time remaining: 4688.66s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 102/300 rows | Elapsed: 1839.07s | Estimated time remaining: 3569.96s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 108/300 rows | Elapsed: 1955.50s | Estimated time remaining: 3476.45s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 133/300 rows | Elapsed: 2352.10s | Estimated time remaining: 2953.39s

ERROR ### Failed to parse even after trimming. Here is the buggy response: 
Processing: 163/300 rows | Elapsed: 2853.13s | Estim

## connecting node with more contextual proximity

adding count to the edges to design strength

In [28]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    
    dfg_long.drop(columns=["variable"], inplace=True)

    
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))

    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)

    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
5896,void,verifytimebasedpayload,"96448546d897479e89f7743755655ece,96448546d8974...",4,contextual proximity
5897,void **state,libspdm_context_t,"15db715acae549fbaba3d8821adb4099,15db715acae54...",2,contextual proximity
5899,void **state,libspdm_test_context_t,"904ffa7eddec45fa9bfe1bd7b22700ad,15db715acae54...",3,contextual proximity
5900,void **state,libspdm_test_responder_finish_case10,"904ffa7eddec45fa9bfe1bd7b22700ad,904ffa7eddec4...",3,contextual proximity
5905,void *data,verifytimebasedpayloadandupdate,"a2b41c75af5341e5b0c712bb28ae1c95,a2b41c75af534...",4,contextual proximity


In [29]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,*shadigest,uint8,"5a5c49109d3e40bba1b720c96851b5c1,5a5c49109d3e4...",contextual proximity,5
1,*shadigest,uintn,"5a5c49109d3e40bba1b720c96851b5c1,5a5c49109d3e4...",contextual proximity,2
2,*signercert,uint8,"5a5c49109d3e40bba1b720c96851b5c1,5a5c49109d3e4...",contextual proximity,5
3,*signercert,uintn,"5a5c49109d3e40bba1b720c96851b5c1,5a5c49109d3e4...",contextual proximity,2
4,*tbscert,uint8,"5a5c49109d3e40bba1b720c96851b5c1,5a5c49109d3e4...",contextual proximity,5
...,...,...,...,...,...
3316,void,verifytimebasedpayload,"96448546d897479e89f7743755655ece,96448546d8974...",contextual proximity,4
3317,void **state,libspdm_context_t,"15db715acae549fbaba3d8821adb4099,15db715acae54...",contextual proximity,2
3318,void **state,libspdm_test_context_t,"904ffa7eddec45fa9bfe1bd7b22700ad,15db715acae54...",contextual proximity,3
3319,void **state,libspdm_test_responder_finish_case10,"904ffa7eddec45fa9bfe1bd7b22700ad,904ffa7eddec4...",contextual proximity,3


In [30]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(860,)

# Drawing the graph

In [31]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

In [32]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  39
[['(efi_signature_data *)((uint8 *)dbxlist + siglistheadersize)', 'certhash'], ['*shadigest', '*signercert', '*tbscert', '*toplevelcert', 'acpi method', 'aead_cipher_suite', 'algorithm', 'allocatepool', 'allocatezeropool', 'allocationsize', 'api', 'app_message', 'app_message_size', 'application_secret', 'asb', 'assert_int_equal', 'assert_non_null', 'ata_pass_thru_protocol', 'atacommandblock', 'atapassthru', 'atapassthru->passthru', 'atapassthrucommandpacket', 'atapassthrupassthru', 'atapassthruprotocol', 'attributes', 'authdata', 'authdatasize', 'authenticated data', 'authentication pass', 'authenticationstatus', 'authserviceinternalfindvariable', 'authserviceinternalupdatevariablewithtimestamp', 'authsession', 'authstate', 'authvariableinfo.attributes', 'authvartype', 'authvartypekek', 'authvartypepk', 'base', 'base_asym_algo', 'base_hash_algo', 'block i/o device', 'blockcount', 'blockio', 'bool', 'boolean', 'buffer', 'bufsize', 'bus', 'busy', 'calculatepri

In [33]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)

In [34]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [35]:
import networkx as nx
from pyvis.network import Network


k_value = 2  # Adjust this value to change node spacing
pos = nx.spring_layout(G, k=k_value, iterations=50)


net = Network(notebook=False, cdn_resources="remote", height="1200px", width="1200px")


for node, (x, y) in pos.items():
    net.add_node(
        node,
        x=x * 200, 
        y=y * 200,
        physics=True,  
        **G.nodes[node] 
    )

# Add edges to the Pyvis network
for source, target, edge_attrs in G.edges(data=True):
    edge_data = edge_attrs.copy()
    edge_data['value'] = edge_data['weight']
    net.add_edge(
        source,
        target,
        **edge_data
    )

# Disable physics in Pyvis to maintain the NetworkX layout
net.toggle_physics(True)

# Save the network
net.show("./docs/index.html", notebook=False)

./docs/index.html
