## Load the Data

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Dataset folder name; the output folder reuses the same name
data_dir = "jpetstore"
out_dir = data_dir

## Source documents are read from here
inputdirectory = Path(f"./data_input/{data_dir}")
## Generated csv files are written here
outputdirectory = Path(f"./data_output/{out_dir}")



## Create Pages

In [2]:
## Load every file found under the input directory.
## Alternative loaders kept for reference:
##   a directory of PDFs
# loader = PyPDFDirectoryLoader(inputdirectory)
##   a single PDF file
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into overlapping text chunks.
## Lengths are measured with len(), i.e. in characters: a chunk size of 1500
## with an overlap of 150 between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

## NOTE: "pages" holds the chunks produced by the splitter, not document pages.
pages = splitter.split_documents(documents)
print("Number of pages = ", len(pages))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.69s/it]

Number of pages =  57





## Create a dataframe of all the pages

In [3]:
## Turn the list of chunks into a dataframe, one row per chunk.
## The displayed result has the columns text, source and chunk_id.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(57, 3)


Unnamed: 0,text,source,chunk_id
0,Found 57 source code files: ../jpetstore/src/W...,data_input/jpetstore/jpetstore.txt,f0a53e80a16644a8bb429157dd336adc
1,return address2;\n\n}\n\npublic void setAddres...,data_input/jpetstore/jpetstore.txt,27da81d187e14be398e34cc28f90cd0d
2,../jpetstore/src/WEB\n\nINF/classes/com/ibatis...,data_input/jpetstore/jpetstore.txt,031bed0aea024e1a97917d1cba57994e
3,"public void setQuantityByItemId(String itemId,...",data_input/jpetstore/jpetstore.txt,96fd3b1d7da24168992fa71cf364d4be
4,}\n\npublic void setCategoryId(String category...,data_input/jpetstore/jpetstore.txt,844e66ec40364951b1e1b3e09bf8afa7


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to disk in CSV format, so we don't have to compute them again.

        dfne = dataframe of edges

        df = dataframe of chunks


Otherwise the edge dataframe (`dfne`) is read back from `graph.csv` in the output directory; the chunk dataframe is not reloaded by that cell.

In [5]:
## Split the chunk dataframe into row-wise batches.
## chunksize is the number of rows per batch.
chunksize = 32
list_df = [df[start:start + chunksize] for start in range(0, len(df), chunksize)]
print(len(list_df))

## Dry run of the per-batch output file names; nothing is written here.
for i in range(len(list_df)):
    graph_file = f"graph_{i}.csv"
    outputfile = Path(f"./data_output/{out_dir}/{graph_file}")
    

2


Generate New Knowledge Graph
=====================

In [None]:
## Generate the knowledge graph batch by batch with the LLM.
##
## Each batch is written to its own graph_<i>.csv / chunks_<i>.csv so that an
## interrupted run still leaves the finished batches on disk. The combined
## graph.csv / chunks.csv are written once at the end, with a single header,
## which also makes the cell safe to re-run (no appending to stale files).

# chunk size is the number of rows per batch
chunksize = 32
list_df = [df[i:i+chunksize] for i in range(0, len(df), chunksize)]
print(len(list_df))

# Create the output directory up front, before any expensive LLM call,
# so a long run cannot fail at write time.
os.makedirs(outputdirectory, exist_ok=True)

graph_batches = []
# The loop variable is deliberately NOT called `df`: rebinding `df` here would
# leave the notebook-wide chunk dataframe pointing at the last batch only.
for i, df_batch in enumerate(list_df):
    outputfile_graph = outputdirectory / f"graph_{i}.csv"
    outputfile_chunks = outputdirectory / f"chunks_{i}.csv"

    concepts_list = df2Graph(df_batch, model='zephyr:latest')
    dfg_batch = graph2Df(concepts_list)
    graph_batches.append(dfg_batch)

    dfg_batch.to_csv(outputfile_graph, sep="|", index=False)
    df_batch.to_csv(outputfile_chunks, sep="|", index=False)

# Combine all batches; concat aligns columns by name, so batches whose
# columns differ still produce a well-formed csv.
dfg1 = pd.concat(graph_batches, ignore_index=True)
dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)

## Drop incomplete edges and set the weight of every LLM-extracted relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()
    

2
[
   {
       "node_1": "Account",
       "node_2": "userId",
       "edge": "String userId; represents the unique identifier for a user in this application's domain."
   },
   {
       "node_1": "Account",
       "node_2": "email",
       "edge": "String email; holds the registered email address of the user."
   },
   {
       "node_1": "Account",
       "node_2": "firstName",
       "edge": "String firstName; stores the given name of the user."
   },
   {
       "node_1": "Account",
       "node_2": "lastName",
       "edge": "String lastName; contains the family name or surname of the user."
   },
   {
       "node_1": "Account",
       "node_2": "status",
       "edge": "String status; indicates the current state or condition of the user's account in this system."
   },
   {
       "node_1": "Account",
       "node_2": "address1",
       "edge": "String address1; holds the primary street address or house number of the user's residence."
   },
   {
       "node_1": "Account",
    

Use Existing Knowledge Graph
=====================

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = False


if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    # exist_ok avoids the separate exists() check
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    # NOTE: on_bad_lines='skip' silently drops rows whose field count does not
    # match the header.
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|", on_bad_lines='skip')
    # A graph.csv produced by appending batches with to_csv(mode='a') contains
    # one header line per batch; those repeated headers are parsed as data rows
    # and would otherwise become a bogus "node_1" -> "node_2" edge.
    dfg1 = dfg1[dfg1["node_1"] != "node_1"]

## Drop incomplete edges and set the weight of every LLM-extracted relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
# Only the last expression of a cell is displayed, so show the tail only.
dfg1.tail()

(916, 9)


Unnamed: 0,node_1,node_2,edge,chunk_id,node_3,node_4,edge_1,edge_2,count
918,languages,accountform.getlanguages(),Can be accessed using the getter method of Acc...,8be17bec52764f38828e8bdf8d21c85a,,,,,4
919,setlanguages,accountform,Used to set the value of languages property of...,8be17bec52764f38828e8bdf8d21c85a,,,,,4
920,categories,accountform.getcategories(),Can be accessed using the getter method of Acc...,8be17bec52764f38828e8bdf8d21c85a,,,,,4
921,setcategories,accountform,Used to set the value of categories property o...,8be17bec52764f38828e8bdf8d21c85a,,,,,4
922,validate,accountform.getvalidate(),Can be accessed using the getter method of Acc...,8be17bec52764f38828e8bdf8d21c85a,,,,,4


## Calculating contextual proximity

In [7]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build co-occurrence ("contextual proximity") edges between nodes.

    Two nodes are linked when they appear in the same text chunk. Every
    appearance of a node in ``node_1`` or ``node_2`` counts, so a pair that
    co-occurs several times gets a higher count. Both directions of a pair
    (a, b) and (b, a) are returned.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``,
            ``node_1`` and ``node_2``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks where the pair co-occurs), ``count``
        (number of co-occurrences) and ``edge`` (always the string
        "contextual proximity"). Pairs with a count of 1 are dropped.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    is_self_loop = dfg_wide["node_1"] == dfg_wide["node_2"]
    dfg2 = dfg_wide[~is_self_loop].reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. The label is added with assign() so the new
    # column lands on a fresh frame instead of on a filtered slice
    # (which raises SettingWithCopyWarning).
    dfg2 = dfg2[dfg2["count"] != 1].assign(edge="contextual proximity")
    return dfg2


## Co-occurrence edges derived from the LLM edge list
dfg2 = contextual_proximity(dfg1)

### Merge both the dataframes

In [8]:
## Stack the LLM edges on top of the proximity edges, then collapse rows that
## share the same (node_1, node_2) pair: chunk ids and edge labels are
## comma-joined and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,'com.ibatis.db.dao',basemapdao,"727c8467fbb54c1782931a63326cdb50,727c8467fbb54...",contextual proximity,2
1,'com.ibatis.db.dao',categorydao,"727c8467fbb54c1782931a63326cdb50,727c8467fbb54...",contextual proximity,4
2,'com.ibatis.db.dao',getnextid method of class 'categorymapdao',"727c8467fbb54c1782931a63326cdb50,727c8467fbb54...",contextual proximity,3
3,'com.ibatis.db.dao',sequence class,"727c8467fbb54c1782931a63326cdb50,727c8467fbb54...",contextual proximity,2
4,'com.ibatis.db.dao.impl.abstractbasesqlmapper',basemapdao,"727c8467fbb54c1782931a63326cdb50,727c8467fbb54...",contextual proximity,2
...,...,...,...,...,...
7254,zip,setzip,1d1530ecebdc4c48bf829539106d365d,The Cart class has a member variable named zip...,4
7255,zip,state,"1d1530ecebdc4c48bf829539106d365d,3ebda43775704...",contextual proximity,4
7256,zip,status,"3ebda43775704e379e4792a1639f8e14,28c4cf26aeff4...",contextual proximity,3
7257,zip,userid,"28c4cf26aeff4a75b7dff80bdb6f42ad,24481914343a4...",contextual proximity,2


## Calculate the NetworkX Graph

In [9]:
## Distinct node labels: every value of node_1 followed by every value of node_2
nodes = pd.concat([dfg[col] for col in ("node_1", "node_2")], axis=0).unique()
nodes.shape

(552,)

In [10]:
import networkx as nx

## Undirected graph holding every concept as a node
G = nx.Graph()

## Add nodes to the graph; labels are always stored as strings
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph: the edge text becomes the title and a quarter of
## the count becomes the weight.
## NOTE(review): dfg can hold both (a, b) and (b, a); on an undirected graph the
## later row presumably overwrites the attributes of the earlier one - confirm.
for node_1, node_2, edge, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(node_1), str(node_2), title=edge, weight=count / 4)

### Calculate communities for coloring the nodes

In [11]:
## Community detection with networkx's girvan_newman. The result is consumed
## as an iterator: each next() call pulls the next partition from it.
communities_generator = nx.community.girvan_newman(G)
## NOTE(review): louvain_communities presumably returns the partition directly
## rather than an iterator - confirm before swapping it in with the next() calls below.
#communities_generator = nx.community.louvain_communities(G)
## The first partition is kept for reference only; the second one is used below.
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort the members of each community, then the communities themselves,
## so the printed order is stable.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  5
[["'com.ibatis.db.dao'", "'com.ibatis.db.dao.impl.abstractbasesqlmapper'", "'com.ibatis.db.sqlmap.client.sqlmapclient'", 'accountdao', 'accountform', 'accountform.getpassword()', 'accountform.getusername()', 'acctform', 'acctform.getaccount().getfavouritecategoryid()', 'action', 'actionerrors errors', 'actionform', 'actionmapping', 'actionmapping mapping', 'actionservlet', 'actionsupport', 'additemtocartaction', 'american express', 'attribute1', 'attribute2', 'attribute3', 'attribute4', 'attribute5', 'baseaction', 'baseaction.daomanager', 'basecachedao', 'baseform', 'basemapdao', 'boolean', 'cache', 'cachechannel', 'cacheing', 'card_type_list', 'cartform', 'cartform.getcart().additem(item)', 'cartform.getcart().containsitemid(cartform.getworkingitemid())', 'cartform.getcart().incrementquantitybyitemid(cartform.getworkingitemid())', 'cartform.getworkingitemid()', 'cartitem', 'cartitems', 'categories', 'category', 'categorydao', 'categoryform', 'categoryid', 'c

### Identify services

In [None]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Topic
from helpers.df_helpers import topics2Df
import re

def clean_list(my_list) -> list:
    """Flatten each group of labels into one plain-text string.

    For every element of ``my_list`` (an iterable of strings) the items are
    stripped of surrounding single quotes and joined with spaces; every run of
    characters other than ASCII letters and digits is then replaced by a
    single space.

    Returns:
        A list with one cleaned string per element of ``my_list``.
    """
    cleaned_list = []
    for element in my_list:
        joined = ' '.join(item.strip("'") for item in element)
        cleaned_list.append(re.sub(r'[^A-Za-z0-9]+', ' ', joined))
    return cleaned_list
    
## One cleaned text blob per community, stored in a single 'text' column
topics = clean_list(communities)
df_topics = pd.DataFrame(topics).rename(columns={0: 'text'})

## Ask the LLM for a topic per community
topic_list = df2Topic(df_topics, model='zephyr:latest')

#dfg1 = graph2Df(concepts_list1)

#print("done")



#topic_list = df2Topic(df_communites, model='zephyr:latest')
#topics = topics2Df(topic_list)




In [None]:
#Identify services
#
# FIXME(review): this cell cannot run as written.
#   1. The indented statements below have no enclosing block, so Python rejects
#      the whole cell with "IndentationError: unexpected indent".
#   2. classify_topic_groups is neither defined nor imported in the visible
#      part of this notebook.
# The indented lines match the graph-generation code used earlier in this
# notebook and look like a pasted fragment - confirm whether they belong here.

#print(len(df))
# creates a concept list graph from the dataframe feed in and uses the zephyr model to do it. 
    concepts_list = df2Graph(df, model='zephyr:latest')

    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)


# take the communities from above. 
        
topics = classify_topic_groups(communities)
#This is broken again....
print(topics)


### Create a dataframe for community colors

In [53]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one group number to every community.

    A palette with as many colours as there are communities is generated from
    the notebook-level ``palette`` name and shuffled. Groups are numbered from
    1 in the order of ``communities``.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``.
    """
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    records = []
    # Colours are handed out from the end of the shuffled palette backwards
    for group, (community, color) in enumerate(
        zip(communities, reversed(shades)), start=1
    ):
        records.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(records)


## One row per node with its community colour and group number
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,'com.ibatis.db.dao',#5f57db,1
1,'com.ibatis.db.dao.impl.abstractbasesqlmapper',#5f57db,1
2,'com.ibatis.db.sqlmap.client.sqlmapclient',#5f57db,1
3,accountdao,#5f57db,1
4,accountform,#5f57db,1
...,...,...,...
366,setshiptolastname,#db57d3,6
367,settotalprice,#db57d3,6
368,shiptofirstname,#db57d3,6
369,shiptolastname,#db57d3,6


### Add colors to the graph

In [54]:
## Copy group and colour onto each graph node; the node size is its degree
for node, color, group in zip(colors["node"], colors["color"], colors["group"]):
    node_attrs = G.nodes[node]
    node_attrs['group'] = group
    node_attrs['color'] = color
    node_attrs['size'] = G.degree[node]

In [20]:
from pyvis.network import Network

## Target html file: ./data_output/<out_dir>/<out_dir>.html
## NOTE: despite its name, graph_output_directory is the path of the html FILE.
#graph_output_directory = "./docs/index.html"
graph_output_file = Path(f"{out_dir}.html")

graph_output_directory = Path(f"./data_output/{out_dir}/{graph_output_file}")
print(graph_output_directory)

## Interactive network view, 900px high and full width, with a node select menu
net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the networkx graph, including the node and edge attributes set on it
net.from_nx(G)
## Layout: force_atlas_2based is active; the other layouts are kept as alternatives
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Only show the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

## Rendering is disabled in this cell, so no file is written here.
## NOTE(review): graph_output_directory is a Path here - confirm pyvis accepts a
## Path (rather than a str) before re-enabling the call below.
#net.show(graph_output_directory, notebook=False)

data_output/logjpetstore/logjpetstore.html


In [15]:
from pyvis.network import Network

## Target html file. NOTE: despite its name, graph_output_directory is the path
## of the html FILE; the name is kept because other cells use it.
graph_output_directory = "./docs/index.html"

## Interactive network view, 900px high and full width, with a node select menu
net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the networkx graph, including the node and edge attributes set on it
net.from_nx(G)
## Layout: force_atlas_2based is active; the other layouts are kept as alternatives
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Only show the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

## Make sure the target folder exists; writing the html file fails otherwise
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)
net.show(graph_output_directory, notebook=False)

./docs/index.html
