## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [2]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[3].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 1/1 [00:04<00:00,  4.31s/it]

Number of chunks =  31
using stochasticity to improve the robustness of SNN has be-

come a feasible solution. Sharmin et al. stated that Poisson

coding sampling data from a Bernoulli distribution could

help SNN improve robustness (Sharmin et al. 2020). This

method has been applied to text classification (Lv, Xu, and

Zheng 2023) and sparked a discussion about the robustness

of the encoding methods (Kim et al. 2022). At the same time,

Li et al. believed that SNNs with inherent noise are more ro-

bust to input noise than ANNs (Li et al. 2020). The robust-

ness verification of the above works is mainly carried out

experimentally, lacking a theoretical explanation. For bio-

logical systems, the role of noise is more complex. Faisal et

al. reviewed the function of noise at the cellular level. How

much these noises contribute beneficially to neuronal pro-

cessing is the fundamental problem of neural coding (Faisal,

Selen, and Wolpert 2008). Thus, how to introduce meaning-

ful 




In [3]:
import importlib.util
import sys
import os

# 添加项目路径到sys.path
project_path = 'D:/python/知识图谱'
if project_path not in sys.path:
    sys.path.append(project_path)

# 加载 df_helpers 模块
# module_path = os.path.join(project_path, 'helpers', 'df_helpers.py')
module_path =  os.path.join(project_path, 'helpers', 'df_helpers.py')
spec = importlib.util.spec_from_file_location("helpers.df_helpers", module_path)
df_helpers = importlib.util.module_from_spec(spec)
spec.loader.exec_module(df_helpers)

# 使用模块中的函数
#documents2Dataframe = df_helpers.documents2Dataframe
print("Module imported successfully")


Module imported successfully


## Create a dataframe of all the chunks

In [4]:

from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(31, 3)


Unnamed: 0,text,source,chunk_id
0,AbstractSpiking neural networks (SNNs) exploit...,data_input\cureus\6.txt,8762f14295804af3b88c4da1f3d7a883
1,1997; Zenke et al. 2021; Xu et al. 2022; Shen ...,data_input\cureus\6.txt,43ef40bc8b384521b0f72c0cec3a3a31
2,"mazaki et al. 2022), where reliable perception...",data_input\cureus\6.txt,60ff2962c8e149e197e751fcba4b57d1
3,using stochasticity to improve the robustness ...,data_input\cureus\6.txt,d1b7b85d1d934a3dada6948360953dae
4,contributions are summarized as follows:\n\nWe...,data_input\cureus\6.txt,0443abbf08d7441d84a86cb011eda23c


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = False

if regenerate:
    concepts_list = df2Graph(df)
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"3_graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"3_chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"2_graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(319, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,intuitive reasoning network (irene),neural model,IRENE is a neural model for intuitive psycholo...,20798c58a9b94c11b981f2638d54af69,4
1,intuitive reasoning network (irene),graph neural network,IRENE combines a graph neural network for lear...,20798c58a9b94c11b981f2638d54af69,4
2,intuitive reasoning network (irene),transformer,IRENE combines a transformer to encode the tas...,20798c58a9b94c11b981f2638d54af69,4
3,irene,baby intuitions benchmark,IRENE achieves new state-of-the-art performanc...,20798c58a9b94c11b981f2638d54af69,4
4,irene,agents,IRENE is able to bind preferences to specific ...,20798c58a9b94c11b981f2638d54af69,4


## Calculating contextual proximity

In [7]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
6312,weihs et al.,riochet et al.,"4d0889b0ea3f453391eabf33d908578c,4d0889b0ea3f4...",2,contextual proximity
6313,weihs et al.,shu et al.,"4d0889b0ea3f453391eabf33d908578c,4d0889b0ea3f4...",2,contextual proximity
6334,worse scores,local directional relations,"071112cd95e94bb7bf3ff045a0ce6730,071112cd95e94...",3,contextual proximity
6338,worse scores,performance,"071112cd95e94bb7bf3ff045a0ce6730,071112cd95e94...",3,contextual proximity
6339,worse scores,remote relations,"071112cd95e94bb7bf3ff045a0ce6730,071112cd95e94...",2,contextual proximity


### Merge both the dataframes

In [8]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,2d videos,agent,"5b8f1c8378164d7ca9cdd5009fe37959,5b8f1c8378164...",contextual proximity,4
1,2d videos,colour,"5b8f1c8378164d7ca9cdd5009fe37959,5b8f1c8378164...",contextual proximity,2
2,2d videos,features,"5b8f1c8378164d7ca9cdd5009fe37959,5b8f1c8378164...",contextual proximity,4
3,2d videos,frames,"5b8f1c8378164d7ca9cdd5009fe37959,5b8f1c8378164...",contextual proximity,2
4,2d videos,graphs,"5b8f1c8378164d7ca9cdd5009fe37959,5b8f1c8378164...",contextual proximity,3
...,...,...,...,...,...
2563,weihs et al.,shu et al.,"4d0889b0ea3f453391eabf33d908578c,4d0889b0ea3f4...",contextual proximity,2
2564,winogrande: an adversarial winograd schema cha...,communications of the acm,c0b80dd2f5e44ceeae98b45ab138ca77,Published in,4
2565,worse scores,local directional relations,"071112cd95e94bb7bf3ff045a0ce6730,071112cd95e94...",contextual proximity,3
2566,worse scores,performance,"071112cd95e94bb7bf3ff045a0ce6730,071112cd95e94...",contextual proximity,3


## Calculate the NetworkX Graph

In [9]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(340,)

In [10]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [11]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  44
[['12 months of age', 'intentional stance'], ['2d videos', 'a. bulling', 'ablation studies', 'actions', 'adjacent and aligned', 'agent', 'agent and world state encoding', 'agent tests', "agent's past trajectories", 'agents', 'agents’ goals', 'agents’ preferences', 'ai capabilities', 'architecture', 'attention heads', 'baby intuitions benchmark', 'baby intuitions benchmark (bib)', 'bail-largeon 1987', 'bayesian theory of mind', 'bc-mlp', 'bc-rnn', 'benchmarks', 'bib', 'bib task', 'bipack', 'blocking barriers', 'blocking obstacles', 'coarse probabilistic object representations', 'collaborative agents', 'colour', 'common-sense reasoning', 'common-sense reasoning benchmarks', 'common-sense reasoning tasks', 'complementary basic concepts', 'complex tasks', 'context embedding', 'context embeddings', 'context encoder', 'core psychological reasoning', 'ctx token', 'dasgupta et al.', 'designing and creating new benchmarks', 'deutsche forschungsgemeinschaft', 'develop

### Create a dataframe for community colors

In [12]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,12 months of age,#db5785,1
1,intentional stance,#db5785,1
2,2d videos,#578bdb,2
3,a. bulling,#578bdb,2
4,ablation studies,#578bdb,2
...,...,...,...
335,social iqa: commonsense reasoning about social...,#db5f57,42
336,robotics: science and systems workshop on soci...,#a3db57,43
337,solving the baby intuitions benchmark with a h...,#a3db57,43
338,"shazeer, n.",#91db57,44


### Add colors to the graph

In [13]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [14]:
from pyvis.network import Network

graph_output_directory = "./docs/3_index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])
html_content = net.generate_html()

# Write the HTML content to a file with UTF-8 encoding
with open(graph_output_directory, 'w', encoding='utf-8') as f:
    f.write(html_content)
# net.show(graph_output_directory, notebook=False)

In [15]:
import pandas as pd

# 保存节点信息
nodes_data = []
for node in G.nodes(data=True):
    nodes_data.append({
        'node': node[0],
        'group': node[1].get('group'),
        'color': node[1].get('color'),
        'size': node[1].get('size')
    })

df_nodes = pd.DataFrame(nodes_data)
df_nodes.to_csv("nodes_data.csv", index=False)

# 保存边信息
edges_data = []
for edge in G.edges(data=True):
    edges_data.append({
        'node_1': edge[0],
        'node_2': edge[1],
        'weight': edge[2].get('weight')
    })

df_edges = pd.DataFrame(edges_data)
df_edges.to_csv("edges_data.csv", index=False)
communities_data = []
for group, community in enumerate(communities, 1):
    for node in community:
        communities_data.append({'node': node, 'group': group})

df_communities = pd.DataFrame(communities_data)
df_communities.to_csv("communities_data.csv", index=False)
