## Setup

In [1]:
# Presumably a quick check that the kernel executes cells; prints a fixed string.
print("yo")

yo


In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; reused for both the input and the output location
data_dir = "cureus"
## Source documents are read from ./data_input/<data_dir>
inputdirectory = Path("./data_input") / data_dir
## The output csv files are written to ./data_output/<out_dir>
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [3]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
## Active loader: loads whatever DirectoryLoader finds under the input directory
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into chunks of at most 1500 characters (measured
## with len) that overlap by 150 characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview a single chunk as a sanity check. The index is guarded so that a
## small corpus (fewer than four chunks) does not abort the cell with IndexError.
preview_index = 3
if len(pages) > preview_index:
    print(pages[preview_index].page_content)
# pages=pages[:1]


100%|██████████| 1/1 [00:06<00:00,  6.12s/it]

Number of chunks =  56
Early diencephalon development in birds (chick) and mam- mals (humans) follows a similar pattern. Specifically, a single diencephalic compartment divides into two zones: the par- encephalon and synencephalon. Subsequently, the paren- cephalon becomes subdivided into an anterior and poste- rior unit. Some studies, including the present one, have

Received: December 19, 2006

Returned for revision: March 6, 2007

Accepted after revision: May 9, 2007

Published online: September 20, 2007

M.B. Pritz

Department of Neurological Surgery

Indiana University School of Medicine, 545 Barnhill Drive, Emerson 141

Indianapolis, IN 46202

5124 (USA)

Tel. +1 317 274 5728, Fax. +1 317 274 7351, E-Mail mpritz@iupui.edu

© 200  S. Karger AG, Basel

0006–8977/08/0711–0015$24.50/0

Accessible online at:

www.karger.com/bbe

7

Pritz

Brain Behav Evol 2008;71:15–31 16 have questioned whether such a phylotypic stage exists for body plans in vertebrates [Richardson et al., 1997]. It




## Create a dataframe of all the chunks

In [4]:
from helpers.df_helpers import documents2Dataframe

# Build the chunk dataframe from the split documents; the printed preview of this
# cell shows the columns text, source and chunk_id.
df = documents2Dataframe(pages)
def clean_text(text):
    """Normalize a chunk of extracted text to a single clean line.

    Steps:
      1. Every run of whitespace (newlines, tabs, repeated spaces) becomes one
         space, so line breaks never glue two words together.
      2. Characters outside the allowed set (ASCII letters, digits, space and
         the punctuation ``.,;:!?"'()-``) are removed.
      3. Whitespace is collapsed once more, because removing a character that
         stood between two spaces (e.g. ``"Copyright © 2007"``) would otherwise
         leave a double space, and removing a leading/trailing symbol would
         leave a stray space at the edge.

    Hyphenation artifacts such as ``"mam- mals"`` are not rejoined.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing allowed remains.
    """
    allowed_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,;:!?\"'()-")

    # Normalize whitespace first: newlines/tabs are not in allowed_chars, so they
    # must be turned into spaces before filtering or adjacent words would fuse.
    normalized = ' '.join(text.split())

    # Remove unwanted special characters
    filtered = ''.join(char for char in normalized if char in allowed_chars)

    # Filtering can create new runs of spaces; collapse them and trim the ends.
    return ' '.join(filtered.split())

# Apply the clean_text function to the 'text' column of the DataFrame,
# overwriting the raw chunk text with its cleaned version.
df['text'] = df['text'].apply(clean_text)


# Show the frame's dimensions and its first rows as a quick check.
print(df.shape)
print(df.head())


(56, 3)
                                                text  \
0  See discussions, stats, and author profiles fo...   
1  Copyright  2007 S. Karger AG, Basel Introducti...   
2  Key Words Alligator mississipiensis ! Developm...   
3  Early diencephalon development in birds (chick...   
4  Most developmental studies have focused on jus...   

                                      source                          chunk_id  
0  data_input\cureus\Dien.DevelopBBE2008.txt  b64a9052b7b54a8bac6f13bdeea4ccae  
1  data_input\cureus\Dien.DevelopBBE2008.txt  3d8adf9ff2f44cc887e5df05bca6f5f8  
2  data_input\cureus\Dien.DevelopBBE2008.txt  302e8070fe1d44a39a3f4a93b7a89957  
3  data_input\cureus\Dien.DevelopBBE2008.txt  c388e84508424260a917b0d5f0a8d565  
4  data_input\cureus\Dien.DevelopBBE2008.txt  369b3304c3784410a966b5fc1a8e03df  


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, both dataframes are regenerated and written to the output directory in CSV format, so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise the edge dataframe is read back from the output directory.

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = True

## Locations of the cached edge list and chunk table inside the output directory
graph_csv = outputdirectory/"graph.csv"
chunks_csv = outputdirectory/"chunks.csv"

if not regenerate:
    ## Reuse the edge list written by an earlier run
    dfg1 = pd.read_csv(graph_csv, sep="|")
else:
    ## Extract concepts from the chunks and convert them into an edge dataframe
    concepts_list = df2Graph(df, model="llama3:latest")
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)

    ## Cache both dataframes as pipe-separated csv files
    dfg1.to_csv(graph_csv, sep="|", index=False)
    df.to_csv(chunks_csv, sep="|", index=False)

## Treat empty strings as missing, drop edges lacking an endpoint or a label,
## and set the weight of every remaining relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)


[
    {
        "node_1": "Alligator",
        "node_2": "Early Diencephalon Development",
        "edge": "describes the development of"
    },
    {
        "node_1": "Diencephalon",
        "node_2": "Alligator",
        "edge": "in the context of its early development"
    },
    {
        "node_1": "Synencephalon",
        "node_2": "Anterior and Posterior Component",
        "edge": "is further parcellated into"
    },
    {
        "node_1": "Reptiles",
        "node_2": "Alligator",
        "edge": "in the context of early diencephalon development"
    },
    {
        "node_1": "Birds",
        "node_2": "Chick",
        "edge": "and mammals (humans) also exhibit similar patterns"
    },
    {
        "node_1": "Amniotes",
        "node_2": "Early Diencephalon Development",
        "edge": "share a similar initial pattern of development"
    }
][
   {
       "node_1": "forebrain",
       "node_2": "vertebrates

KeyboardInterrupt: 

## Calculating contextual proximity

In [None]:
# def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
#     ## Melt the dataframe into a list of nodes
#     dfg_long = pd.melt(
#         df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
#     )
#     dfg_long.drop(columns=["variable"], inplace=True)
#     # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
#     dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
#     # drop self loops
#     self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
#     dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
#     ## Group and count edges.
#     dfg2 = (
#         dfg2.groupby(["node_1", "node_2"])
#         .agg({"chunk_id": [",".join, "count"]})
#         .reset_index()
#     )
#     dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
#     dfg2.replace("", np.nan, inplace=True)
#     dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
#     # Drop edges with 1 count
#     dfg2 = dfg2[dfg2["count"] != 1]
#     dfg2["edge"] = "contextual proximity"
#     return dfg2


# dfg2 = contextual_proximity(dfg1)
# dfg2.tail()

### Merge both the dataframes

In [None]:
# dfg = pd.concat([dfg1, dfg2], axis=0)
## Only dfg1 is merged here; the concat with dfg2 above is commented out.
## Rows sharing the same (node_1, node_2) pair are combined: chunk ids and edge
## labels are comma-joined and the counts are summed.
edge_aggregations = {"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'}
dfg = dfg1.groupby(["node_1", "node_2"]).agg(edge_aggregations).reset_index()
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,abbreviations used in figures,acetyl tub,a069c2b3bb334c1fb53930c7b5c37bf4,used for,4
1,abbreviations used in figures,ache,a069c2b3bb334c1fb53930c7b5c37bf4,used for,4
2,abbreviations used in figures,cv,a069c2b3bb334c1fb53930c7b5c37bf4,used for,4
3,abbreviations used in figures,d,a069c2b3bb334c1fb53930c7b5c37bf4,used for,4
4,acetylated tubulin,diencephalon-mesencephalon border,513e045899fc4064bb3790bfa5458508,immunoreactivity positive,4
...,...,...,...,...,...
357,zona limitans interparencephalica,parencephalon,d94bb4a6b2e74ee4ad662a201f0b541b,separates into anterior and posterior divisions,4
358,zona limitans interparencephalica,parencephalon anterior,"708c27f976ef4120bfa867226b6a49b7,831684506fb74...","A band, the zona limitans interparencephalica,...",12
359,zona limitans interparencephalica,parencephalon posterior,"831684506fb74052b859de9c36321584,03d83c94e7b44...",The zona limitans interparencephalica separate...,8
360,zona limitans interparencephalica,zona limitans intrathalamica,f1fa2dacb94c413e906eca383de62bd2,might correspond to part or all of,4


## Calculate the NetworkX Graph

In [None]:
## Collect the distinct labels that appear in either endpoint column
endpoint_columns = [dfg[column] for column in ('node_1', 'node_2')]
nodes = pd.concat(endpoint_columns, axis=0).unique()
nodes.shape

(433,)

In [None]:
import networkx as nx
G = nx.Graph()

## Register every node label (as a string) with the graph
for label in nodes:
    G.add_node(str(label))

## Connect the endpoints of each merged edge; the edge label becomes the
## 'title' attribute and the summed count, divided by 4, becomes the weight.
edge_columns = zip(dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"])
for source, target, label, total in edge_columns:
    G.add_edge(str(source), str(target), title=label, weight=total/4)

### Calculate communities for coloring the nodes

In [None]:
# girvan_newman hands back a generator; each next() call below pulls the
# following partition from it, so the order of the two calls matters.
# NOTE(review): per the NetworkX docs each step should split the graph one
# level further — confirm this is the intended granularity.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
# The second partition is the one used from here on.
next_level_communities = next(communities_generator)
# Sort the members of every community, then sort the communities themselves.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  114
[["'-tubulin", 'micro-tubules'], ['abbreviations used in figures', 'acetyl tub', 'ache', 'cv', 'd'], ['acetylated tubulin', 'calretinin', 'cells', 'diencephalon-mesencephalon border', 'longitudinal fibers', 'microtubules'], ['acetylated tubulin immunoreactivity', 'longitudinal division'], ['acetylcholine-positive neuroblasts', 'alligator', 'alligator mississipiensis', 'american alligator', 'amniotes', 'avian', 'bat', 'behav evol', 'brain', 'brains', 'cell proliferation', 'chick vaage, 1969', 'compartment development', 'd1and d2', 'd2 compartment mller and orahilly, 1997', 'developing diencephalon', 'development', 'developmentally significant criteria', 'diencephalic development', 'diencephalic segmentation', 'diencephalic subdivisions', 'diencephalon', 'diencephalon development', 'division', 'dlx-1', 'dlx-2', 'dorso-ventral axes', 'dorsoventral', 'early diencephalon development in alligator', 'early stages of development', 'events', 'evol', 'evolution', 'ex

### Create a dataframe for community colors

In [None]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one group number to every community.

    A palette with as many colours as there are communities is generated from
    the module-level ``palette`` name and shuffled; each community then takes
    one colour off the end of the shuffled list. Group numbers start at 1 and
    follow the order of ``communities``.

    Args:
        communities: Iterable of communities, each an iterable of node labels.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``.
    """
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    records = []
    for group_id, members in enumerate(communities, start=1):
        community_color = hex_colors.pop()
        records.extend(
            {"node": member, "color": community_color, "group": group_id}
            for member in members
        )
    return pd.DataFrame(records)


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,'-tubulin,#db57d3,1
1,micro-tubules,#db57d3,1
2,abbreviations used in figures,#dbb957,2
3,acetyl tub,#dbb957,2
4,ache,#dbb957,2
...,...,...,...
428,synencephalon posterior (d4),#dbd557,112
429,teleosts,#57db90,113
430,zebrafish,#57db90,113
431,tissue sectioned transversally,#db57b7,114


### Add colors to the graph

In [None]:
## Copy group and colour onto each graph node and size the node by its degree
for node, group, color in zip(colors['node'], colors['group'], colors['color']):
    attributes = G.nodes[node]
    attributes['group'] = group
    attributes['color'] = color
    attributes['size'] = G.degree[node]

In [None]:
from pathlib import Path
from pyvis.network import Network
import webbrowser

# Despite its name this is the path of the generated HTML file, not a directory.
graph_output_directory = "./docs/index.html"

net = Network(
    notebook=False,
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    filter_menu=False,
)

# Assuming G is your networkx graph
net.from_nx(G)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])

# Generate HTML content
html_content = net.generate_html()

# Make sure the target folder exists; on a fresh checkout without ./docs the
# open() below would otherwise fail with FileNotFoundError.
graph_output_path = Path(graph_output_directory)
graph_output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the HTML content with UTF-8 encoding
with open(graph_output_path, 'w', encoding='utf-8') as file:
    file.write(html_content)

# Optionally open the file in the browser. webbrowser.open expects a URL and
# passing a bare relative file name is not portable, so hand it an absolute
# file:// URI instead.
webbrowser.open(graph_output_path.resolve().as_uri())


True