## Setup

In [1]:
# Prints a fixed string; presumably a leftover kernel smoke test -- TODO confirm it can be deleted.
print("yo")

yo


In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## input data directory
## (`data_dir` is the dataset folder name looked up under ./data_input)
data_dir = "cureus"
inputdirectory = Path(f"./data_input/{data_dir}/")
## This is where the output csv files will be written
## (a folder named after the input dataset, under ./data_output)
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [3]:
## Alternative loaders, kept for reference:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
## Load every file found under the input directory
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks with a target size of 1500 characters
## and an overlap of 150 characters between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
## Show a summary plus one sample chunk instead of dumping every chunk
## into the cell output. The slice keeps this safe when nothing was loaded.
print(f"{len(documents)} document(s) split into {len(pages)} chunk(s)")
print(pages[:1])


100%|██████████| 1/1 [00:02<00:00,  2.63s/it]

[Document(metadata={'source': 'data_input\\cureus\\Dien.DevelopBBE2008-pages.txt'}, page_content='Fax +41 61 306 12 34 E-Mail karger@karger.ch www.karger.com\n\nOriginal Paper\n\nBrain Behav Evol 2008;71:15–31\n\nDOI: 10.1159/000108608\n\nEarly Diencephalon Development in\n\nAlligator\n\nMichael B. Pritz\n\nDepartment of Neurological Surgery, Indiana University School of Medicine, Indianapolis, Ind. , USA noted further parcellation of the synencephalon into an an- terior and posterior component, whereas others have not. Notwithstanding differe     nces as to whether the synencepha- lon is a single unit or not, these detailed analyses in reptiles\n\n(Alligator) , birds (chick), and mammals (humans), suggest that the initial pattern of early diencephalon development in amniotes is similar.\n\nCopyright © 2007 S. Karger AG, Basel\n\nIntroduction\n\nA number of approaches have been used to unravel the organization and evolution of the forebrain in vertebrates [Nieuwenhuys, 1998b]. Varying 




## Create a dataframe of all the chunks

In [4]:
from helpers.df_helpers import documents2Dataframe

# Turn the chunk list into a dataframe; the recorded output of this cell shows
# the columns text, source and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
def clean_text(text):
    """Normalize a chunk of extracted text.

    The text is processed in three steps:

    1. Every run of whitespace (newlines, tabs, repeated spaces) becomes a
       single space, and leading/trailing whitespace is removed.
    2. Every character that is not an ASCII letter, a digit, a space or one
       of ``. , ; : ! ? " ' ( ) -`` is dropped.
    3. Whitespace is collapsed once more, because dropping a character that
       stood between two spaces (``"a & b"``) or at the start of the text
       would otherwise leave a double or leading space behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-line string.
    """
    # Characters that survive the filter
    allowed_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,;:!?\"'()-")

    # Normalise whitespace first so that tabs/newlines act as word separators
    # instead of being deleted by the character filter.
    collapsed = ' '.join(text.split())

    # Remove unwanted special characters
    filtered = ''.join(char for char in collapsed if char in allowed_chars)

    # Removing characters can create new runs of spaces; collapse them too.
    return ' '.join(filtered.split())

# Apply the clean_text function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(clean_text)

print(df.head())


(9, 3)
                                                text  \
0  Fax 41 61 306 12 34 E-Mail kargerkarger.ch www...   

                                            source  \
0  data_input\cureus\Dien.DevelopBBE2008-pages.txt   

                           chunk_id  
0  30663e108b0e44ada244bf0d2fbeacd5  


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, both dataframes are regenerated and written to the output directory in CSV format, so we don't have to calculate them again:

        dfg1 = dataframe of edges (graph.csv)

        df = dataframe of chunks (chunks.csv)


Otherwise, the edge dataframe is read back from the output directory.

In [6]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model="llama3:latest")
    dfg1 = graph2Df(concepts_list)
    ## Create the output folder (and any missing parents); no error if it already exists.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    ## Cache both frames so the LLM extraction does not have to be repeated.
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing, keep only rows that have both nodes and an
## edge label, and give every extracted relation a count of 4.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.


Here is the extracted ontology of terms and their relations in JSON format:

```
[
    {
        "node_1": "Alligator",
        "node_2": "Diencephalon Development",
        "edge": "The initial pattern of early diencephalon development in amniotes is similar, as seen in Alligator."
    },
    {
        "node_1": "Chick",
        "node_2": "Diencephalon Development",
        "edge": "Detailed analyses have been done in reptiles (Alligator), birds (chick), and mammals (humans) to study the initial pattern of early diencephalon development."
    },
    {
        "node_1": "Humans",
        "node_2": "Diencephalon Development",
        "edge": "Detailed analyses have been done in reptiles (Alligator), birds (chick), and mammals (humans) to study the initial pattern of early diencephalon development."
    },
    {
        "node_1": "Forebrain",
        "node_2": "Organization",
        "edge": "A number of approaches have been used to unravel the organization and evolution of the forebrain

ValueError: need at least one array to concatenate

## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that appear in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``. ``chunk_id`` values must be strings because they
            are concatenated with ``",".join``.

    Returns:
        A new dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-separated chunk ids of the joined rows for that pair),
        ``count`` (number of joined rows for that pair) and ``edge`` (always
        ``"contextual proximity"``). Pairs with a count of 1 are dropped.
        Each pair appears in both orientations, and the index is left as
        produced by the filtering (it is not reset).
    """
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    dfg2 = dfg_wide[dfg_wide["node_1"] != dfg_wide["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column index produced by the multi-function agg
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    # Empty-string nodes are treated as missing and removed
    dfg2 = dfg2.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. The explicit copy makes the column assignment
    # below act on an independent frame rather than on a filtered slice,
    # which would otherwise raise SettingWithCopyWarning.
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"
    return dfg2


# Derive the co-occurrence edges from the extracted relations and preview the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
5042,zhuang h,tsai hh,"5044cb2035684913bd12d1965ed603c9,5044cb2035684...",6,contextual proximity
5044,|log2fc| >2,3v-v vs 3v-d,"2138215950d7499baea4bf39bdd15b16,2138215950d74...",2,contextual proximity
5045,|log2fc| >2,"3v-v vs 3v-d, e","2138215950d7499baea4bf39bdd15b16,2138215950d74...",2,contextual proximity
5048,|log2fc| >2,"c, d","2138215950d7499baea4bf39bdd15b16,2138215950d74...",2,contextual proximity
5065,|log2fc| >2,"significantly up-regulated genes, sig = true","2138215950d7499baea4bf39bdd15b16,2138215950d74...",2,contextual proximity


### Merge both the dataframes

In [None]:
## Stack the extracted relations on top of the contextual-proximity edges, then
## fold rows sharing the same (node_1, node_2) pair into one: chunk ids and edge
## labels are joined with commas and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,100 mg/kg body weight,embryonic forebrain,"879a4722e54b4b7e97cbce18440da9f7,879a4722e54b4...",contextual proximity,3
1,100 mg/kg body weight,intraperitoneal injection,879a4722e54b4b7e97cbce18440da9f7,represents the method by which tamoxifen was a...,4
2,3v progenitors,astrocyte progenitors,"c36bd013662b43d2bb71a4d173abeb2d,c36bd013662b4...",contextual proximity,3
3,3v progenitors,eyfp-expressing cells,"c36bd013662b43d2bb71a4d173abeb2d,c36bd013662b4...",labeled cells in the 3V wall showed radial gli...,7
4,3v wall,astrocytes,"8a3132a7d3ef4227b0f0981fa3445ba7,3c3cf4f29d994...","In this study, we explored the fate specificat...",20
...,...,...,...,...,...
1382,zhuang h,tsai hh,"5044cb2035684913bd12d1965ed603c9,5044cb2035684...","co-authored by,contextual proximity",10
1383,|log2fc| >2,3v-v vs 3v-d,"2138215950d7499baea4bf39bdd15b16,2138215950d74...",contextual proximity,2
1384,|log2fc| >2,"3v-v vs 3v-d, e","2138215950d7499baea4bf39bdd15b16,2138215950d74...",A threshold of |log2FC| > 2 is used to filter ...,6
1385,|log2fc| >2,"c, d","2138215950d7499baea4bf39bdd15b16,2138215950d74...",contextual proximity,2


## Calculate the NetworkX Graph

In [None]:
# Unique node labels taken from both endpoint columns of the merged edge frame
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(317,)

In [None]:
import networkx as nx
G = nx.Graph()

## Register every node under its string label
G.add_nodes_from(str(label) for label in nodes)

## Connect the nodes: the relation text becomes the edge title and the
## count divided by 4 becomes the edge weight.
## NOTE(review): nx.Graph is undirected, so when dfg holds both (a, b) and
## (b, a) the later row replaces the title/weight set by the earlier one --
## confirm this is intended.
for source, target, relation, occurrences in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=relation, weight=occurrences / 4)

### Calculate communities for coloring the nodes

In [None]:
# girvan_newman returns an iterator over community partitions, one per level
# of the algorithm; each next() call advances one level.
communities_generator = nx.community.girvan_newman(G)
# First level is consumed but not used further in this cell.
top_level_communities = next(communities_generator)
# Second level is the partition used for coloring.
next_level_communities = next(communities_generator)
# Sort members within each community, then the communities themselves, for a stable listing.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  31
[['100 mg/kg body weight', 'abdominal cavity', 'abdominal fluids', 'cs-cdf-cg-pre vector', 'electrical pulses', 'embryonic forebrain', 'glass micropipette', 'hgfap-creert2;ai14 transgenic mice', 'hgfap-gfp plasmid', 'icr pregnant female mice', 'intraperitoneal injection', 'manually injected lentivirus', 'mice were perfusion-fixed', 'p14', 'plasmid dna solution', 'time points as indicated', 'warm saline', 'while the embryos were still in the uterus'], ['3v or lv', '3v progenitors', '3v wall', '3v wall at e17.5', 'ai14', 'ai14 transgenic mice', 'aldh1l1', 'aldh1l1+ cells', 'aqp4', 'aquaporin-4', 'astrocyte', 'astrocyte features in the dorsal part of the diencephalon', 'astrocyte lineage', 'astrocyte progenitors', 'astrocyte progenitors with various morphologies', 'astrocyte proliferation', 'astrocyte-like cells', 'astrocyte-like cells distributed across the entire forebrain', 'astrocyte-specific expression', 'astrocyte-specific hgfap-creert2; ai14', 'astrocyte

### Create a dataframe for community colors

In [None]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one palette color and a 1-based group number to each community.

    Args:
        communities: Sized sequence of communities, each an iterable of node
            labels.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``.
    """
    ## One color per community, taken from the module-level `palette`, in random order
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)
    ## Colors are handed out starting from the end of the shuffled list
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (community, shade) in enumerate(
            zip(communities, reversed(shades)), start=1
        )
        for member in community
    ]
    return pd.DataFrame(records)


# Build the node -> color/group lookup table for the detected communities
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,100 mg/kg body weight,#9b57db,1
1,abdominal cavity,#9b57db,1
2,abdominal fluids,#9b57db,1
3,cs-cdf-cg-pre vector,#9b57db,1
4,electrical pulses,#9b57db,1
...,...,...,...
312,pyramidal neurons and protoplasmic astrocytes,#db579c,29
313,noctor sc,#57dba3,30
314,symmetric and asymmetric division zones,#57dba3,30
315,orderly developmental program,#6757db,31


### Add colors to the graph

In [None]:
## Copy each node's group and color onto the graph; the node's degree is used as its size
for node_name, group_id, hex_color in zip(colors['node'], colors['group'], colors['color']):
    node_attrs = G.nodes[node_name]
    node_attrs['group'] = group_id
    node_attrs['color'] = hex_color
    node_attrs['size'] = G.degree[node_name]

In [None]:
from pyvis.network import Network
import webbrowser

graph_output_directory = "./docs/index.html"
## Resolve once: the absolute path is used both for writing the file and for
## building the URL handed to the browser.
graph_output_path = Path(graph_output_directory).resolve()
## The target folder may not exist yet (e.g. on a fresh checkout).
graph_output_path.parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    filter_menu=False,
)

# Assuming G is your networkx graph
# NOTE(review): the recorded run of this cell ended in KeyError: 'weight'.
# from_nx is handed a copy so that any attribute rewriting pyvis performs
# stays off G -- confirm whether the error persists with the installed pyvis.
net.from_nx(G.copy())
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
net.show_buttons(filter_=["physics"])

# Generate HTML content
html_content = net.generate_html()

# Write the HTML content with UTF-8 encoding
with open(graph_output_path, 'w', encoding='utf-8') as file:
    file.write(html_content)

# Optionally open the file in the browser. A file:// URI is passed because
# opening a bare (relative) filename is not portable across platforms.
webbrowser.open(graph_output_path.as_uri())


KeyError: 'weight'