## Setup

In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import random

# ## Input data directory
# data_dir = "cureus"
# inputdirectory = Path(f"./data_input/{data_dir}")
# ## This is where the output csv files will be written
# out_dir = data_dir
# outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [38]:
# Load the bird table; the CSV's unnamed first column is used as the row index
df = pd.read_csv("Birds Data.csv", index_col="Unnamed: 0")
df.head()

Unnamed: 0,Species,eBird Link,Info
1,Rock Pigeon,/species/rocpig/IN-PB-SH,Fairly large pigeon with wild and feral popula...
2,Eurasian Collared-Dove,/species/eucdov/IN-PB-SH,Large pale dove with a black crescent on the n...
3,Laughing Dove,/species/laudov1/IN-PB-SH,"Small, similar in size to European Turtle-Dove..."
4,Black Kite,/species/blakit1/IN-PB-SH,"Medium-sized, rather nondescript raptor with o..."
5,Rose-ringed Parakeet,/species/rorpar/IN-PB-SH,"Vibrantly bright green parakeet, frequently fo..."


In [39]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Build one Document per row: the species name and its description are
# combined into a single text blob so the splitter sees them together.
documents = [
    Document(page_content=f"Species: {record['Species']}\nInfo: {record['Info']}\n")
    for _, record in df.iterrows()
]

# Recursive character splitter; chunk sizes are measured with len()
splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=150,
    chunk_size=1500,
)

# Break the documents into chunks and report how many were produced
chunks = splitter.split_documents(documents)
print("Number of chunks =", len(chunks))

Number of chunks = 351


In [6]:
# Peek at the first chunk to sanity-check the splitter output
print("Chunk:")
print(chunks[0].page_content)

Chunk:
Species: Rock Pigeon
Info: Fairly large pigeon with wild and feral populations throughout the world. True wild birds nest on cliffs and in caves from western Europe to central Asia. Pale gray overall with two bold black wingbars and iridescent purple and green on neck. Feral varieties are common in cities and farmland, often in large flocks. Variable plumage: some identical to wild-type birds, but can be completely black, white, or orangey-brown and any combination in between.


## Create a dataframe of all the chunks

In [40]:
from helpers.df_helpers import documents2Dataframe
# NOTE: this rebinds `df`. From here on it holds the chunk table returned by
# documents2Dataframe (the output below shows `text` and `chunk_id` columns),
# not the rows read from the CSV.
df = documents2Dataframe(chunks)
print(df.shape)
df.head()

(351, 2)


Unnamed: 0,text,chunk_id
0,Species: Rock Pigeon\nInfo: Fairly large pigeo...,5bbd095cd23f4130a6d672640d5b83eb
1,Species: Eurasian Collared-Dove\nInfo: Large p...,b2759167891e430da492381acb859f56
2,"Species: Laughing Dove\nInfo: Small, similar i...",00beb63d4e7145efa09c11aec218c9ec
3,"Species: Black Kite\nInfo: Medium-sized, rathe...",0d64fb5953ac4becba9fafbce9434980
4,Species: Rose-ringed Parakeet\nInfo: Vibrantly...,d861ec9a63df40f2a917cfb3f49b79ed


## Extract Concepts

In [41]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to CSV files so that we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe (`dfg1`) is read back from the output directory.

In [42]:
# Directory used by the regenerate cell below for its pipe-separated CSV files
outputdirectory = Path("./data_output")

In [43]:
# Keep only the rows labelled 5 through 100 (.loc slices by label and includes
# both endpoints), so only this subset is handed to df2Graph below.
df = df.loc[5:100]

In [44]:
## To regenerate the graph with LLM, set this to True
regenerate = True

## One set of file names shared by the write branch and the read branch, so
## that a graph saved with regenerate = True is the same file that gets loaded
## when regenerate = False.
graph_csv = outputdirectory / "graph2.csv"
chunks_csv = outputdirectory / "chunks2.csv"

if regenerate:
    ## Extract the concepts with the model and convert them into an edge dataframe
    concepts_list = df2Graph(df, model='mistral:latest')
    dfg1 = graph2Df(concepts_list)

    ## Create the output directory if needed (no error when it already exists)
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(graph_csv, sep="|", index=False)
    df.to_csv(chunks_csv, sep="|", index=False)
else:
    dfg1 = pd.read_csv(graph_csv, sep="|")

## Treat empty strings as missing, drop incomplete edges, and set the weight of
## every extracted relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

An error occurred: 500 Server Error: Internal Server Error for url: http://localhost:11434/api/generate


ERROR ### Here is the buggy response:  None 


 [
   {
       "node_1": "Common Tailorbird",
       "node_2": "warblerlike bird",
       "edge": "The Common Tailorbird is a warblerlike bird."
   },
   {
       "node_1": "Common Tailorbird",
       "node_2": "green back",
       "edge": "The Common Tailorbird has a green back."
   },
   {
       "node_1": "Common Tailorbird",
       "node_2": "reddish crown",
       "edge": "The Common Tailorbird has a reddish crown."
   },
   {
       "node_1": "Common Tailorbird",
       "node_2": "long bill",
       "edge": "The Common Tailorbird has a long bill."
   },
   {
       "node_1": "Common Tailorbird",
       "node_2": "parks, gardens, and wooded areas",
       "edge": "The Common Tailorbird is an inhabitant of parks, gardens, and wooded areas."
   },
   {
       "node_1": "Common Tailorbird",
       "node_2": "South and Southeast Asia"

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,common tailorbird,warblerlike bird,The Common Tailorbird is a warblerlike bird.,9ad3f5260d19447388f177864213431a,4
1,common tailorbird,green back,The Common Tailorbird has a green back.,9ad3f5260d19447388f177864213431a,4
2,common tailorbird,reddish crown,The Common Tailorbird has a reddish crown.,9ad3f5260d19447388f177864213431a,4
3,common tailorbird,long bill,The Common Tailorbird has a long bill.,9ad3f5260d19447388f177864213431a,4
4,common tailorbird,"parks, gardens, and wooded areas",The Common Tailorbird is an inhabitant of park...,9ad3f5260d19447388f177864213431a,4


## Calculating contextual proximity

In [45]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that appear in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``node_1``, ``node_2``
            and ``chunk_id``. It is not modified.

    Returns:
        A new dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids of the chunks behind each pairing), ``count``
        (number of pairings found) and ``edge`` (always the string
        "contextual proximity"). Pairs that were found only once are left
        out. Because the pairs come from a self-join, each surviving pair is
        present in both directions.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    nodes_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key will create a link between terms occurring in the same text chunk.
    pairs = pd.merge(nodes_long, nodes_long, on="chunk_id", suffixes=("_1", "_2"))

    # Drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]

    # Empty-string node names are treated as missing and removed
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Drop edges with 1 count. The explicit copy makes the result an
    # independent frame, so adding the "edge" column below does not write
    # into a slice of `edges`.
    edges = edges[edges["count"] != 1].copy()
    edges["edge"] = "contextual proximity"
    return edges


# Derive the "contextual proximity" edges from the extracted edges in dfg1
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
26337,yellow-wattled lapwing,vocalization,"a5d8944eebdf453785ae4bbec9ba0309,a5d8944eebdf4...",2,contextual proximity
26341,yellowish feet (golden slippers),breeding adult little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",2,contextual proximity
26344,yellowish feet (golden slippers),little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",10,contextual proximity
26353,zebra-striped wings,eurasian hoopoe,"e30e2d93a6194996b9c06e4aef6473d9,e30e2d93a6194...",10,contextual proximity
26366,zwee! calls,indian white-eye,"131a43367f8a41cb850d5d325732150f,131a43367f8a4...",12,contextual proximity


### Merge both the dataframes

In [46]:
# Stack the extracted edges (dfg1) on top of the proximity edges (dfg2), then
# collapse duplicate node pairs: chunk ids and edge texts are joined with
# commas and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,2 long wispy head plumes,breeding adult little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",contextual proximity,2
1,2 long wispy head plumes,little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",contextual proximity,10
2,absence of blue,bluethroat,"2162f35ac20b45a6adff6fc31a9ff675,2162f35ac20b4...",contextual proximity,6
3,activity,calls,"dfbf1312e1134629ac4d830dca56d2c9,dfbf1312e1134...",contextual proximity,2
4,activity,color,"dfbf1312e1134629ac4d830dca56d2c9,dfbf1312e1134...",contextual proximity,4
...,...,...,...,...,...
6826,yellow-wattled lapwing,vocalization,"a5d8944eebdf453785ae4bbec9ba0309,a5d8944eebdf4...",contextual proximity,2
6827,yellowish feet (golden slippers),breeding adult little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",contextual proximity,2
6828,yellowish feet (golden slippers),little egret,"a28314cd80e24dc8bda647596c9aefdb,a28314cd80e24...",contextual proximity,10
6829,zebra-striped wings,eurasian hoopoe,"e30e2d93a6194996b9c06e4aef6473d9,e30e2d93a6194...",contextual proximity,10


## Calculate the NetworkX Graph

In [47]:
# Every distinct node name that appears on either end of an edge in dfg
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(1334,)

In [48]:
import networkx as nx
# Undirected graph: one vertex per distinct node name, one edge per row of dfg
G = nx.Graph()

## Register every node up front (names are coerced to str)
G.add_nodes_from(str(name) for name in nodes)

## Each dfg row becomes an edge: the edge text is stored as `title` and the
## weight is the row's count divided by 4
for _, edge_row in dfg.iterrows():
    source, target = str(edge_row["node_1"]), str(edge_row["node_2"])
    G.add_edge(source, target, title=edge_row["edge"], weight=edge_row["count"] / 4)

### Calculate communities for coloring the nodes

In [49]:
# girvan_newman returns a generator; each next() call advances it by one
# result. The second call is the one that populates next_level_communities,
# so the order of the two next() calls must be kept.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
# Sort the members inside each community, then sort the communities themselves
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  8
[['2 long wispy head plumes', 'blackish legs', 'breeding adult little egret', 'compare with larger great and intermediate egrets, stockier cattle egret, and white morph reef herons', 'distinctive within its range', 'little egret', 'nests and roosts communally', 'occurs as singles or small loose groups', 'slender dark bill', 'small fish', 'small snow-white heron', 'spray of white plumes (aigrettes)', 'wetlands (lakes, rivers, marshes, estuaries)', 'yellowish feet (golden slippers)'], ['absence of blue', 'bluethroat', 'bluethroat (breeds)', 'bluethroat (even the dullest young females)', 'bluethroat (females lacking blue entirely)', 'bluethroat (females)', 'bluethroat (head-on view)', 'bluethroat (singing males)', 'bluethroat (variable song)', 'bluethroat (winters)', 'bold white eyebrow and throat, necklace of dark streaks, rufous on the base of the tail', 'conspicuous perch', 'dull gray (above)', 'edge areas', 'electric blue and orange', 'europe and asia into a

### Create a dataframe for community colors

In [50]:
import seaborn as sns
# Name of the seaborn palette that colors2Community passes to sns.color_palette
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Args:
        communities: Sized sequence of communities, each an iterable of node
            names.

    Returns:
        A dataframe with one row per node and the columns ``node``,
        ``color`` (hex string shared by all members of a community) and
        ``group`` (1-based position of the community in ``communities``).
    """
    ## One palette color per community, handed out in shuffled order
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)

    records = []
    for group_id, members in enumerate(communities, start=1):
        shade = hex_colors.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_id} for member in members
        )
    return pd.DataFrame(records)


# One row per node, carrying the color and group number of its community
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,2 long wispy head plumes,#91db57,1
1,blackish legs,#91db57,1
2,breeding adult little egret,#91db57,1
3,compare with larger great and intermediate egr...,#91db57,1
4,distinctive within its range,#91db57,1
...,...,...,...
1329,shy,#57db80,8
1330,silver-tipped feathers,#57db80,8
1331,tree canopy,#57db80,8
1332,two hoarse notes followed by two clear notes '...,#57db80,8


### Add colors to the graph

In [51]:
# Copy each node's community group and color onto the graph, and set its
# size to the node's degree in G.
for _, entry in colors.iterrows():
    node_name = entry["node"]
    attrs = G.nodes[node_name]
    attrs["group"] = entry["group"]
    attrs["color"] = entry["color"]
    attrs["size"] = G.degree[node_name]

In [52]:
from pyvis.network import Network

# Path of the HTML file the interactive graph is written to
# (despite the variable name, this is a file path, not a directory).
graph_output_directory = "./docs/index.html"

# Make sure the target folder exists before writing, mirroring what the CSV
# export cell does for its own output directory.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import nodes, edges and their attributes from the NetworkX graph
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
