## Setup

In [27]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder under ./data_input (alternative dataset: "cureus")
data_dir = "bookwise"
inputdirectory = Path("./data_input") / data_dir
## The output csv files are written to the folder of the same name under ./data_output
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [28]:
## Load every file found under the input directory.
## Alternatives kept for reference:
##   directory of PDFs -> loader = PyPDFDirectoryLoader(inputdirectory)
##   single PDF file   -> loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks of at most 1500 characters (measured with len),
## with 150 characters of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview a single chunk. Index 3 is shown when it exists; inputs that produce
## fewer than four chunks fall back to the last chunk instead of raising IndexError,
## and an empty result prints nothing.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.65it/s]

Number of chunks =  11
Match Percentage

85

-------------

100

Note Title The Brain's Pattern Recognition and Prediction Abilities

Page Content Summary The human brain develops pattern recognition abilities through repeated exposure Examples include a woman recognizing heart failure symptoms, military analysts identifying enemy missiles on radar Museum curators can discern authentic art from counterfeits, radiologists predict stroke areas before visible signs The brain continuously analyzes surroundings and information, noticing important details With practice, the brain can pick up on predictive cues without conscious thought

User Notes Summary The human brain is a prediction machine The brain continuously takes in surroundings and analyzes information Repeated experiences lead to noticing important details and highlighting relevant cues Practice allows picking up on predictive cues without conscious thought The brain automatically encodes lessons learned through experience





## Create a dataframe of all the chunks

In [29]:
from helpers.df_helpers import documents2Dataframe
# Turn the split chunks into a dataframe; the displayed result carries the
# `text`, `source` and `chunk_id` columns.
df = documents2Dataframe(pages)
# Sanity check on the frame's dimensions before previewing the first rows.
print(df.shape)
df.head()

(11, 3)


Unnamed: 0,text,source,chunk_id
0,Book Title: Atomic Habits\n\nBook Author: Jame...,data_input/bookwise/Atomic_Habits_notes.txt,56dd4d06693e49359645da7b2077ad70
1,Comparison Analysis The user's notes closely a...,data_input/bookwise/Atomic_Habits_notes.txt,8c781f8eb86447ad8f36be4dc492ae7a
2,User Notes Summary More probable behaviors wil...,data_input/bookwise/Atomic_Habits_notes.txt,5bb5ab7d635d4d4c8ffa9d2610e01a39
3,Match Percentage\n\n85\n\n-------------\n\n100...,data_input/bookwise/Atomic_Habits_notes.txt,23d73192f55f4200b47742bcef0314ae
4,Comparison Analysis The user's notes focus pri...,data_input/bookwise/Atomic_Habits_notes.txt,8d4e3b39690a4978815cbe2fc6fd2fed


## Extract Concepts

In [30]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to the output directory in CSV format so that we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise the edge dataframe is read back from the output directory.

In [31]:
## To regenerate the graph with the LLM keep this True;
## set it to False to reuse the graph.csv written by a previous run.
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True removes the check-then-create race of a separate
    ## os.path.exists() test and is a no-op when the folder is already there.
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing values, drop every edge that lacks an endpoint
## or an edge description, and give each remaining relation a weight of 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
## Built as one non-mutating expression so re-running the cell is idempotent.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
      "node_1": "Atomic Habits",
      "node_2": "Habit Formation",
      "edge": "The author relied on small habits to overcome injury and achieve success in various areas of life. The book offers a step-by-step plan for building better habits for a lifetime, focusing on the science of habit creation and change. This book draws from biology, neuroscience, philosophy, and psychology, presenting a synthesis of established ideas and recent scientific discoveries. The core of the book is a four-step model of habits: cue, craving, response, and reward."
   },
   {
      "node_1": "Atomic Habits",
      "node_2": "James Clear",
      "edge": "The author of 'Atomic Habits' is James Clear."
   },
   {
      "node_1": "Habit Formation",
      "node_2": "Small Habits",
      "edge": "The author relied on small habits to overcome injury and achieve success in various areas of life, as mentioned in 'Introduction to Habit Formation'."
   },
   {
      "node_1": "Habit Formation",
      "nod

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,atomic habits,habit formation,The author relied on small habits to overcome ...,56dd4d06693e49359645da7b2077ad70,4
1,atomic habits,james clear,The author of 'Atomic Habits' is James Clear.,56dd4d06693e49359645da7b2077ad70,4
2,habit formation,small habits,The author relied on small habits to overcome ...,56dd4d06693e49359645da7b2077ad70,4
3,habit formation,behavior change,The book offers a step-by-step plan for buildi...,56dd4d06693e49359645da7b2077ad70,4
4,skinner's operant conditioning,"cue, craving, response, reward",The author's model builds upon B.F. Skinner's ...,56dd4d06693e49359645da7b2077ad70,4


## Calculating contextual proximity

In [32]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build "contextual proximity" edges between nodes that share a text chunk.

    Every node mentioned in ``node_1`` or ``node_2`` is paired with every other
    node mentioned for the same ``chunk_id`` (a self join on ``chunk_id``).
    Pairs are ordered, so both ``(a, b)`` and ``(b, a)`` are produced.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list with at least the columns ``chunk_id``, ``node_1`` and
        ``node_2``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined chunk ids of
        all joined rows for the pair), ``count`` (number of joined rows for the
        pair) and ``edge`` (always ``"contextual proximity"``). Pairs whose
        count is exactly 1 are dropped. Index labels are those left over after
        filtering (not renumbered).
    """
    # Melt the edge list into one row per (chunk_id, node) mention.
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join with chunk id as the key links terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))

    # Drop self loops.
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)

    # Group and count edges, then discard pairs with an empty-string endpoint.
    proximity = (
        pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
        .replace("", np.nan)
        .dropna(subset=["node_1", "node_2"])
    )

    # Drop edges with 1 count. `assign` returns a new frame, so the label column
    # is never written onto a filtered slice (avoids SettingWithCopyWarning).
    return proximity[proximity["count"] != 1].assign(edge="contextual proximity")


# Derive the co-occurrence ("contextual proximity") edges from the extracted edge list.
dfg2 = contextual_proximity(dfg1)
# Preview the last few rows of the result.
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
618,watching sports,premack's principle,"5bb5ab7d635d4d4c8ffa9d2610e01a39,5bb5ab7d635d4...",2,contextual proximity
619,watching sports,temptation bundling,"5bb5ab7d635d4d4c8ffa9d2610e01a39,5bb5ab7d635d4...",2,contextual proximity
623,what progress is really like,current state,"fda470fcb3ad4bb69df2e65fd574b53d,fda470fcb3ad4...",2,contextual proximity
624,what progress is really like,desired state,"fda470fcb3ad4bb69df2e65fd574b53d,fda470fcb3ad4...",2,contextual proximity
630,what progress is really like,predictions,"fda470fcb3ad4bb69df2e65fd574b53d,fda470fcb3ad4...",2,contextual proximity


### Merge both the dataframes

In [33]:
## Stack the extracted edges on top of the proximity edges, then collapse rows
## that share the same (node_1, node_2) pair: chunk ids and edge texts are
## comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ",".join),
        count=("count", "sum"),
    )
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,action,dopamine spike,"d256bd3e97ac4c90baa927e04c8a1350,d256bd3e97ac4...",contextual proximity,2
1,actions,breakthrough moments,"9a70f5b9874749b0bd855a1d99773256,9a70f5b987474...",Breakthrough moments are the result of many pr...,7
2,actions,changes,"9a70f5b9874749b0bd855a1d99773256,9a70f5b987474...",contextual proximity,2
3,atomic habits,behavior change,"56dd4d06693e49359645da7b2077ad70,56dd4d06693e4...",contextual proximity,2
4,atomic habits,"cue, craving, response, reward","56dd4d06693e49359645da7b2077ad70,56dd4d06693e4...",contextual proximity,2
...,...,...,...,...,...
242,watching sports,temptation bundling,"5bb5ab7d635d4d4c8ffa9d2610e01a39,5bb5ab7d635d4...",contextual proximity,2
243,what progress is really like,current state,"fda470fcb3ad4bb69df2e65fd574b53d,fda470fcb3ad4...",contextual proximity,2
244,what progress is really like,desired state,"fda470fcb3ad4bb69df2e65fd574b53d,fda470fcb3ad4...",contextual proximity,2
245,what progress is really like,note title,fda470fcb3ad4bb69df2e65fd574b53d,The user's notes capture key concepts such as ...,4


## Calculate the NetworkX Graph

In [34]:
# Collect every distinct node label that appears on either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
# Number of distinct nodes.
nodes.shape

(66,)

In [35]:
import networkx as nx

## Undirected graph with one vertex per distinct node label
G = nx.Graph()
G.add_nodes_from(str(label) for label in nodes)

## One edge per merged row: the joined edge text becomes the edge title and the
## summed count, divided by 4, becomes the edge weight.
for source, target, description, total in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=description, weight=total / 4)

### Calculate communities for coloring the nodes

In [36]:
# Per the networkx documentation, girvan_newman returns an iterator of community
# partitions, one per level of the algorithm; each next() call advances one level.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
# The second level is the one used for coloring below.
# NOTE(review): next() raises StopIteration if the iterator has no further level.
next_level_communities = next(communities_generator)
# Sort the members of each community, then the communities themselves, so the
# resulting list of lists has a deterministic order.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  8
[['action', 'desire', 'dopamine spike'], ['actions', 'breakthrough moments', 'changes', 'frustrating period', 'major changes', 'most powerful outcomes'], ['atomic habits', 'automatic encoding', 'behavior change', 'body', 'brain', 'consistent rituals', 'craving', 'cue, craving, response, reward', 'cue, routine, reward', 'deep focus', 'dopamine', "duhigg's habit formation model", 'habit formation', 'human brain', 'importance of persistence through seemingly ineffective periods', 'james clear', 'nonconsciously', 'pattern recognition', 'pattern recognition abilities', 'peak performance', 'psychological challenges involved', "skinner's operant conditioning", 'small habits', 'start with showing up consistently before focusing on perfecting habits', 'two-minute rule'], ['behavior', 'comparison analysis', 'concrete examples', 'cues', 'current state', 'desired state', 'desired states', 'events', 'feelings', 'habits', 'interpretations', 'match percentage', 'note title'

### Create a dataframe for community colors

In [37]:
import seaborn as sns
# Name of the seaborn palette that colors2Community draws the community colors from.
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every node of every community.

    A palette with ``len(communities)`` hex colors is generated from the
    module-level ``palette`` name and shuffled with the global ``random`` state
    (unseeded here, so the color-to-community mapping can differ between runs).
    Communities are numbered from 1 in the order given.

    Returns a dataframe with one row per node and the columns ``node``,
    ``color`` and ``group``.
    """
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)

    records = []
    for group_id, members in enumerate(communities, start=1):
        # Colors are consumed from the end of the shuffled palette.
        shade = shades.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_id} for member in members
        )
    return pd.DataFrame(records)


# Build the per-node color/group lookup table for the detected communities.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,action,#57d3db,1
1,desire,#57d3db,1
2,dopamine spike,#57d3db,1
3,actions,#91db57,2
4,breakthrough moments,#91db57,2
...,...,...,...
61,premack's principle,#db57b2,7
62,temptation bundling,#db57b2,7
63,watching sports,#db57b2,7
64,habit research,#57db80,8


### Add colors to the graph

In [38]:
## Copy group and color onto each graph node and size the node by its degree
for entry in colors.itertuples(index=False):
    attributes = G.nodes[entry.node]
    attributes['group'] = entry.group
    attributes['color'] = entry.color
    attributes['size'] = G.degree[entry.node]

In [39]:
from pyvis.network import Network

# Despite the name, this is the path of the HTML file the graph is written to.
# NOTE(review): presumably the ./docs folder must already exist - confirm.
graph_output_directory = "./docs/index.html"

# Canvas configuration for the interactive graph; the commented-out options are
# an alternative dark color scheme.
net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Import nodes, edges and their attributes (group, color, size, title, weight)
# from the NetworkX graph built above.
net.from_nx(G)
# Layout: force_atlas_2based is the active choice; the repulsion and barnes_hut
# calls are kept as commented-out alternatives.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Expose only the physics controls in the rendered page.
net.show_buttons(filter_=["physics"])

# Write the HTML file to the path above (the path is echoed in the cell output).
net.show(graph_output_directory, notebook=False)

./docs/index.html


python(85390) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
