In [None]:
import os

from neo4j import GraphDatabase
from dotenv import load_dotenv
load_dotenv()

In [None]:
def _process_node(record, node_name):
    """Flatten one node of a query record into a plain dict.

    Args:
        record: Mapping-like query record; ``record[node_name]`` must expose
            ``element_id`` and ``labels`` and be convertible with ``dict()``.
        node_name: Key of the node inside ``record`` (e.g. ``'n'``).

    Returns:
        dict with ``"elem_id"``, ``"type"`` and the node's own properties.
        ``"type"`` is the alphabetically first label, or ``None`` when the
        node carries no label at all.
    """
    node = record[node_name]
    # Labels come back as an unordered collection, so "first element" is not
    # stable for multi-label nodes and does not exist for unlabelled ones.
    # Sorting makes the choice reproducible; the guard avoids an IndexError.
    labels = sorted(node.labels)
    return {
        "elem_id": node.element_id,
        "type": labels[0] if labels else None,
        # Properties are unpacked last, so a property that happens to be named
        # "elem_id" or "type" overrides the metadata entry above.
        **dict(node),
    }

def _process_edge(record, edge_name):
    """Flatten one relationship of a query record into a plain dict.

    Args:
        record: Mapping-like query record; ``record[edge_name]`` must expose
            ``element_id`` and ``type`` and be convertible with ``dict()``.
        edge_name: Key of the relationship inside ``record`` (e.g. ``'r'``).

    Returns:
        dict with ``"elem_id"``, ``"type"`` and the relationship's properties.
    """
    edge = record[edge_name]
    flattened = {"elem_id": edge.element_id, "type": edge.type}
    # Properties are merged last, so a property named "elem_id" or "type"
    # replaces the metadata value set above.
    flattened.update(dict(edge))
    return flattened

def process_results(result):
    """Convert query records into a list of plain-dict triples.

    Args:
        result: Iterable of records, each holding nodes under ``'n'`` and
            ``'m'`` and a relationship under ``'r'``.

    Returns:
        list of dicts with the keys ``"node1"`` (from ``'n'``), ``"node2"``
        (from ``'m'``) and ``"relationship"`` (from ``'r'``), one per record.
    """
    triples = []
    for rec in result:
        triples.append({
            "node1": _process_node(rec, 'n'),
            "node2": _process_node(rec, 'm'),
            "relationship": _process_edge(rec, 'r'),
        })
    return triples

def get_all_nodes_and_relationships(tx):
    """Run the "every directed relationship" query and materialise its records.

    Args:
        tx: Object with a ``run(query)`` method returning an iterable of
            records (used here as the transaction passed by ``execute_read``).

    Returns:
        list of all records yielded by the query, each exposing ``n``, ``r``
        and ``m``.
    """
    query = "MATCH (n)-[r]->(m) RETURN n,r,m"
    # Consume the result into a list before returning it to the caller.
    return list(tx.run(query))

In [None]:
# Neo4j AuraDB connection settings: the user name is fixed, while the URI and
# password are read from the environment (None when the variable is unset).
username = 'neo4j'
uri = os.environ.get("NEO4J_URI")
password = os.environ.get("NEO4J_KEY")

In [None]:
# Connect to Neo4j AuraDB, fetch every (n)-[r]->(m) triple and flatten it.
# Both the driver and the session are context-managed, so they are closed even
# when the query or the post-processing raises.
with GraphDatabase.driver(uri, auth=(username, password)) as driver:
    with driver.session() as session:
        # Keep the raw records under their own name; `nodes` only ever holds
        # the processed list of dict triples.
        records = session.execute_read(get_all_nodes_and_relationships)
        nodes = process_results(records)

In [None]:
# Number of (n)-[r]->(m) triples returned by the query.
len(nodes)

In [None]:
# Spot-check the first triple: flattened source node (query variable n).
print(nodes[0]['node1'])

In [None]:
# Spot-check the first triple: flattened target node (query variable m).
print(nodes[0]['node2'])

In [None]:
# Spot-check the first triple: flattened relationship (query variable r).
print(nodes[0]['relationship'])

In [None]:
import plotly as px


In [None]:
# Pool plotly's qualitative palettes into one long list of candidate colours.
colors = (
    px.colors.qualitative.T10 +
    px.colors.qualitative.Plotly +
    px.colors.qualitative.Alphabet +
    px.colors.qualitative.Bold +
    px.colors.qualitative.Pastel +
    px.colors.qualitative.Prism +
    px.colors.qualitative.Safe +
    px.colors.qualitative.Vivid +
    px.colors.qualitative.Light24 +
    px.colors.qualitative.Dark24
)
# Drop repeated colours while keeping first-seen order (a plain set() would
# lose the ordering). dict keys are unique and insertion-ordered, so this is a
# single pass rather than a list scan per colour.
unique_colors = list(dict.fromkeys(colors))

In [None]:
# Show the de-duplicated colour list.
unique_colors

In [None]:
from pathlib import Path
import pandas as pd

# Root directory (relative to the working directory) that is searched
# recursively for page text files and page images further down.
path = Path('data')

In [None]:
# Index every text file found under `path`, one row per file.
df = pd.Series([str(p) for p in path.rglob('*.txt')], name='Path').to_frame()
print(df.shape)

# Split each path once with pathlib instead of on a literal '/', so the
# component lookup also works where the OS separator is a backslash.
# .str[i] yields NaN for paths that are too short to have component i.
path_parts = df['Path'].map(lambda p: Path(p).parts)
df = df.assign(
    DocumentName=path_parts.str[1],   # directory directly below the root
    DateProcessed=path_parts.str[3],  # fourth path component
    PageName=path_parts.str[-1],      # file name
)

# 'complete.txt' files are not individual pages; leave them out.
df = df.loc[df['PageName'] != 'complete.txt']
print(df.shape)

# Keep a single row per (document, page file name); the first one found wins.
df = df.drop_duplicates(subset=['DocumentName', 'PageName'])
print(df.shape)

# Page id = file name up to the first dot (e.g. '0000.txt' -> '0000').
df = df.assign(PageId=df['PageName'].str.split('.').str[0])
df

In [None]:
# Version of the pandas module actually imported by this kernel. Pure Python,
# so it does not depend on a POSIX shell/grep or on which environment `pip`
# on the PATH belongs to.
pd.__version__

In [None]:
# Index every PNG found under `path`, one row per file.
df_images = pd.Series([str(p) for p in path.rglob('*.png')], name='ImagePath').to_frame()

# Split each path once with pathlib instead of on a literal '/', so the
# component lookup also works where the OS separator is a backslash.
image_parts = df_images['ImagePath'].map(lambda p: Path(p).parts)
image_names = image_parts.str[-1]
df_images = df_images.assign(
    DocumentName=image_parts.str[1],            # directory directly below the root
    ImageName=image_names,                      # file name
    ImageId=image_names.str.split('.').str[0],  # file name up to the first dot
)

# Exclude images whose file name contains '.bin' or '.nrm'. regex=False makes
# the dot literal; as a regex, '.bin' would also match names such as
# 'cabin.png' because '.' matches any character.
df_images = df_images.loc[~df_images['ImageName'].str.contains('.bin', regex=False)]
df_images = df_images.loc[~df_images['ImageName'].str.contains('.nrm', regex=False)]

# Keep a single row per (document, image file name); the first one found wins.
df_images = df_images.drop_duplicates(subset=['DocumentName', 'ImageName'])
df_images


In [None]:
# Attach the matching image (same document, image id equal to the page id) to
# every page; the left join keeps pages that have no image.
df = df.merge(
    df_images,
    how='left',
    left_on=['DocumentName', 'PageId'],
    right_on=['DocumentName', 'ImageId'],
)

In [None]:
# Row/column count after attaching the image columns.
df.shape

In [None]:
# Missing values per column; after the left merge, NaN in the image columns
# marks pages for which no image row matched.
df.isna().sum()

In [None]:
from src.description_generation.describer import OpenAIDescriber

In [None]:
# First row of the merged frame. Note that `row` is reassigned in the next cell.
row = df.iloc[0]

In [None]:
# Use the first row whose image file is named "0000.png" as the sample page.
first_image_rows = df.query('ImageName == "0000.png"')
row = first_image_rows.iloc[0]

In [None]:
# Describe the sample page: the describer receives the page's text-file path
# and its image path and returns a (response, usage) pair.
describer = OpenAIDescriber()
page_inputs = (row['Path'], row['ImagePath'])
resp, usage = describer.describe(*page_inputs)

In [None]:
# First value returned by describer.describe.
resp

In [None]:
# Second value returned by describer.describe.
usage

In [None]:
# The row whose Path and ImagePath were passed to the describer.
row

In [None]:
import sys 
# Put the relative 'src' directory on the import path. Guarded so that
# re-running this cell does not add the same entry again.
if 'src' not in sys.path:
    sys.path.append('src')

from src.onboard_graph import load_data

In [None]:
# Load data through the project's load_data helper. 'src/../data' normally
# resolves to the same 'data' directory scanned earlier in this notebook.
# NOTE(review): `data` is rebound by pd.read_csv a few cells below — confirm
# whether this call is still needed.
data = load_data('src/../data')

In [None]:
# Re-check the shape of df before combining it with `data`.
df.shape

In [None]:
# Read src/desc_backup.csv into `data` (replacing the earlier value of that
# name) and show its shape.
# NOTE(review): presumably a backup of previously generated descriptions — confirm.
data = pd.read_csv('src/desc_backup.csv')
data.shape


In [None]:
# Put the columns of `data` next to those of `df`. With axis=1 the rows are
# aligned on the index, not on a key column; index values present in only one
# of the frames produce NaN-filled cells.
# NOTE(review): this assumes `data` has the same row order and length as `df` —
# confirm, or join on an explicit key instead.
# NOTE(review): re-running this cell appends the columns of `data` a second time.
df = pd.concat([df, data], axis=1)

In [None]:
# Select the description of page "0000" among the rows whose path contains
# '04 05 2022'. squeeze() returns a plain scalar only when exactly one row
# matches; otherwise `s` remains a Series.
in_target_run = df['Path'].str.contains('04 05 2022')
first_page_rows = df.loc[in_target_run].query('PageId == "0000"')
s = first_page_rows['Description'].squeeze()

In [None]:
# Print the description without its Markdown code-fence markers; the longer
# '```plaintext' marker is removed first, then any remaining bare fence.
without_fences = s.replace('```plaintext', '').replace('```', '')
print(without_fences)