Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Knowledge Graph Pipeline


## Contributors:
Tom Hargrove
Carl Koster
Hantao Lin
Allen Wang

## Description

In [None]:
%pip install --upgrade --quiet  langchain langchain-community langchain-openai langchain-experimental neo4j

In [3]:
import os
import time
import random
import logging
from datetime import datetime
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain_core.documents import Document
from dotenv import load_dotenv

# Load environment variables from a .env file.
# This runs at module level, before init_env_vars() looks the credentials up in os.environ.
load_dotenv()

# Initialize environment variables
def init_env_vars():
    """Verify that every environment variable the pipeline needs is set.

    The values themselves are left untouched in ``os.environ``; this function
    only checks that they are present so a missing credential is reported up
    front with its name, instead of failing later with an unrelated error.

    Raises:
        RuntimeError: If one or more required variables are absent. The
            message lists every missing name.
    """
    required_vars = (
        "OPENAI_API_KEY",
        "NEO4J_URI",
        "NEO4J_USERNAME",
        "NEO4J_PASSWORD",
        "NEO4J_DB",
    )

    # Collect all missing names first so the user can fix them in one pass.
    missing = [name for name in required_vars if name not in os.environ]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )
    
# Passed to process_random_files() by main() as the sample size.
num_files_to_process = 1  # Set the number of files to sample

# Input folder and log location. Both are absolute, machine-specific Windows paths.
folder_path = r'b:\OneDrive\Documents\GitHub\EA-Knowledge-Bot\Final Deliverables\Code\SUMMARIES'
log_dir = r'b:\OneDrive\Documents\GitHub\EA-Knowledge-Bot\Final Deliverables\Code\LOGS'
# Text log that log_processed_file() appends to and has_been_processed() reads.
log_file = os.path.join(log_dir, 'processed_files_log.txt')
# Runs when this cell/module is executed, not inside a function.
os.makedirs(log_dir, exist_ok=True)  # Ensure the log directory exists

# TEXT

In [4]:


def has_been_processed(file_path, log_file):
    """Return True if the log records a successful run for ``file_path``.

    Log lines are written by ``log_processed_file`` in the form
    ``timestamp, seconds, file name, NER, status``. Only the file's base name
    is stored there, so the lookup compares against
    ``os.path.basename(file_path)``; an entry holding the full path is
    accepted as well.

    Args:
        file_path: Path of the candidate input file.
        log_file: Path of the processing log. It may not exist yet.

    Returns:
        True when a log line for this file has the exact status ``SUCCESS``,
        otherwise False (including when the log file does not exist).
    """
    if not os.path.exists(log_file):
        return False

    file_name = os.path.basename(file_path)

    with open(log_file, 'r') as f:
        for line in f:
            fields = line.rstrip("\r\n").split(", ")
            # Malformed lines and non-successful runs never count as processed.
            if len(fields) < 5 or fields[-1] != "SUCCESS":
                continue
            # A file name may itself contain ", ", so rebuild it from every
            # field between the two leading and the two trailing columns.
            logged_name = ", ".join(fields[2:-2])
            if logged_name in (file_name, file_path):
                return True
    return False

from datetime import datetime

def log_processed_file(file_path, status, log_file, processing_time):
    """Append one comma-separated record about a processed file to the log.

    The record has the columns: current local timestamp, processing time with
    two decimals, base name of ``file_path``, the literal tag ``NER``, and
    ``status``.

    Args:
        file_path: Path of the file that was handled; only its base name is logged.
        status: Outcome label written as the last column.
        log_file: Path of the log file, opened in append mode.
        processing_time: Elapsed seconds, formatted with ``:.2f``.
    """
    columns = (
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        f"{processing_time:.2f}",
        f"{os.path.basename(file_path)}",
        "NER",
        f"{status}",
    )
    record = ", ".join(columns) + "\n"

    with open(log_file, 'a') as log_handle:
        log_handle.write(record)

# Initialize language model
def init_llm():
    """Build the chat model that main() hands to the graph transformer.

    Returns:
        A ``ChatOpenAI`` instance configured for "gpt-3.5-turbo" with
        temperature 0.
    """
    llm_settings = {"temperature": 0, "model_name": "gpt-3.5-turbo"}
    return ChatOpenAI(**llm_settings)

# Initialize Neo4j graph connection
def init_neo4j_graph():
    """Try to open the Neo4j graph connection without aborting on failure.

    ``Neo4jGraph`` is constructed with no arguments; presumably it picks up
    the NEO4J_* environment variables -- confirm against the library docs.

    Returns:
        The ``Neo4jGraph`` instance, or None when construction raised. In
        that case the error text is printed.
    """
    connection = None
    try:
        connection = Neo4jGraph()
    except Exception as connect_error:
        print(f"Error connecting to Neo4j: {str(connect_error)}")
    return connection

# Load and process a file
def process_file(file_path, log_file):
    """Read a UTF-8 text file into a ``Document`` and log the outcome.

    Args:
        file_path: Path of the text file to load.
        log_file: Path of the processing log passed to ``log_processed_file``.

    Returns:
        A ``Document`` whose ``page_content`` is the file's text, or None if
        reading the file or building the document failed.

    NOTE(review): SUCCESS is recorded as soon as the file has been read, which
    is before main() runs graph extraction on the returned document. A later
    extraction failure does not change this log entry.
    """
    # Started outside the try so the except branch can always compute a duration.
    start_time = time.time()
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        document = Document(page_content=text)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch,
        # but the reason is reported instead of being silently dropped.
        print(f"Error reading file {file_path}: {str(e)}")
        log_processed_file(file_path, "FAILED", log_file, time.time() - start_time)
        return None

    # Logged outside the try so a logging error is not misreported as a
    # failure to read the file.
    log_processed_file(file_path, "SUCCESS", log_file, time.time() - start_time)
    return document


def process_random_files(folder_path, num_files, log_file):
    """Load a random sample of not-yet-processed files from a folder.

    Already-processed files are removed *before* sampling, so the sample is
    drawn only from files that still need work. Sampling first and skipping
    afterwards would let processed files use up sample slots and could return
    nothing while unprocessed files remain.

    Args:
        folder_path: Directory containing the input files. Sub-directories
            are ignored.
        num_files: Maximum number of files to load in this call.
        log_file: Path of the processing log used to detect earlier runs and
            to record this one.

    Returns:
        A list of the documents that ``process_file`` loaded successfully.
        It holds at most ``num_files`` items.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"The directory '{folder_path}' does not exist.")

    pending_paths = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue
        if has_been_processed(file_path, log_file):
            print(f"Skipping file: {file_name}")
            continue
        pending_paths.append(file_path)

    # Never ask for more files than are actually pending.
    sample_size = min(num_files, len(pending_paths))

    documents = []
    for file_path in random.sample(pending_paths, k=sample_size):
        document = process_file(file_path, log_file)
        if document:
            documents.append(document)

    return documents

def main():
    """Run the pipeline: load sampled files, extract graphs, store and show them.

    Steps:
        1. Initialise the environment, the chat model and the Neo4j connection.
        2. Load up to ``num_files_to_process`` documents from ``folder_path``.
        3. Convert each document to a graph document with ``LLMGraphTransformer``,
           print its nodes and relationships, and add it to Neo4j when a
           connection is available.
        4. Print the graph object and pass it to ``visualize``.

    Errors in steps 2-4 are printed rather than raised. A failure on one
    document does not stop the remaining documents.
    """
    init_env_vars()
    llm = init_llm()
    graph = init_neo4j_graph()

    try:
        processed_documents = process_random_files(folder_path, num_files_to_process, log_file)

        # The transformer depends only on the model, not on the document, so
        # it is built once and reused for every document.
        transformer = LLMGraphTransformer(llm=llm)

        for document in processed_documents:
            try:
                graph_document = transformer.convert_to_graph_documents([document])[0]
                print(f"Nodes: {graph_document.nodes}")
                print(f"Relationships: {graph_document.relationships}")

                # Add graph document to Neo4j if the connection was successful
                if graph is not None:
                    graph.add_graph_documents([graph_document])
            except Exception as e:
                print(f"Error processing document: {document.page_content[:30]}... : {str(e)}")
    except ModuleNotFoundError as e:
        print(f"Module error: {str(e)}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

    try:
        print(graph)
        # NOTE(review): `visualize` is not defined in the visible part of this
        # notebook. If it is missing, the resulting NameError is caught and
        # printed by the handler below -- confirm it is defined in another cell.
        visualize(graph)  # Visualize the graph using the graph object
    except Exception as e:
        print(f"Error connecting to Neo4j or visualizing graph: {str(e)}")

# Run the pipeline only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Nodes: [Node(id='Confidentiality', type='Goal'), Node(id='Integrity', type='Goal'), Node(id='Availability', type='Goal'), Node(id='Privacy', type='Concept'), Node(id='Safety', type='Concept'), Node(id='Reliability', type='Concept'), Node(id='Survivability', type='Concept'), Node(id='Cyber Risk', type='Concept'), Node(id='Primer', type='Concept'), Node(id='Cyber Physical Systems Security', type='Concept'), Node(id='Approaches', type='Concept'), Node(id='Business Continuity Management', type='Concept'), Node(id='Organizational Resilience', type='Concept'), Node(id='Corporate Security', type='Concept'), Node(id='Risk Leaders', type='Concept'), Node(id='Cross Functional Collaboration', type='Concept'), Node(id='Decision Making', type='Concept'), Node(id='Resilience', type='Concept'), Node(id='Organization', type='Concept'), Node(id='Enterprise System', type='Concept'), Node(id='Processes', type='Concept'), Node(id='Information', type='Concept'), Node(id='Risk Practitioners', type='Concept'