## 0. Configuration 

In [43]:
## Load packages

import asyncio
import glob
import json
import os
from string import Template
from time import sleep
from timeit import default_timer as timer

import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase
from openai import AsyncOpenAI
from sqlalchemy import create_engine
from tqdm import tqdm

# Patch asyncio to allow nested event loops, so loop.run_until_complete(...)
# further down can be called from inside the notebook's already-running loop.
nest_asyncio.apply()

In [53]:
## OpenAI API credentials
# Load variables from a .env file first, then read the key from the
# environment. The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [34]:
## Neo4j credentials
# Connection settings are read from the environment; each one is None when
# its variable is unset.
neo4j_url, neo4j_user, neo4j_password = (
    os.environ.get(key)
    for key in ("NEO4J_URL", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)

# Driver object used to talk to the Neo4j database.
gds = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))

In [56]:
## Helper Functions

## Function to call the OpenAI API

async def process_gpt(file_prompt, system_msg):
    """Send one system/user message pair to the chat completions API.

    Args:
        file_prompt: Fully rendered user prompt for a single document.
        system_msg: System message placed before the user prompt.

    Returns:
        The message content of the first completion choice.
    """
    # A fresh client is created for every call. Using it as an async context
    # manager closes its HTTP resources as soon as the request is done, so a
    # long run over many documents does not accumulate open connections.
    async with AsyncOpenAI(api_key=openai_api_key) as client:
        completion = await client.chat.completions.create(
            model='gpt-4o-mini',
            max_tokens=15000,
            temperature=0,
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": file_prompt},
            ],
        )
    return completion.choices[0].message.content

## Function to take a collection of document texts and a prompt template, and return the raw model output for each document

async def extract_entities_relationships(content, prompt_template):
    """Run the extraction prompt over every document, one request at a time.

    Args:
        content: Sized iterable of document texts (for example a list or a
            pandas Series); it must support len() and iteration.
        prompt_template: Source text for a string.Template; it is rendered
            once per document with ``ctext`` set to the document text.

    Returns:
        A list with one process_gpt result per document, in input order.

    Raises:
        KeyError / ValueError: from Template.substitute when the template
            contains a placeholder other than ``ctext`` or an invalid ``$``.
    """
    start = timer()
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(content)} files")

    # The template text is the same for every document, so build it once.
    template = Template(prompt_template)

    results = []
    # Iterate over the values themselves instead of content[i]: positional
    # lookup by integer is label-based on a pandas Series and fails when the
    # index is not 0..n-1 (e.g. after filtering rows).
    for i, text in enumerate(content):
        print(f"Extracting entities and relationships for file number: {i}")
        prompt = template.substitute(ctext=text)
        result = await process_gpt(prompt, system_msg)
        results.append(result)

    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results

    

## 1. Extract data from the existing database

In [9]:
# Load credentials
# PostgreSQL connection settings, read from the environment.
# Every value is None when its variable is not set.
db_username = os.environ.get("POSTGRES_USERNAME")
db_password = os.environ.get("POSTGRES_PASSWORD")
db_host = os.environ.get("POSTGRES_HOST")
db_port = os.environ.get("POSTGRES_PORT")
db_name = os.environ.get("POSTGRES_DATABASE")

# Full connection URL, kept separately from the individual parts above.
DATABASE_URL = os.environ.get("DATABASE_URL")

In [15]:
# Load the "sql" IPython extension; the %sql line magic used below comes from it.
%load_ext sql


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [16]:
df = %sql SELECT * FROM lse_doc # Save all rows and columns to df 
df = df.DataFrame() #Turn it into a pandas dataframe 

In [46]:
df_content = df["content"]  # Keep only the "content" column of df in its own variable

## 2. Prompt engineering for entity and relationship extraction

In [22]:
# department_process_prompt handles academics-related entities and relationships.
# It is a string.Template source: $ctext is replaced with the document text.
# The head entity of the research-project relationships is spelled
# "research_project" everywhere so the model sees a single name for that type.

department_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Course', id:string, name:string, course_code:string, prerequisites:string, semester:string // Academic courses taught at the London School of Economics; 'id' property is the name of the course, in lowercase and camel-case
    label: 'Faculty', id:string, name:string // Professors, assistant professors, teachers, fellows, tutors, directors, lecturers, and chairs associated with the London School of Economics; 'id' property is the full-name of the faculty member, in lowercase and camel-case
    label: 'Event', id:string, name:string, description:string, date:string // Seminars, workshops, talks, and other events involving the London School of Economics; 'id' property is the name of the event, in lowercase and camel-case
    label: 'Research Project', id:string, name:string, description:string // Research projects led by faculty associated with the London School of Economics; 'id' property is the name of the research project, in lowercase and camel_case


2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    faculty[IS_MEMBER_OF]department
    event[IS_HOSTED_BY]faculty
    event[IS_RUN_BY]department
    course[IS_RUN_BY]department
    course[IS_TAUGHT_BY]faculty
    research_project[IS_LED_BY]faculty
    research_project[IS_ASSOCIATED_WITH]department

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.

The output should look like:
{
    "entities": [{"label": "Department", "id": string, "name": string}],
    "relationships": ["faculty[IS_MEMBER_OF]department", "event[IS_RUN_BY]department", "course[IS_RUN_BY]department", "research_project[IS_ASSOCIATED_WITH]department"]
}

Case Sheet:
$ctext
'''

# administration_process_prompt handles bureaucratic and regulatory entities and relationships.
# It is a string.Template source: $ctext is replaced with the document text.
# NOTE(review): the "Committee" description still says 'referred to as
# "Department of ____"', which looks copied from the department prompt — confirm
# whether that wording is intended before relying on Committee extraction.

administration_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: "Policy", id: string, name:string, summary:string // Policy related to administrative procedure at the London School of Economics; 'id' property is the name of the policy, in lowercase and camel_case
    label: "Procedure", id:string, content:string, summary:string // Procedure associated with certain policies established at the London School of Economics; 'id' property is the first four words of the summary, in lowercase and camel_case
    label: "Committee", id:string, name: string // Regulatory organisation or departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the committee, in lowercase and camel-case
    label: "Group", id:string, name:string // Individuals or organisations; 'id' property is the name of the person, in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    group[IS_AFFECTED_BY]policy
    procedure[IS_ASSOCIATED_WITH]policy
    policy[IS_ENFORCED_BY]committee

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.
    
The output should look like:
{ "entities": [{"label": "Policy", "id": string, "name": string}],
    "relationships": ["group[IS_AFFECTED_BY]policy", "procedure[IS_ASSOCIATED_WITH]policy"]
}

Case Sheet: $ctext
'''

# TODO: institutions_process_prompt will handle institutions at the LSE (research facilities, summer school, careers centre, alumni centre). Not written yet.

#institutions_process_prompt = 
# TODO: research_process_prompt is also still to be written.
#research_process_prompt = 



In [59]:
async def main():
    """Extract department-related entities from every document and save them.

    Reads the "content" column of df, sends each document through
    extract_entities_relationships with department_process_prompt, and writes
    the list of raw results to academic_data.json.
    """
    documents = df["content"].to_list()  # plain Python list of the column values

    # Use the module-level department_process_prompt as-is. Rebinding it here
    # to a placeholder string would send the same fixed text for every file,
    # because a template without $ctext never receives the document content.
    result = await extract_entities_relationships(documents, department_process_prompt)

    with open("academic_data.json", "w") as f:
        json.dump(result, f)

# Run the main function on the current event loop.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Running pipeline for 21521 files
Extracting entities and relationships for file number: 0
Extracting entities and relationships for file number: 1
Extracting entities and relationships for file number: 2
Extracting entities and relationships for file number: 3
Extracting entities and relationships for file number: 4
Extracting entities and relationships for file number: 5
Extracting entities and relationships for file number: 6
Extracting entities and relationships for file number: 7
Extracting entities and relationships for file number: 8
Extracting entities and relationships for file number: 9
Extracting entities and relationships for file number: 10


KeyboardInterrupt: 

Extracting entities and relationships for file number: 11
Extracting entities and relationships for file number: 12
Extracting entities and relationships for file number: 13
Extracting entities and relationships for file number: 14
Extracting entities and relationships for file number: 15


## 3. Neo4j Cypher Generation