## 0. Configuration 

In [3]:
## Load packages

from openai import AsyncOpenAI
import pandas as pd
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from tqdm import tqdm
from string import Template
import json 
from neo4j import GraphDatabase
import glob 
from timeit import default_timer as timer 
from time import sleep 
import nest_asyncio

nest_asyncio.apply()

In [53]:
## OpenAI API credentials
load_dotenv()

openai_api_key= os.getenv('OPENAI_API_KEY')

In [34]:
## Neo4j credentials

neo4j_url = os.getenv("NEO4J_URL")
neo4j_user = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
gds = GraphDatabase.driver(neo4j_url, auth = (neo4j_user, neo4j_password))

TERRY: Why async? Even if you want to embed this in the crawler pipeline they'll all be sync. I mean not that it makes much difference for sync and async though 🤷‍♂️

In [56]:
## Helper Functions

## Function to call the OpenAI API

async def process_gpt(file_prompt, system_msg):
    client = AsyncOpenAI(api_key= openai_api_key)

    completion = await client.chat.completions.create(
        model = 'gpt-4o-mini',
        max_tokens = 15000,
        temperature = 0, 
        messages = [
            {"role": "system", "content": system_msg},
            {"role":"user", "content": file_prompt}
        ]
    )
    nlp_results = completion.choices[0].message.content
    return nlp_results

## Function to take folder of files and a prompt template, and return a json-object of all the entities and relationships 

async def extract_entities_relationships(content, prompt_template):
    start = timer()
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(content)} files")
    results = []
    for i in range(len(content)):
        print(f"Extracting entities and relationships for file number: {i}")
        text = content[i]
        prompt = Template(prompt_template).substitute(ctext=text)
        result = await process_gpt(prompt, system_msg)
        results.append(result)
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results

    

## 1. Extract data from the existing database

In [9]:
#Load credentials

db_username = os.getenv('POSTGRES_USERNAME')
db_password = os.getenv('POSTGRES_PASSWORD')
db_host = os.getenv('POSTGRES_HOST')
db_port = os.getenv('POSTGRES_PORT')
db_name = os.getenv('POSTGRES_DATABASE')
DATABASE_URL = os.getenv('DATABASE_URL')

UsageError: Line magic function `%sql` not found.


In [1]:
df = %sql SELECT * FROM lse_doc # Save all rows and columns to df 
df = df.DataFrame() #Turn it into a pandas dataframe 

UsageError: Line magic function `%sql` not found.


TERRY: For the purpose of this task, do you think we can simply use the document chunks (this is what you'll get from `df["content"]`)? Obviously if the model sees the entire document it will have a more proper understanding of the content of the document, but I'm also aware that if the input context is too long the model might get lost. I am not entirely sure from what input length the model response will start deteriorating. I'm currently leaning towards passing the entire document through though, as we will for sure be missing something or, worse, making some mistakes if we only pass chunks to the model

In [46]:
df_content = df["content"] #Save everything in the "content" column to a variable

## 2. Prompt engineering for entity and relations extraction 

TERRY: 
- When you define departments, does that also include institutes and centres? E.g. DSI, Eden Center. If not, should there be an entity for that? 
- I'm not sure if we have the course information yet? Like the school calender page that has all course and programme info. If we don't, we definitely should scrape that 
- I'm pretty sure we don't have the LSESU data yet. I think Kristina was in the process of scraping those information. But we might need to think about how to better organise that data before we're ready to pass it through this yet 
- For the SU data, when we come to it, maybe we should also include stuff for like who's the general secretary and who are the officers for xxx? I wonder if that would be worth creating a relationship for tho since there aren't that man 
- Is it worth adding something like faculty[OCCUPIES]office or faculty[RESIDES_IN]building or something along that line? I see that you have faculty[IS_MEMBER_OF]department and office[IS_LOCATED_IN]building, but I wonder if it would be worth adding a relation between faculty and office/building? I also don't know how to best do that though 
- When you say course here in `department_process_prompt`, do you mean modules (e.g. DS105) or programmes (e.g. BSc Economics)? I think we should include both. With the inclusion of programmes, maybe also include a relation for a faculty being the program director of a course (I could see this being useful for stuff like "I'm interested in xxx, who should I talk to to learn more about this programme")? 

Also, since you want JSON output, I know OpenAI has recently released the JSON mode which ensures 100% JSON formatted outputs. However I have seen people say that the output was a bit worse though. Obviously if you want to use Llama/Mistral you can use the JSON mode built into the API (or even function calling if you want to reshape the prompt into functions). I have seen people experimenting with graphRAG, and they seem to say that this knowledge graph generation process relies very heavily on the quality of the LLM. I'd say if you do use our local llama/mistral and they don't turn out to be good, try the best GPT-4o (will have to talk to Jon first about reimbursement though). 

In [22]:
#department_process_prompt is to handle academics-related entities and relationships

department_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Course', id:string, name:string, course_code:string, prerequisites:string, semester:string, course_content_summary:string // Academic courses taught at the London School of Economics; 'id' property is the name of the course, in lowercase and camel-case
    label: 'Faculty', id:string, name:string, reserach_interests:string, email:string // Professors, assistant professors, teachers, fellows, tutors, directors, lecturers, and chairs associated with the London School of Economics; 'id' property is the full-name of the faculty member, in lowercase and camel-case
    label: 'Event', id:string, name:string, description:string, date:string // Seminars, workshops, talks, and other events involving the London School of Economics; 'id' property is the name of the event, in lowercase and camel-case
    label: 'Research Project', id:string, name:string, description:string // Research projects led by faculty associated with the London School of Economics; 'id' property is the name of the research project, in lowercase and camel_case


2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    faculty[IS_MEMBER_OF]department
    event[IS_HOSTED_BY]faculty
    event[IS_RUN_BY]department
    course[IS_RUN_BY]department
    course[IS_TAUGHT_BY]faculty
    research_project[IS_LED_BY]faculty
    research_projects[IS_ASSOCIATED_WITH]department

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.

The output should look like:
{
    "entities": [{"label": "Department", "id": string, "name": string}],
    "relationships": ["faculty[IS_MEMBER_OF]department", "event[IS_RUN_BY]department", "course[IS_RUN_BY]department", "research_projects[IS_ASSOCIATED_WITH]department"]
}

Case Sheet:
$ctext
'''

#administration_process_prompt is to handle bureaucratic and regulatory entities and relationships

administration_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: "Policy", id: string, name:string, summary:string // Policy related to administrative procedure at the London School of Economics; 'id' property is the name of the policy, in lowercase and camel_case
    label: "Procedure", id:string, content:string, summary:string // Procedure associated with certain policies established at the London School of Economics; 'id' property is the first four words of the summary, in lowercase and camel_case
    label: "Committee", id:string, name: string // Regulatory organisation or departments at the London School of Economics, referred to as "Department of ____"; 'id' propoerty is the name of the committee, in lowercase and camel-case
    label: "Group", id:string, name:string // Individuals or organisations that are affected by certain policies at the London School of Economics, which can include teachers, professors, students, students of a certain background, teachers or students of a certain department; 'id' property is the name of the perosn, in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    group[IS_AFFECTED_BY]policy
    procedure[IS_ASSOCIATED_WITH]policy
    policy[IS_ENFORCED_BY]committee

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.
    
The output should look like:
{
    "entities": [{"label": "Policy", "id": string, "name": string}],
    "relationships": ["group[IS_AFFECTED_BY]policy", "procedure[IS_ASSOCIATED_WITH]policy"]
}

Case Sheet: 
$ctext
''' 
#research_process_prompt to process reseach documents/archives/blogs

research_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'ResearchDocument', id:string, title:string, summary:string, date:string // A research blog, document, project, or study published and/or conducted at the London School of Economics; 'id' property is the title of the project in lowercase and camel_case
    label: 'Researcher', id:string, name:string, department:string // An individual conducting or contributing to research at the London School of Economics; 'id' property is the name of the researcher in lowercase and camel_case
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Publication', id:string, title:string, summary:string // A publication resulting from research conducted at LSE; 'id' property is the title of the publication in lowercase and camel_case
    label: 'FundingSource', id:string, name:string // An entity providing funding for research projects; 'id' property is the name of the funding source in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    researcher[CONDUCTS]researchProject
    researchProject[IS_ASSOCIATED_WITH]department
    researchProject[RESULTS_IN]publication
    researchProject[IS_FUNDED_BY]fundingSource
    publication[IS_AUTHORED_BY]researcher

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.
    
The output should look like:
{ "entities": [{"label": "ResearchProject", "id": string, "title": string}],
    "relationships": ["researcher[CONDUCTS]researchProject", "researchProject[RESULTS_IN]publication"]
}

Case Sheet: 
$ctext
'''

#student_union_process_prompt to mainly go over society related activities, but also include other facilities/services provided at the SU

student_union_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Society', id:string, name:string, type:string, activities:string, membership_cost:string // Clubs, sports clubs, and other student societies at the London School of Economics; 'id' property is the name of the society in lowercase and camel_case
    label: 'HeadOfSociety', id:string, name:string // Individuals leading a society, such as presidents or captains; 'id' property is the name of the individual in lowercase and camel_case
    label: 'Facility', id:string, name:string, location:string // Facilities offered by the Student Union, such as the gym, faith center, performance venues, etc.; 'id' property is the name of the facility in lowercase and camel_case
    label: 'Event', id:string, name:string, date:string, location:string // Events organized or hosted by societies or the Student Union, such as parties, seminars, fairs, forums, etc.; 'id' property is the name of the event in lowercase and camel_case
    label: 'Support', id:string, name:string // Support services provided by the Student Union, such as counseling, financial aid, or advisory services; 'id' property is the name of the support service in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    headOfSociety[LEADS]society
    society[USES]facility
    event[IS_HOSTED_BY]society
    event[TAKES_PLACE_AT]facility
    support[IS_PROVIDED_BY]facility

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.

The output should look like:
{
    "entities": [{"label": "Society", "id": string, "name": string}],
    "relationships": ["headOfSociety[LEADS]society", "society[USES]facility", "event[IS_HOSTED_BY]society"]
}

Case Sheet:
$ctext
'''

#infrastructure_prompt to process building and facility information

infrastructure_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Building', id:string, name:string, type:string, address:string // Academic buildings at the London School of Economics; 'id' property is the name of the building in lowercase and camel_case
    label: 'Facility', id:string, name:string, location:string // Specific facilities within buildings, such as libraries, cafeterias, or common rooms; 'id' property is the name of the facility in lowercase and camel_case
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics; 'id' property is the name of the department in lowercase and camel_case
    label: 'Office', id:string, name:string, location:string // Offices or administrative units within buildings; 'id' property is the name of the office in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    department[IS_LOCATED_IN]building
    office[IS_LOCATED_IN]building
    facility[IS_PART_OF]building

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.

The output should look like:
{
    "entities": [{"label": "Building", "id": string, "name": string}],
    "relationships": ["department[IS_LOCATED_IN]building", "facility[IS_PART_OF]building", "accommodation[PROVIDES]facility"]
}

Case Sheet:
$ctext
'''

# putting everything into full_process_prompt just in case it proves more effective than having individual prompts

full_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each enetity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Course', id:string, name:string, course_code:string, prerequisites:string, semester:string // Academic courses taught at the London School of Economics; 'id' property is the name of the course, in lowercase and camel-case
    label: 'Faculty', id:string, name:string // Professors, assistant professors, teachers, fellows, tutors, directors, lecturers, and chairs associated with the London School of Economics; 'id' property is the full-name of the faculty member, in lowercase and camel-case
    label: 'Event', id:string, name:string, description:string, date:string // Seminars, workshops, talks, and other events involving the London School of Economics; 'id' property is the name of the event, in lowercase and camel-case
    label: 'Research Project', id:string, name:string, description:string // Research projects led by faculty associated with the London School of Economics; 'id' property is the name of the research project, in lowercase and camel_case
    label: "Policy", id: string, name:string, summary:string // Policy related to administrative procedure at the London School of Economics; 'id' property is the name of the policy, in lowercase and camel_case
    label: "Procedure", id:string, content:string, summary:string // Procedure associated with certain policies established at the London School of Economics; 'id' property is the first four words of the summary, in lowercase and camel_case
    label: "Committee", id:string, name: string // Regulatory organisation or departments at the London School of Economics, referred to as "Department of ____"; 'id' propoerty is the name of the committee, in lowercase and camel-case
    label: "Group", id:string, name:string // Individuals or organisations; 'id' property is the name of the perosn, in lowercase and camel_case
    label: 'ResearchProject', id:string, title:string, summary:string // A research project or study conducted at the London School of Economics; 'id' property is the title of the project in lowercase and camel_case
    label: 'Researcher', id:string, name:string, department:string // An individual conducting or contributing to research at the London School of Economics; 'id' property is the name of the researcher in lowercase and camel_case
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Publication', id:string, title:string, summary:string // A publication resulting from research conducted at LSE; 'id' property is the title of the publication in lowercase and camel_case
    label: 'FundingSource', id:string, name:string // An entity providing funding for research projects; 'id' property is the name of the funding source in lowercase and camel_case
    label: 'Society', id:string, name:string, type:string, activities:string, membership_cost:string // Clubs, sports clubs, and other student societies at the London School of Economics; 'id' property is the name of the society in lowercase and camel_case
    label: 'HeadOfSociety', id:string, name:string // Individuals leading a society, such as presidents or captains; 'id' property is the name of the individual in lowercase and camel_case
    label: 'Facility', id:string, name:string, location:string // Facilities offered by the Student Union, such as the gym, faith center, performance venues, etc.; 'id' property is the name of the facility in lowercase and camel_case
    label: 'Event', id:string, name:string, date:string, location:string // Events organized or hosted by societies or the Student Union, such as parties, seminars, fairs, forums, etc.; 'id' property is the name of the event in lowercase and camel_case
    label: 'Support', id:string, name:string // Support services provided by the Student Union, such as counseling, financial aid, or advisory services; 'id' property is the name of the support service in lowercase and camel_case
    label: 'Building', id:string, name:string, type:string, address:string // Academic buildings at the London School of Economics; 'id' property is the name of the building in lowercase and camel_case
    label: 'Facility', id:string, name:string, location:string // Specific facilities within buildings, such as libraries, cafeterias, or common rooms; 'id' property is the name of the facility in lowercase and camel_case
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics; 'id' property is the name of the department in lowercase and camel_case
    label: 'Office', id:string, name:string, location:string // Offices or administrative units within buildings; 'id' property is the name of the office in lowercase and camel_case


2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    faculty[IS_MEMBER_OF]department
    event[IS_HOSTED_BY]faculty
    event[IS_RUN_BY]department
    course[IS_RUN_BY]department
    course[IS_TAUGHT_BY]faculty
    research_project[IS_LED_BY]faculty
    research_projects[IS_ASSOCIATED_WITH]department
    group[IS_AFFECTED_BY]policy
    procedure[IS_ASSOCIATED_WITH]policy
    policy[IS_ENFORCED_BY]committee
    researcher[CONDUCTS]researchProject
    researchProject[IS_ASSOCIATED_WITH]department
    researchProject[RESULTS_IN]publication
    researchProject[IS_FUNDED_BY]fundingSource
    publication[IS_AUTHORED_BY]researcher
    researchProject[IS_GOVERNED_BY]policy
    headOfSociety[LEADS]society
    society[USES]facility
    event[IS_HOSTED_BY]society
    event[TAKES_PLACE_AT]facility
    support[IS_PROVIDED_BY]facility
    department[IS_LOCATED_IN]building
    office[IS_LOCATED_IN]building
    facility[IS_PART_OF]building

NOTE: Not all documents will have these entities and relationships. If the document does not seem to have aforementioned entities and relationships, DO NOT RECORD THEM.

The output should look like:
{
    "entities": [{"label": "Department", "id": string, "name": string}],
    "relationships": ["faculty[IS_MEMBER_OF]department", "event[IS_RUN_BY]department", "course[IS_RUN_BY]department", "research_projects[IS_ASSOCIATED_WITH]department"]
}

Case Sheet:
$ctext
'''

In [59]:
async def main():
    df_content = df["content"].to_list()  # Call the method to get the list
    department_process_prompt = "Your template here"  # Define your prompt template

    result = await extract_entities_relationships(df_content, department_process_prompt)

    with open("academic_data.json", "w") as f:
        json.dump(result, f)

# Run the main function
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Running pipeline for 21521 files
Extracting entities and relationships for file number: 0
Extracting entities and relationships for file number: 1
Extracting entities and relationships for file number: 2
Extracting entities and relationships for file number: 3
Extracting entities and relationships for file number: 4
Extracting entities and relationships for file number: 5
Extracting entities and relationships for file number: 6
Extracting entities and relationships for file number: 7
Extracting entities and relationships for file number: 8
Extracting entities and relationships for file number: 9
Extracting entities and relationships for file number: 10


KeyboardInterrupt: 

Extracting entities and relationships for file number: 11
Extracting entities and relationships for file number: 12
Extracting entities and relationships for file number: 13
Extracting entities and relationships for file number: 14
Extracting entities and relationships for file number: 15


## 3. Neo4j Cypher Generation

In [2]:
# Function takes json-obejcts of entities and relationships and generates a cypher query for creating those entities

def generate_cyper(json_obj):
    e_statements = []
    r_statements = []

    e_label_map = {}
    # loop thro
    for i, obj in enumerate(json_obj):
        print(f"Generating cypher for file {i+1} of {(len(json_obj))}")
        for entity in obj['entities']:
            label = entity['label']
            id = entity['id']
            id = id.replace("-", "").replace("_","")
            properties = {k: v for k, v in entity.items() if k not in ['label', 'id']}

            cypher = f"MERGE (n:{label} {{id: '{entity_id}'}})"
            if properties:
                props_string = ', '.join[(f'n.{key} - "{val}"' for key, val in properties.items())]
                cypher += f"ON CREATE SET {props_str}"
            e_statements.append(cypher)
            e_label_map[id] = label

        for rs in obj['relationships']:
            src_id, rs_type, tgt_id = rs.split("|")
            src_id = src_id.replace("-", "").replace("_","")
            tgt_id = tgt_id.replace("-", "").replace("_","")

            src_label = e_label_map[src_id]
            tgt_label = e_label_map[tgt_id]

            cypher = {f'MERGE (a:{src_label} {{id: "{src_id}"}})) MERGE (b: {tgt_label} {{id: "{tgt_id}}})) MERGE (a) - [:{rs_type}] -> (b)'}
            r_statements.append(cypher)
    
    with open("cyphers.txt", "w") as outfile:
        outfile.write("\n".join(e_statements + r_statements))

    return e_statements + r_statements 

# Final function to bring all the steps together 

def ingestion_pipeline(folders):
    entites_relationships = []
    # Extracting the entities and relationships from each folder, append into one json_object
    for key, value in folders.items():
        extract_entities_relationships(key, value)

    #generate and execute cypher statements 
    cypher_statements = generate_cypher(entities_relationships)
    for i, stat in enumerate(cypher_statements):
        print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
        try:
            gds.execute_query(stat)
        except Exception as e:
            with open("failed_statements.txt", "w"):
                f.write(f"{stat} - Exception: {e}\n")