## 1. Extract data from the existing database

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from tqdm import tqdm

In [9]:
# Load credentials.
# load_dotenv() reads a local .env file (if one exists) into the process environment;
# without this call os.getenv only sees variables already exported by the shell.
load_dotenv()

db_username = os.getenv('POSTGRES_USERNAME')
db_password = os.getenv('POSTGRES_PASSWORD')
db_host = os.getenv('POSTGRES_HOST')
db_port = os.getenv('POSTGRES_PORT')
db_name = os.getenv('POSTGRES_DATABASE')
DATABASE_URL = os.getenv('DATABASE_URL')

In [3]:
%load_ext sql
%sql postgresql://chatlse:chatlse@158.143.74.137/chatlse

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [4]:
df = %sql SELECT * FROM lse_doc # Save all rows and columns to df 
df = df.DataFrame() #Turn it into a pandas dataframe 

In [8]:
# Keep the whole "content" column in its own variable.
content = df.loc[:, "content"]

## 2. Prompt engineering for entity and relations extraction 

In [9]:
# department_process_prompt handles academics-related entities and relationships.
# The '$ctext' placeholder at the end is swapped for the document text by llama3(prompt, text).
# The relationship names are spelled identically everywhere in the prompt
# (research_project[...], singular) so the model is not asked for two different forms.

department_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: 'Department', id:string, name:string // Academic departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the department, in lowercase and camel-case.
    label: 'Course', id:string, name:string, course_code:string, prerequisites:string, semester:string // Academic courses taught at the London School of Economics; 'id' property is the name of the course, in lowercase and camel-case
    label: 'Faculty', id:string, name:string // Professors, assistant professors, teachers, fellows, tutors, directors, lecturers, and chairs associated with the London School of Economics; 'id' property is the full-name of the faculty member, in lowercase and camel-case
    label: 'Event', id:string, name:string, description:string, date:string // Seminars, workshops, talks, and other events involving the London School of Economics; 'id' property is the name of the event, in lowercase and camel-case
    label: 'Research Project', id:string, name:string, description:string // Research projects led by faculty associated with the London School of Economics; 'id' property is the name of the research project, in lowercase and camel_case


2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    faculty[IS_MEMBER_OF]department
    event[IS_HOSTED_BY]faculty
    event[IS_RUN_BY]department
    course[IS_RUN_BY]department
    course[IS_TAUGHT_BY]faculty
    research_project[IS_LED_BY]faculty
    research_project[IS_ASSOCIATED_WITH]department

The output should look like:
{
    "entities": [{"label": "Department", "id": string, "name": string}],
    "relationships": ["faculty[IS_MEMBER_OF]department", "event[IS_RUN_BY]department", "course[IS_RUN_BY]department", "research_project[IS_ASSOCIATED_WITH]department"]
}

Case Sheet:
$ctext
'''

# administration_process_prompt handles bureaucratic and regulatory entities and relationships.
# The '$ctext' placeholder at the end is swapped for the document text by llama3(prompt, text).

administration_process_prompt = '''From the unstructured data provided, extract the following Entities and relationships described in the mentioned format.
0. ALWAYS FINISH THE OUTPUT. Never send partial responses.
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
    'id' property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationships between entities.
    Entity types:
    label: "Policy", id: string, name:string, summary:string // Policy related to administrative procedure at the London School of Economics; 'id' property is the name of the policy, in lowercase and camel_case
    label: "Procedure", id:string, content:string, summary:string // Procedure associated with certain policies established at the London School of Economics; 'id' property is the first four words of the summary, in lowercase and camel_case
    label: "Committee", id:string, name: string // Regulatory organisation or departments at the London School of Economics, referred to as "Department of ____"; 'id' property is the name of the committee, in lowercase and camel-case
    label: "Group", id:string, name:string // Individuals or organisations; 'id' property is the name of the person or organisation, in lowercase and camel_case

2. Next, generate each relationship as triples of head, relationship, and tail. To refer to the head and tail entity, use their respective 'id' property.
    Relationship types:
    group[IS_AFFECTED_BY]policy
    procedure[IS_ASSOCIATED_WITH]policy
    policy[IS_ENFORCED_BY]committee
    
The output should look like:
{ "entities": [{"label": "Policy", "id": string, "name": string}],
    "relationships": ["group[IS_AFFECTED_BY]policy", "procedure[IS_ASSOCIATED_WITH]policy"]
}

Case Sheet: $ctext
'''

# institutions_process_prompt is to handle institutions at the LSE (research facilities, summer school, careers centre, alumni centre)
# TODO: write these two prompts. Until then they are explicit None placeholders: an
# assignment with nothing after the `=` is a SyntaxError that stops the whole cell from running.

institutions_process_prompt = None
research_process_prompt = None



In [None]:
import requests
import json

url = "http://localhost:11434/api/chat"

def llama3(prompt):
    """Send one user message to the llama3 chat endpoint at `url` and return the reply text.

    NOTE: a later cell defines `llama3(prompt, text)` with a different signature; once that
    cell has run, the name `llama3` refers to that later definition instead of this one.

    Args:
        prompt: Text sent as the content of the single user message.

    Returns:
        The value stored under ["message"]["content"] in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    data = {
        "model": "llama3",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        # Ask for one complete JSON response rather than a stream of chunks.
        "stream": False,
    }

    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(url, headers=headers, json=data)
    # Surface the HTTP status of a failed call instead of an opaque KeyError / JSON
    # decoding error when the body is read below.
    response.raise_for_status()
    return response.json()["message"]["content"]



In [None]:
import pandas as pd
import requests
import json

# Define your API URL and headers
url = "http://localhost:11434/api/chat"
headers = {"Content-Type": "application/json"}


def llama3(prompt, text):
    """Fill the prompt template with `text`, post it to the llama3 chat endpoint, return the reply.

    Args:
        prompt: Template string; each '$ctext' in it is replaced by `text`.
        text: Text substituted into the template.

    Returns:
        The value under ["message"]["content"] of the JSON response when the HTTP
        status is 200, otherwise the string "Error: <status code>".
    """
    payload = {
        "model": "llama3",
        "messages": [
            {"role": "user", "content": prompt.replace('$ctext', text)},
        ],
        "stream": False,
    }
    reply = requests.post(url, headers=headers, json=payload)
    # Any non-200 status is reported as a string, not raised.
    if reply.status_code != 200:
        return f"Error: {reply.status_code}"
    return reply.json()["message"]["content"]

def process_contents(df):
    """Run both extraction prompts over every entry of the 'content' column.

    Args:
        df: DataFrame with a 'content' column holding the texts to analyse.

    Returns:
        A dict keyed by the DataFrame's index labels. Each value is a dict with
        'department_data' and 'administration_data', holding what llama3 returned
        for the department prompt and the administration prompt respectively.
    """
    results = {}
    # Series.items() is the replacement for Series.iteritems(), which pandas 2.0 removed.
    for idx, text in df['content'].items():
        results[idx] = {
            'department_data': llama3(department_process_prompt, text),
            'administration_data': llama3(administration_process_prompt, text),
        }
    return results

# Example DataFrame; replace with your actual DataFrame variable.
# It has its own name so that it does not overwrite the `df` loaded from the database
# earlier in the notebook.
example_df = pd.DataFrame({'content': ["Example text 1", "Example text 2"]})

# Process all entries in the DataFrame and store results in a dictionary
all_results = process_contents(example_df)

print(all_results)


## 3. Neo4j Cypher Generation