In [1]:
# docker run -p 7474:7474 -p 7687:7687 -d -e NEO4J_AUTH=neo4j/secretgraph neo4j:latest

In [2]:
## neo4j instance
# docker run -p 7474:7474 -p 7687:7687 -d -e NEO4J_AUTH=neo4j/zavmoadmin neo4j:latest  

In [1]:
from neo4j import GraphDatabase
import pandas as pd
import os
import json

In [2]:
# Load the Ofqual Sector Subject Area hierarchy: a list of dicts, each mapping
# a parent SSA name to the list of its sub-SSA names.
# NOTE: the path is resolved against the current working directory, and a later
# cell calls os.chdir(); re-running this cell after that one will look in a
# different place.
_ofqual_ssa_path = os.path.join(os.getcwd(), "zavmo-api/zavmo/classification/data", "ofqual_SSAs.json")
# Use a context manager so the file handle is closed deterministically.
with open(_ofqual_ssa_path, encoding="utf-8") as _ofqual_ssa_file:
    ofqual_SSAs = json.load(_ofqual_ssa_file)

In [3]:
# Work from the zavmo-api project root. The location can be overridden with the
# ZAVMO_API_DIR environment variable so the notebook is not tied to one
# machine; the default is the original hard-coded path.
os.chdir(os.environ.get("ZAVMO_API_DIR", "/Users/mumtaz/Documents/projects/zavmo/zavmo-api"))

In [4]:
from dotenv import load_dotenv
# Populate the process environment via python-dotenv before the OpenAI client
# below reads OPENAI_API_KEY (the recorded run returned True).
load_dotenv()

True

In [5]:
from openai import OpenAI

# Module-level OpenAI client used by get_batch_openai_embedding; the API key is
# taken from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [6]:

def get_batch_list(texts, max_batch_length=30000, max_batch_size=2048):
    """
    Group texts into batches bounded by total characters and by item count.

    Texts are kept in their original order. A text longer than
    ``max_batch_length`` is truncated to that many characters so it can still
    be placed in a batch on its own.

    Args:
        texts: Iterable of strings to batch.
        max_batch_length (int): Maximum total number of characters per batch.
            The default is a conservative character budget: the embedding
            models accept 8192 tokens per input and ~4 characters per token is
            a rough estimate, so 30,000 characters stays under that.
        max_batch_size (int): Maximum number of texts per batch. The default
            mirrors the per-request item cap documented for OpenAI's
            embeddings endpoint; without it, many short (or empty) texts could
            pile up in a single oversized request.
    Returns:
        list[list[str]]: The batches; empty when ``texts`` is empty.
    Raises:
        ValueError: If either limit is not a positive number.
    """
    if max_batch_length <= 0 or max_batch_size <= 0:
        raise ValueError("max_batch_length and max_batch_size must be positive")

    batches = []
    current_batch = []
    current_batch_length = 0

    for text in texts:
        # Truncate over-long texts (a no-op for anything within the budget).
        text = text[:max_batch_length]
        text_length = len(text)

        # Close the current batch when this text would blow either limit.
        batch_is_full = (
            current_batch_length + text_length > max_batch_length
            or len(current_batch) >= max_batch_size
        )
        if current_batch and batch_is_full:
            batches.append(current_batch)
            current_batch = []
            current_batch_length = 0

        current_batch.append(text)
        current_batch_length += text_length

    # Flush the trailing, partially filled batch.
    if current_batch:
        batches.append(current_batch)

    return batches

def get_batch_openai_embedding(texts: list, model="text-embedding-3-small", **kwargs):
    """
    Embed a list of texts with the OpenAI embeddings API, one request per batch.

    The texts are first split with ``get_batch_list``; each batch is sent to
    the module-level ``client`` and the returned vectors are concatenated in
    request order.

    Args:
        texts (list): Texts to embed.
        model (str): Embedding model name.
        **kwargs: Extra keyword arguments forwarded to ``client.embeddings.create``.
    Returns:
        list[list]: One embedding per item returned by the API, in order.
    """
    text_batches = get_batch_list(texts)
    print(f"total batches: {len(text_batches)}")

    def _embed(batch):
        # One API round-trip for a single batch.
        reply = client.embeddings.create(model=model, input=batch, **kwargs)
        return [item.embedding for item in reply.data]

    vectors = []
    for chunk in text_batches:
        vectors.extend(_embed(chunk))
    return vectors

In [7]:
## Neo4j connection settings
# Read from the environment so no credentials for the hosted Zavmo instance
# live in the notebook. Set NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD to
# target another instance; the defaults below point at the local Docker
# container and match the values this cell previously ended up with.
uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
username = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "secretgraph")

In [8]:
# Open a short-lived driver only to confirm that the server is reachable with
# the configured credentials; the driver is closed when the block exits.
neo4j_auth = (username, password)
with GraphDatabase.driver(uri, auth=neo4j_auth) as driver:
    driver.verify_connectivity()
    print("Connection established.")

Connection established.


In [39]:
# Load the NOS records and expose the predicted sub-SSA under the 'sub_SSA'
# column name that the ingestion cells read.
nos = pd.read_excel(
    r"/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/classification/data/predicted/missed_nos_records_with_predictions.xlsx"
).rename(columns={'predicted_sub_SSA': 'sub_SSA'})

In [10]:
# Load the Ofqual table; later cells filter it on its 'ofqual_id' column.
ofqual = pd.read_csv("/Users/mumtaz/Documents/projects/zavmo/zavmo-api/docs/nos-ofqual/ofqual.csv")

## Testing NOS and OFQUAL for People Partner role

In [11]:
# Ofqual ids selected for the People Partner test; keep only the matching rows
# (one id can match several rows) and show the resulting shape.
selected_ofqual_ids = "601/5198/5, 600/5436/0, 603/6728/3, 603/2870/8, 601/5196/1, 601/6049/4, 603/2870/8, 603/4688/7, 601/8691/4, 603/0322/0".split(", ")
ofqual_is_selected = ofqual['ofqual_id'].isin(selected_ofqual_ids)
sample_ofqual = ofqual.loc[ofqual_is_selected].reset_index(drop=True)
sample_ofqual.shape

(26, 8)

In [40]:
# NOS ids selected for the People Partner test; keep only the matching rows
# and show the resulting shape.
selected_nos_ids = "SFJCCBE4.1, INSML021, PROMPR4, SFJHD2, SFJCCBE2.1, INSML017".split(", ")
nos_is_selected = nos['nos_id'].isin(selected_nos_ids)
sample_nos = nos.loc[nos_is_selected].reset_index(drop=True)
sample_nos.shape

(6, 10)

In [13]:
from neomodel import StructuredNode, StringProperty, RelationshipTo, config, db, ArrayProperty, FloatProperty

# Configure the connection to Neo4j for neomodel.
# The URL embeds the credentials, so it can be supplied through the
# NEO4J_BOLT_URL environment variable instead of being edited here; the
# default is the local Docker instance used so far.
config.DATABASE_URL = os.getenv("NEO4J_BOLT_URL", "bolt://neo4j:secretgraph@localhost:7687")

class SSA(StructuredNode):
    """Sector Subject Area node, identified by its name."""

    # neomodel has no `unique` option (the previous `unique=True` had no
    # effect); `unique_index=True` is the option that declares a uniqueness
    # constraint. The constraint is only created in Neo4j once the labels are
    # installed (e.g. neomodel's install_labels).
    name = StringProperty(unique_index=True)
    sub_ssas = RelationshipTo('SubSSA', 'HAS_SUB_SSA')

class SubSSA(StructuredNode):
    """Sub Sector Subject Area node, linked to its parent SSA and to its documents."""

    # neomodel has no `unique` option (the previous `unique=True` had no
    # effect); `unique_index=True` declares the intended uniqueness constraint,
    # which is created in Neo4j only once the labels are installed.
    name = StringProperty(unique_index=True)
    ssa = RelationshipTo(SSA, 'BELONGS_TO')
    nos_documents = RelationshipTo('NOS', 'HAS_NOS')
    ofqual_documents = RelationshipTo('OFQUAL', 'HAS_OFQUAL')

class OFQUAL(StructuredNode):
    """One Ofqual unit row, with its text fields and embedding vector."""

    # The previous `unique=True` / `primary_key=True` keywords are not neomodel
    # property options and had no effect. Uniqueness is deliberately NOT
    # declared here: the retrieval output recorded in this notebook shows one
    # ofqual_id shared by several unit_ids, so a unique constraint on
    # ofqual_id would reject valid rows. Plain indexes keep lookups fast.
    ofqual_id = StringProperty(required=True, index=True)
    # NOTE(review): unit_id looks like the per-row identifier; confirm it is
    # unique across the full dataset before promoting this to unique_index.
    unit_id = StringProperty(index=True)
    title = StringProperty()
    overview = StringProperty()
    description = StringProperty()
    learning_outcomes = StringProperty()
    # Embedding vector stored as a list of floats (target of the vector index).
    embedding = ArrayProperty(base_property=FloatProperty())

    sub_ssa = RelationshipTo(SubSSA, 'BELONGS_TO')
    ssa = RelationshipTo(SSA, 'BELONGS_TO')

class NOS(StructuredNode):
    """National Occupational Standard node, with its text fields and embedding vector."""

    # The previous `unique=True` / `primary_key=True` keywords are not neomodel
    # property options and had no effect; `unique_index=True` declares the
    # intended uniqueness constraint (created in Neo4j only once the labels
    # are installed).
    nos_id = StringProperty(unique_index=True, required=True)
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_criteria = StringProperty()
    nos_industry = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    # Embedding vector stored as a list of floats (target of the vector index).
    embedding = ArrayProperty(base_property=FloatProperty())

    sub_ssa = RelationshipTo(SubSSA, 'BELONGS_TO')
    ssa = RelationshipTo(SSA, 'BELONGS_TO')

    # Ofqual units judged relevant to this NOS.
    ofquals = RelationshipTo("OFQUAL", "RELATED_TO")


## SSA and Sub_SSA - Nodes Creation

In [14]:
# Invert the SSA hierarchy into a flat lookup: sub-SSA name -> parent SSA name.
category_mapping = {}
for category_dict in ofqual_SSAs:
    for parent_category, sub_categories in category_dict.items():
        for sub_category in sub_categories:
            category_mapping[sub_category] = parent_category

In [15]:
# (sub-SSA name, parent SSA name) pairs, in mapping order.
SubSSAs = list(category_mapping.items())
# The first key of every entry in the hierarchy file is the SSA name.
SSAs = [list(ofqual_SSA)[0] for ofqual_SSA in ofqual_SSAs]

In [20]:
# Sanity check: number of SSAs and of (sub-SSA, SSA) pairs derived above.
len(SSAs), len(SubSSAs)

(15, 50)

In [30]:
# Save one SSA node per name and keep the saved nodes addressable by name.
# NOTE: every run saves new nodes; re-running this cell creates additional
# SSA nodes with the same names.
ssa_nodes = {name: SSA(name=name).save() for name in SSAs}

In [31]:
# Save every SubSSA node, attach it to its parent SSA through BELONGS_TO, and
# keep the saved nodes addressable by name.
sub_ssa_nodes = {}
for child_name, parent_name in SubSSAs:
    child_node = SubSSA(name=child_name).save()
    child_node.ssa.connect(ssa_nodes[parent_name])
    sub_ssa_nodes[child_name] = child_node

print("SSA and SubSSA nodes created successfully!")

SSA and SubSSA nodes created successfully!


In [32]:
from neomodel import db

class VectorIndex:
    """
    Helper around a Neo4j vector index on the ``embedding`` property of one label.

    All queries go through neomodel's ``db.cypher_query``. The index name and
    the node label are interpolated into DDL statements (Cypher cannot
    parameterise identifiers), so they must be trusted identifiers, never user
    input.

    Args:
        node (str): Node label the index covers (e.g. "NOS").
        index_name (str): Name of the vector index.
        dimension (int): Vector dimensionality configured on the index.
        similarity_function (str): Similarity function configured on the index.
    """

    def __init__(self, node, index_name="test_embedding_index", dimension=1536, similarity_function="cosine"):
        self.index_name = index_name
        self.dimension = dimension
        self.similarity_function = similarity_function
        self.node = node

    def index_exists(self):
        """Return True when an index named ``self.index_name`` exists."""
        results, columns = db.cypher_query("SHOW INDEXES")
        if not results:
            return False
        # Locate the name column by its header rather than by position: the
        # first column of SHOW INDEXES is not the name on every Neo4j version.
        name_position = list(columns).index("name")
        return any(row[name_position] == self.index_name for row in results)

    def list_vector_indexes(self):
        """Return the SHOW INDEXES rows restricted to vector indexes."""
        results, _ = db.cypher_query("SHOW INDEXES WHERE type = 'VECTOR'")
        return results

    def create_vector_index(self):
        """
        Create the vector index if it does not exist.

        Failures are reported on stdout instead of being raised, so a notebook
        run continues; the error text is included in the message.
        """
        query = f"""
        CREATE VECTOR INDEX {self.index_name} IF NOT EXISTS
        FOR (n:{self.node})
        ON (n.embedding)
        OPTIONS {{indexConfig: {{`vector.dimensions`: {self.dimension}, `vector.similarity_function`: "{self.similarity_function}"}}}}
        """
        try:
            if self.index_exists():
                print(f"Vector index '{self.index_name}' already exists.")
                return
            db.cypher_query(query)
            print(f"Vector index '{self.index_name}' created successfully.")
        except Exception as e:
            print(f"Error creating vector index: {e}")

    def delete_vector_index(self):
        """
        Drop the vector index if it exists.

        Failures are reported on stdout instead of being raised.
        """
        query = f"DROP INDEX {self.index_name} IF EXISTS"
        try:
            if not self.index_exists():
                print(f"Vector index '{self.index_name}' does not exist.")
                return
            db.cypher_query(query)
            print(f"Vector index '{self.index_name}' deleted successfully.")
        except Exception as e:
            print(f"Error deleting vector index: {e}")

    def retrieve_nos(self, query_embedding, top_k=5):
        """
        Return the nearest nodes for ``query_embedding`` with their related OFQUAL data.

        Args:
            query_embedding (list[float]): Query vector.
            top_k (int): Number of nearest neighbours requested from the index.
        Returns:
            list[dict]: One dict per matched node with keys ``nos_id``,
            ``title``, ``ofqual_ids``, ``ofqual_units``,
            ``ofqual_learning_outcomes`` and ``score``, best score first.
        """
        # The index name is passed as a query parameter rather than being
        # interpolated. The SSA / SubSSA matches of the earlier version were
        # dropped because nothing from them was returned.
        query = """
            CALL db.index.vector.queryNodes($index_name, $top_k, $query_embedding)
            YIELD node, score
            OPTIONAL MATCH (node)-[:RELATED_TO]->(ofqual:OFQUAL)
            RETURN
                node.nos_id AS nos_id,
                node.title AS title,
                COLLECT(ofqual.ofqual_id) AS ofqual_ids,
                COLLECT(ofqual.unit_id) AS ofqual_units,
                COLLECT(ofqual.learning_outcomes) AS ofqual_learning_outcomes,
                score
            ORDER BY score DESC
        """
        params = {
            "index_name": self.index_name,
            "query_embedding": query_embedding,
            "top_k": top_k,
        }
        result, columns = db.cypher_query(query, params)

        return [dict(zip(columns, row)) for row in result]

    def drop_all_vector_records(self,):
        """
        Delete every node carrying the ``self.node`` label, with its relationships.

        The index definition itself is left in place.
        """
        query = f"MATCH (n:{self.node}) DETACH DELETE n"
        db.cypher_query(query)
        print(f"All records from '{self.node}' have been deleted, but the index remains intact.")


## OFQUAL - Nodes Creation, Database Ingestion, Vector Index Creation

In [33]:
def ingest_ofqual_to_db(ofqual_list):
    """
    Save one OFQUAL node per record and link it to its SSA and SubSSA.

    Runs inside a single ``db.transaction``. Each record is a dict with the
    keys "SSA", "sub_SSA", "ofqual_id", "id", "title", "overview",
    "description", "learning_outcomes" and "embedding". The SSA and SubSSA
    nodes are looked up by name with ``nodes.get``.
    """
    with db.transaction:
        for record in ofqual_list:
            parent_ssa = SSA.nodes.get(name=record["SSA"])
            parent_sub_ssa = SubSSA.nodes.get(name=record["sub_SSA"])

            # The record's "id" column is stored as the node's unit_id.
            properties = {
                "ofqual_id": record["ofqual_id"],
                "unit_id": record["id"],
                "title": record["title"],
                "overview": record["overview"],
                "description": record["description"],
                "learning_outcomes": record["learning_outcomes"],
                "embedding": record["embedding"],
            }
            saved_node = OFQUAL(**properties).save()

            # Link OFQUAL to SSA and SubSSA
            saved_node.ssa.connect(parent_ssa)
            saved_node.sub_ssa.connect(parent_sub_ssa)

In [34]:
def store_ofqual():
    """
    Embed the rows of the global ``sample_ofqual`` frame and ingest them.

    Builds one text per row (SSA, sub-SSA, title, description, overview),
    embeds the texts, stores the vectors in a new 'embedding' column on
    ``sample_ofqual`` (the frame is modified), then hands the records to
    ``ingest_ofqual_to_db``.
    """
    ofqual_texts_for_embeddings = []
    for ofqual in sample_ofqual.to_dict(orient="records"):
        ofqual_texts_for_embeddings.append(
            f"Sector Subject Area: {ofqual['SSA']}\n\nSub Sector Subject Area: {ofqual['sub_SSA']}\n\nTitle: {ofqual['title']}\n\nDescription: {ofqual['description']}\n\nOverview: {ofqual['overview']}"
        )

    sample_ofqual['embedding'] = get_batch_openai_embedding(ofqual_texts_for_embeddings)

    records_with_embeddings = sample_ofqual.to_dict(orient="records")
    ingest_ofqual_to_db(records_with_embeddings)

In [35]:
# Set up the vector index for OFQUAL nodes, then embed and ingest the sampled
# Ofqual rows.
ofqual_index = VectorIndex("OFQUAL", index_name="ofqual_embedding_index")
# ofqual_index.delete_vector_index()
ofqual_index.create_vector_index()
# ofqual_index.index_exists()
store_ofqual()


total batches: 1


In [36]:
# Count the OFQUAL nodes that carry an embedding.
db.cypher_query("MATCH (n:OFQUAL) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[26]], ['indexed_documents'])

In [37]:
# store_ofqual() added the 'embedding' column, hence one more column than before.
sample_ofqual.shape

(26, 9)

## NOS Nodes Creation, Database Ingestion, Vector Index Creation

In [30]:
# sample_nos['sub_SSA'] = "Business management"
# sample_nos['SSA'] = "Business, Administration and Law"
# sample_nos.shape

(6, 11)

In [41]:
# Inspect the column names available on the sampled NOS rows.
sample_nos.columns

Index(['nos_id', 'industry', 'title', 'overview', 'performance_criteria',
       'knowledge_understanding', 'keywords', 'relevant_roles', 'SSA',
       'sub_SSA'],
      dtype='object')

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_relevant_ofquals(nos_node, top_k=10):
    """
    Find the Ofquals most relevant to a NOS node.

    Candidates are the OFQUAL nodes that share both the SSA and the SubSSA of
    ``nos_node``. When embeddings are available the candidates are ranked by
    cosine similarity to the NOS embedding (ties keep their original order)
    and the best ``top_k`` are returned.

    Args:
        nos_node: Node exposing ``ssa``/``sub_ssa`` relationships (each with
            ``single()``), ``embedding`` and ``nos_id``.
        top_k (int): Maximum number of Ofquals to return.
    Returns:
        list: Matching OFQUAL nodes, best match first. Empty when the NOS has
        no SSA or SubSSA, or when no candidate shares them. If the NOS node
        (or every candidate) lacks an embedding, the first ``top_k``
        SSA-matched candidates are returned unranked; candidates without an
        embedding are otherwise left out of the ranking.
    """
    ssa_node = nos_node.ssa.single()
    sub_ssa_node = nos_node.sub_ssa.single()

    if not ssa_node or not sub_ssa_node:
        return []

    ofqual_candidates = [
        ofqual
        for ofqual in OFQUAL.nodes.all()
        if ofqual.ssa.single() == ssa_node and ofqual.sub_ssa.single() == sub_ssa_node
    ]

    if not ofqual_candidates:
        print(f"No ofqual candidates found for NOS: {nos_node.nos_id}")
        return []

    nos_embedding = nos_node.embedding
    if nos_embedding is None or len(nos_embedding) == 0:
        # Nothing to rank against: fall back to the SSA-matched Ofquals.
        return ofqual_candidates[:top_k]

    print(f"Embeddings present for NOS: {nos_node.nos_id} and found {len(ofqual_candidates)} candidates")

    scorable = [
        ofqual
        for ofqual in ofqual_candidates
        if ofqual.embedding is not None and len(ofqual.embedding) > 0
    ]
    if not scorable:
        return ofqual_candidates[:top_k]

    # Score every candidate in one vectorised cosine-similarity computation.
    query_vector = np.asarray(nos_embedding, dtype=float)
    candidate_matrix = np.asarray([ofqual.embedding for ofqual in scorable], dtype=float)
    norms = np.linalg.norm(candidate_matrix, axis=1) * np.linalg.norm(query_vector)
    norms[norms == 0] = 1.0  # zero vectors score 0 instead of dividing by zero
    similarities = candidate_matrix @ query_vector / norms

    # Stable sort on the negated scores: highest first, ties in input order.
    ranking = np.argsort(-similarities, kind="stable")
    return [scorable[position] for position in ranking[:top_k]]


def ingest_nos_to_db_with_ofqual(nos_list):
    """
    Save one NOS node per record, link it to its SSA/SubSSA and to relevant Ofquals.

    Runs inside a single ``db.transaction``. Each record is a dict with the
    keys "SSA", "sub_SSA", "nos_id", "title", "overview",
    "performance_criteria", "knowledge_understanding", "industry", "keywords",
    "relevant_roles" and "embedding". Records whose SSA or sub-SSA name has no
    matching node are skipped (and reported) instead of aborting the batch.
    """
    with db.transaction:
        for nos in nos_list:
            # get_or_none returns None on a miss; nodes.get would raise, which
            # made the skip branch below unreachable.
            ssa_node = SSA.nodes.get_or_none(name=nos["SSA"])
            sub_ssa_node = SubSSA.nodes.get_or_none(name=nos["sub_SSA"])
            if not ssa_node or not sub_ssa_node:
                print(f"Skipping NOS {nos['nos_id']}: no SSA/SubSSA node for {nos['SSA']!r} / {nos['sub_SSA']!r}")
                continue

            nos_node = NOS(
                    nos_id=nos["nos_id"],
                    title=nos["title"],
                    overview=nos["overview"],
                    performance_criteria=nos["performance_criteria"],
                    knowledge_criteria=nos["knowledge_understanding"],
                    nos_industry=nos["industry"],
                    keywords=nos["keywords"],
                    relevant_roles=nos["relevant_roles"],
                    embedding=nos["embedding"]
                ).save()

            nos_node.ssa.connect(ssa_node)
            nos_node.sub_ssa.connect(sub_ssa_node)

            # Link the NOS to the Ofquals ranked as relevant, skipping links
            # that already exist.
            relevant_ofquals = find_relevant_ofquals(nos_node)
            for ofqual in relevant_ofquals:
                if not nos_node.ofquals.is_connected(ofqual):
                    nos_node.ofquals.connect(ofqual)

In [43]:
def store_nos_with_ofqual():
    """
    Embed the rows of the global ``sample_nos`` frame and ingest them.

    Builds one text per row (SSA, sub-SSA, industry, title, overview), embeds
    the texts, stores the vectors in a new 'embedding' column on
    ``sample_nos`` (the frame is modified), then hands the records to
    ``ingest_nos_to_db_with_ofqual``.
    """
    nos_texts_for_embeddings = []
    for nos in sample_nos.to_dict(orient="records"):
        nos_texts_for_embeddings.append(
            f"Sector Subject Area: {nos['SSA']}\n\nSub Sector Subject Area: {nos['sub_SSA']}\n\nIndustry: {nos['industry']}\n\nTitle: {nos['title']}\n\nOverview:{nos['overview']}"
        )

    sample_nos['embedding'] = get_batch_openai_embedding(nos_texts_for_embeddings)

    records_with_embeddings = sample_nos.to_dict(orient="records")
    ingest_nos_to_db_with_ofqual(records_with_embeddings)

In [44]:
# Shape of the sampled NOS rows; store_nos_with_ofqual() adds an 'embedding' column.
sample_nos.shape

(6, 10)

In [45]:
# Set up the vector index for NOS nodes, then embed and ingest the sampled NOS
# rows together with their RELATED_TO links to Ofquals.
nos_index = VectorIndex("NOS", index_name="nos_embedding_index")
# nos_index.delete_vector_index()
nos_index.create_vector_index()
# # nos_index.index_exists()
store_nos_with_ofqual()

Vector index 'nos_embedding_index' created successfully.
total batches: 1




Embeddings present for NOS: INSML017 and found 4 candidates
Embeddings present for NOS: INSML021 and found 4 candidates
No ofqual candidates found for NOS: PROMPR4
Embeddings present for NOS: SFJCCBE2.1 and found 4 candidates
Embeddings present for NOS: SFJCCBE4.1 and found 4 candidates
No ofqual candidates found for NOS: SFJHD2


In [46]:
# Count the NOS nodes that carry an embedding.
db.cypher_query("MATCH (n:NOS) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[6]], ['indexed_documents'])

In [42]:
# nos_index.drop_all_vector_records()

All records from 'NOS' have been deleted, but the index remains intact.


## Querying

In [47]:
# Free-text role description used as the search query; it is prefixed with the
# same "Sector Subject Area" / "Sub Sector Subject Area" labels as the texts
# that were embedded for the stored nodes.
query_text = """Sector Subject Area: Business, Administration and Law
Sub Sector Subject Area: Business management

The People Partner works within the People Partnering team to support the delivery of the overarching people plan of the Business Unit and Centrica people strategy.
The People Partner works at pace to diagnose, design and deliver the best People solutions required for delivering the people plan, linking in with People Consultants, Centres of Excellence and other HR shared services to leverage key expertise / ensure consistent delivery.
Working as part of a flexible people partnering team ensuring business intimacy & commerciality to provide business support where needed. Providing professional end to end HR expertise, coaching & support to business leaders."""
# A single text goes in, so the first (only) returned embedding is the query vector.
query_embedding = get_batch_openai_embedding([query_text])[0]

total batches: 1


In [48]:
# Query the NOS vector index for the 10 nearest NOS nodes to the role
# description, together with the Ofqual data linked to each of them.
nos_index = VectorIndex("NOS", index_name="nos_embedding_index")
results = nos_index.retrieve_nos(query_embedding, top_k=10)



In [49]:
# Display the retrieved matches (one dict per NOS, best score first).
results

[{'nos_id': 'SFJCCBE4.1',
  'title': 'Implement Change',
  'ofqual_ids': ['603/2870/8', '603/2870/8', '603/2870/8', '603/2870/8'],
  'ofqual_units': ['L/616/8107', 'J/616/8106', 'L/616/8110', 'Y/616/8109'],
  'ofqual_learning_outcomes': ['[{"description": "Understand the principles of business strategy", "assessment_criteria": ["Explain why strategy is important to a business", "Critically compare distinct types of business strategies", "Evaluate how organisational structure and culture influence business strategy", "Compare different strategies and how these impact on an organisation"]}, {"description": "Use a strategic planning approach to inform business strategy", "assessment_criteria": ["Differentiate between operational, tactical and strategic planning", "Conduct an internal and external environmental scan", "Determine organisation\\u2019s strategic capacity", "Create a strategy framework to communicate vision, values and goals", "Recommend strategic actions based on the applicat

## Delete all nodes from Database

In [29]:
# Clear all nodes and relationships
# WARNING: destructive - this removes every node in the connected database,
# not only the SSA / SubSSA / OFQUAL / NOS nodes created by this notebook.
db.cypher_query("MATCH (n) DETACH DELETE n")

print("All nodes and relationships have been deleted from the database.")


All nodes and relationships have been deleted from the database.
