In [1]:
import os

# Later cells import `helpers.chat` and read "../docs/..." relative to the
# working directory, so the notebook must run from the project directory.
# Set ZAVMO_PROJECT_DIR to override the original author's local checkout path.
os.chdir(os.environ.get('ZAVMO_PROJECT_DIR', '/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo'))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Read variables from a .env file into the process environment
load_dotenv()
# Let wide frames display up to 500 columns before pandas truncates them
pd.options.display.max_columns = 500

In [3]:
from helpers.chat import get_batch_openai_embedding

### NOS Prep

In [4]:
# Keep only the first 1,000 rows of the NOS export for this run
nos_df = pd.read_csv("../docs/rgcn/nos.csv").iloc[:1000]
print(nos_df.shape)

(1000, 8)


In [5]:
# Columns joined (space-separated, in this order) into the text sent for embedding.
# NOTE(review): a missing value would be rendered as the literal text "nan" —
# confirm these columns contain no nulls.
NOS_TEXT_FIELDS = [
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

nos_texts = [
    " ".join(f"{record[field]}" for field in NOS_TEXT_FIELDS)
    for _, record in nos_df.iterrows()
]

embeddings_nos = get_batch_openai_embedding(nos_texts)

### Ofqual Prep

In [45]:
# First 10,000 rows of the Ofqual export, reduced to one row per unit_id and
# re-indexed 0..n-1 so row positions line up with the embeddings list.
ofqual_df = (
    pd.read_csv("../docs/rgcn/ofqual_units.csv")
    .head(10000)
    .drop_duplicates(subset=['unit_id'])
    .reset_index(drop=True)
)
print(f"Ofqual Units: {ofqual_df['unit_id'].nunique()}")
print(f"Ofqual IDs: {ofqual_df['ofqual_id'].nunique()}")

Ofqual Units: 5357
Ofqual IDs: 592


In [9]:
# Distinct unit_id values, counting a missing value (if any) as its own entry
ofqual_df['unit_id'].nunique(dropna=False)

5358

In [10]:
# (rows, columns) of the frame after de-duplicating on unit_id
ofqual_df.shape

(5358, 15)

In [11]:
# Columns joined (space-separated, in this order) into the text sent for embedding.
# NOTE(review): a missing value would be rendered as the literal text "nan" —
# confirm these columns contain no nulls.
OFQUAL_TEXT_FIELDS = [
    'sector_subject_area',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_level',
]

ofqual_texts = [
    " ".join(f"{record[field]}" for field in OFQUAL_TEXT_FIELDS)
    for _, record in ofqual_df.iterrows()
]

embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

### Get top-10 Ofqual Units with Cosine Sim >= t

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

In [17]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty
)

class NOSNode(StructuredNode):
    """Graph node holding one NOS record together with its embedding vector.

    Nodes are populated from the rows of the NOS DataFrame and are uniquely
    identified by ``nos_id``.
    """
    # Unique key for the node; lookups elsewhere in the notebook use it
    nos_id   = StringProperty(unique_index=True, required=True)    
    # Descriptive text fields copied from the source columns of the same names
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    # Embedding of the record's text, stored as a list of floats (mandatory)
    embedding = ArrayProperty(FloatProperty(), required=True)  # Add embedding vector
    
    # Outgoing MAPS_TO edges to the OFQUAL units matched to this NOS record
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """Graph node holding one Ofqual unit together with its embedding vector.

    Nodes are populated from the rows of the Ofqual DataFrame and are uniquely
    identified by ``unit_id``; ``ofqual_id`` is indexed but not unique.
    """
    # Indexed, non-unique identifier (many units can share one ofqual_id)
    ofqual_id = StringProperty(index=True)
    # Unique key for the node; lookups elsewhere in the notebook use it
    unit_id = StringProperty(unique_index=True, required=True)
    
    # Descriptive fields copied from the source columns of the same names
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    # British spelling; filled from the source column of the same name
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    # American spelling; a separate property filled from its own source column
    awarding_organization = StringProperty()
    
    # Embedding of the unit's text, stored as a list of floats (mandatory)
    embedding = ArrayProperty(FloatProperty(), required=True)  # Add embedding vector
    
    # Incoming MAPS_TO edges from the NOS records matched to this unit
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

In [18]:
# Configure neomodel
from neomodel import config

# Fail fast when a connection setting is absent; otherwise the f-string below
# would silently embed the text "None" in the URL and the failure would only
# surface later as an opaque connection error.
_missing_settings = [
    name
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
    if os.getenv(name) is None
]
if _missing_settings:
    raise RuntimeError(
        f"Missing Neo4j environment variables: {', '.join(_missing_settings)}"
    )

# NEO4J_URI supplies the part after "@" (the server address) in the Bolt URL.
DATABASE_URL = f'bolt://{os.getenv("NEO4J_USERNAME")}:{os.getenv("NEO4J_PASSWORD")}@{os.getenv("NEO4J_URI")}'
config.DATABASE_URL = DATABASE_URL

In [19]:
from urllib.parse import urlsplit

# Show where we are connecting, with the password masked so the credential is
# not written into the saved notebook output.
_url_parts = urlsplit(DATABASE_URL)
f"{_url_parts.scheme}://{_url_parts.username}:***@{_url_parts.netloc.rpartition('@')[2]}"

'bolt://neo4j:***@51.20.45.38'

In [20]:
from urllib.parse import urlsplit

from neomodel import db

# Confirm neomodel picked up the configured URL. The password is masked so the
# credential is not written into the saved notebook output.
_db_url_parts = urlsplit(db.url)
print(f"{_db_url_parts.scheme}://{_db_url_parts.username}:***@{_db_url_parts.netloc.rpartition('@')[2]}")

bolt://neo4j:***@51.20.45.38


# Create NOS nodes with embeddings

In [14]:
# Prepare the data for batch creation
NOS_NODE_FIELDS = [
    'nos_id',
    'industry',
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

# Embeddings are paired with rows by position (not by DataFrame index label),
# so the pairing stays correct even if nos_df is filtered or re-indexed.
assert len(embeddings_nos) == len(nos_df), "embeddings_nos is out of sync with nos_df"

nos_data = [
    {**{field: row[field] for field in NOS_NODE_FIELDS}, 'embedding': embedding}
    for (_, row), embedding in zip(nos_df.iterrows(), embeddings_nos)
]

# Create all NOS nodes in a single transaction
with db.transaction:
    nos_nodes = NOSNode.create_or_update(*nos_data)

# Create OFQUAL nodes with embeddings

In [21]:
# Similarly for OFQUAL nodes
OFQUAL_NODE_FIELDS = [
    'ofqual_id',
    'unit_id',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_type',
    'qualification_level',
    'assessment_methods',
    'sector_subject_area',
    'awarding_organisation',
    'total_credits',
    'guided_learning_hours',
    'total_qualification_time',
    'awarding_organization',
]

# Embeddings are paired with rows by position (not by DataFrame index label),
# so the pairing does not depend on ofqual_df having a 0..n-1 index.
assert len(embeddings_ofqual) == len(ofqual_df), "embeddings_ofqual is out of sync with ofqual_df"

ofqual_data = [
    {**{field: row[field] for field in OFQUAL_NODE_FIELDS}, 'embedding': embedding}
    for (_, row), embedding in zip(ofqual_df.iterrows(), embeddings_ofqual)
]

# Create all OFQUAL nodes in a single transaction
with db.transaction:
    ofqual_nodes = OFQUALUnit.create_or_update(*ofqual_data)

In [22]:
# Stack each collection of embedding vectors into a single numpy array
nos_embeddings_array, ofqual_embeddings_array = (
    np.vstack(vectors) for vectors in (embeddings_nos, embeddings_ofqual)
)

# One pairwise call: rows follow the NOS array, columns follow the OFQUAL array
print("Calculating similarities and creating relationships...")
all_similarities = cosine_similarity(X=nos_embeddings_array, Y=ofqual_embeddings_array)

# Running total of relationships, incremented by the linking loop
relationship_count = 0

Calculating similarities and creating relationships...


In [23]:
# Matching settings: minimum cosine similarity for a match, and the maximum
# number of OFQUAL units linked to each NOS document.
threshold, top_k = 0.5, 10

In [48]:
# Link every NOS document to its best-matching OFQUAL units: keep matches with
# similarity >= threshold, capped at the top_k highest-scoring ones.

# Start the tally here so re-running this cell reports this run's count only.
relationship_count = 0

for nos_idx, similarities in enumerate(all_similarities):
    # Positions (columns) of OFQUAL units whose similarity clears the threshold
    above_threshold_indices = np.where(similarities >= threshold)[0]
    if len(above_threshold_indices) == 0:
        continue

    if len(above_threshold_indices) > top_k:
        # argsort is ascending, so the last top_k entries are the best scores
        scores = similarities[above_threshold_indices]
        selected_ofqual_indices = above_threshold_indices[np.argsort(scores)[-top_k:]]
    else:
        # Few enough matches: use them all
        selected_ofqual_indices = above_threshold_indices

    # Similarity rows are in nos_df order, so the position identifies the NOS row
    nos_row = nos_df.iloc[nos_idx]
    nos_node = NOSNode.nodes.get(nos_id=nos_row['nos_id'])

    for ofqual_idx in selected_ofqual_indices:
        # Similarity columns are in ofqual_df order
        ofqual_row = ofqual_df.iloc[ofqual_idx]
        ofqual_node = OFQUALUnit.nodes.get(unit_id=ofqual_row['unit_id'])

        # Connect the nodes
        nos_node.ofqual_units.connect(ofqual_node)
        relationship_count += 1

        # Print progress every 1000 relationships
        if relationship_count % 1000 == 0:
            print(f"Created {relationship_count} relationships so far...")

NOS CFAUE3 already has 10 connections. Not creating new connections...
NOS CFAUE6 already has 10 connections. Not creating new connections...
NOS IMIHR15 already has 10 connections. Not creating new connections...
NOS CFAUE7 already has 10 connections. Not creating new connections...
NOS ASTAA1 already has 10 connections. Not creating new connections...
NOS ASTACEA1 already has 10 connections. Not creating new connections...
NOS ASTAG12 already has 10 connections. Not creating new connections...
NOS ASTASPS34 already has 10 connections. Not creating new connections...
NOS ASTASPS35 already has 4 connections. Not creating new connections...
NOS ASTASPS36 already has 1 connections. Not creating new connections...
NOS ASTATT1 already has 10 connections. Not creating new connections...
NOS ASTATT10 already has 10 connections. Not creating new connections...
NOS ASTATT2 already has 10 connections. Not creating new connections...
NOS ASTATT3 already has 10 connections. Not creating new conne

In [25]:
# Create vector indexes
from neomodel import db


def create_vector_index(index_name, label, dimensions=1536):
    """Create a cosine-similarity vector index on ``embedding`` if it is absent.

    Args:
        index_name: Name given to the index.
        label: Node label whose ``embedding`` property is indexed.
        dimensions: Length of the stored embedding vectors.

    Returns:
        The ``(rows, columns)`` tuple returned by ``db.cypher_query``.

    Note:
        ``index_name`` and ``label`` are interpolated directly into the
        statement text, so only pass trusted, hard-coded identifiers.
    """
    return db.cypher_query(f"""
CREATE VECTOR INDEX {index_name} IF NOT EXISTS
FOR (n:{label})
ON (n.embedding)
OPTIONS {{
    indexConfig: {{
        `vector.dimensions`: {dimensions},
        `vector.similarity_function`: 'cosine'
    }}
}}
""")


print("Creating vector indexes...")
# Create vector index for NOS nodes
create_vector_index("nos_vector_index", "NOSNode")

# Create vector index for OFQUAL nodes (last expression, so its result is displayed)
create_vector_index("ofqual_vector_index", "OFQUALUnit")

Creating vector indexes...


([], [])

#### Number of NOSNode documents with an embedding (eligible for nos_vector_index)

In [26]:
# Count NOSNode nodes that carry an embedding property; the result is a
# (rows, column_names) tuple.
db.cypher_query("MATCH (n:NOSNode) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[1000]], ['indexed_documents'])

#### Number of OFQUALUnit documents with an embedding (eligible for ofqual_vector_index)

In [27]:
# Count OFQUALUnit nodes that carry an embedding property; the result is a
# (rows, column_names) tuple.
db.cypher_query("MATCH (n:OFQUALUnit) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[5358]], ['indexed_documents'])

## Retrieval

In [52]:
def retrieve_nos(index_name, query_embedding, top_k=5):
    """Vector-search NOS nodes and return them joined with their mapped OFQUAL units.

    Args:
        index_name: Name of the Neo4j vector index to query.
        query_embedding: Embedding vector to search with.
        top_k: Number of nearest nodes requested from the index.

    Returns:
        A list of dicts keyed by the RETURN column aliases, ordered by score
        descending. Because of the OPTIONAL MATCH, each matched node yields one
        row per MAPS_TO-linked OFQUAL unit (OFQUAL columns are null when it has
        none), so the list can contain more than ``top_k`` rows.
    """
    # The index name is sent as a query parameter rather than spliced into the
    # Cypher text, so a name containing quotes cannot alter the statement.
    query = """
        CALL db.index.vector.queryNodes($index_name, $top_k, $query_embedding)
        YIELD node, score
        OPTIONAL MATCH (node)-[:MAPS_TO]->(ofqual:OFQUALUnit)
        RETURN
            node.nos_id AS nos_id,
            node.title AS title,
            node.performance_criteria AS performance_criteria,
            node.knowledge_understanding AS knowledge_understanding,
            score,
            ofqual.ofqual_id AS ofqual_id,
            ofqual.unit_id AS unit_id,
            ofqual.unit_title AS unit_title,
            ofqual.unit_description AS unit_description,
            ofqual.unit_learning_outcomes AS unit_learning_outcomes,
            ofqual.qualification_type AS qualification_type,
            ofqual.qualification_level AS qualification_level,
            ofqual.awarding_organization AS awarding_organization,
            ofqual.sector_subject_area AS sector_subject_area
        ORDER BY score DESC
    """
    params = {
        "index_name": index_name,
        "top_k": top_k,
        "query_embedding": query_embedding,
    }

    result, columns = db.cypher_query(query, params)

    # Pair each result row with the column aliases to get one dict per row
    return [dict(zip(columns, row)) for row in result]

In [53]:
# Free-text query (a sector line followed by a role description) used to
# exercise retrieval; it is embedded with the same helper used for the documents.
query_text = """Business management

The People Partner works within the People Partnering team to support the delivery of the overarching people plan of the Business Unit and Centrica people strategy.
The People Partner works at pace to diagnose, design and deliver the best People solutions required for delivering the people plan, linking in with People Consultants, Centres of Excellence and other HR shared services to leverage key expertise / ensure consistent delivery.
Working as part of a flexible people partnering team ensuring business intimacy & commerciality to provide business support where needed. Providing professional end to end HR expertise, coaching & support to business leaders."""
# The helper takes a batch (list) of texts; take the single resulting vector
query_embedding = get_batch_openai_embedding([query_text])[0]

In [54]:
# Ask the NOS vector index for the 10 nearest nodes to the query embedding;
# each returned row pairs a NOS node with one of its linked OFQUAL units.
results = retrieve_nos('nos_vector_index', query_embedding, top_k=10)
results

[{'nos_id': 'CFABAH112',
  'title': 'Support partnership working',
  'performance_criteria': '- Confirm corporate policies for third-party arrangements  \n- Identify potential third-party relationships  \n- Negotiate within job role authority and corporate policies  \n- Communicate benefits of arrangements to stakeholders  \n- Ensure compliance with legal and regulatory requirements',
  'knowledge_understanding': '- Corporate policies related to third-party arrangements  \n- Types of arrangements and relationships with third parties  \n- Negotiation methods and techniques  \n- Responsibilities and authority limits of the job role  \n- Methods for identifying and evaluating benefits and disadvantages  \n- Legal and regulatory requirements for partnerships  \n- Consequences of non-compliance with regulations',
  'score': 0.7665119171142578,
  'ofqual_id': '601/3686/8',
  'unit_id': 'D/506/2167',
  'unit_title': 'Use service partnerships to deliver customer service',
  'unit_description':