In [1]:
import os
# Working directory that later relative paths (e.g. "../docs/rgcn/nos.csv") resolve against.
# Set ZAVMO_PROJECT_DIR to run this notebook on another machine; the original
# location is kept as the fallback so existing runs behave exactly as before.
PROJECT_DIR = os.environ.get(
    "ZAVMO_PROJECT_DIR",
    '/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo',
)
os.chdir(PROJECT_DIR)

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the NEO4J_* values read later via
# os.getenv come from — confirm the .env location.
load_dotenv()
# Allow pandas to display up to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns',500)

In [3]:
from helpers.chat import get_batch_openai_embedding

### NOS Prep

In [4]:
# Load the NOS table from CSV and print its (rows, columns) shape.
# NOTE(review): the next cell rebinds nos_df from an Excel workbook, so this
# frame is not the one the rest of the notebook works with.
nos_df = pd.read_csv("../docs/rgcn/nos.csv")
print(nos_df.shape)

(14157, 8)


In [5]:
# Workbook holding the NOS records (it includes the SSA / predicted_sub_SSA columns).
# Set NOS_PREDICTIONS_XLSX to point at the file on another machine; the original
# absolute path remains the fallback so existing runs are unchanged.
NOS_PREDICTIONS_PATH = os.getenv(
    "NOS_PREDICTIONS_XLSX",
    "/Users/mumtaz/Documents/projects/zavmo/missed_nos_records_with_predictions.xlsx",
)
nos_df = pd.read_excel(NOS_PREDICTIONS_PATH)
nos_df.shape

(14157, 10)

In [6]:
# Column names of the NOS frame; the text-building and node-building cells below index rows by these.
nos_df.columns

Index(['nos_id', 'industry', 'title', 'overview', 'performance_criteria',
       'knowledge_understanding', 'keywords', 'relevant_roles', 'SSA',
       'predicted_sub_SSA'],
      dtype='object')

In [7]:
# Columns whose values are concatenated (space-separated, in this order) into the
# text that gets embedded for each NOS record.
NOS_TEXT_COLUMNS = [
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

nos_texts = []
for _, nos_row in nos_df.iterrows():
    nos_texts.append(" ".join(f"{nos_row[column]}" for column in NOS_TEXT_COLUMNS))

# One embedding per NOS text, requested in a single batch call.
embeddings_nos = get_batch_openai_embedding(nos_texts)

### Ofqual Prep

In [8]:
# Read the Ofqual units, keep only the first 10000 rows, then keep one row per
# unit_id and renumber the index from 0.
ofqual_raw = pd.read_csv("../docs/rgcn/ofqual_units.csv")
ofqual_df = (
    ofqual_raw
    .head(10000)
    .drop_duplicates(subset=['unit_id'])
    .reset_index(drop=True)
)

# Report how many distinct units and qualifications remain.
print(f"Ofqual Units: {ofqual_df['unit_id'].nunique()}")
print(f"Ofqual IDs: {ofqual_df['ofqual_id'].nunique()}")

Ofqual Units: 5357
Ofqual IDs: 592


In [9]:
# Count the distinct unit_id values.
# NOTE(review): the recorded output (5358) is one more than the nunique() figure
# printed by the previous cell (5357). unique() keeps a missing value as its own
# entry while nunique() skips it by default, so one row likely has a NaN unit_id — confirm.
len(ofqual_df['unit_id'].unique())

5358

In [10]:
# (rows, columns) of the de-duplicated Ofqual frame.
ofqual_df.shape

(5358, 15)

In [11]:
# Columns whose values are concatenated (space-separated, in this order) into the
# text that gets embedded for each Ofqual unit.
OFQUAL_TEXT_COLUMNS = [
    'sector_subject_area',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_level',
]

ofqual_texts = []
for _, unit_row in ofqual_df.iterrows():
    ofqual_texts.append(" ".join(f"{unit_row[column]}" for column in OFQUAL_TEXT_COLUMNS))

# One embedding per Ofqual text, requested in a single batch call.
embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

### Get top-10 Ofqual Units with Cosine Sim >= t

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

In [13]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty
)

class NOSNode(StructuredNode):
    """Graph node for a single NOS record, identified by ``nos_id``.

    The string properties mirror columns of the NOS frame loaded in this
    notebook; ``embedding`` stores the vector computed for the record.
    Outgoing ``MAPS_TO`` relationships point at ``OFQUALUnit`` nodes.
    """
    # Unique, required identifier of the NOS record
    nos_id   = StringProperty(unique_index=True, required=True)    
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    # Required list of floats holding the record's embedding vector
    embedding = ArrayProperty(FloatProperty(), required=True)
    
    # NOS -[:MAPS_TO]-> OFQUALUnit
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """Graph node for a single Ofqual unit, identified by ``unit_id``.

    The properties mirror columns of the Ofqual frame loaded in this notebook;
    ``embedding`` stores the vector computed for the unit. Incoming ``MAPS_TO``
    relationships originate from ``NOSNode`` nodes.
    """
    # Qualification identifier; indexed but not unique (many units share one)
    ofqual_id = StringProperty(index=True)
    # Unique, required identifier of the unit
    unit_id = StringProperty(unique_index=True, required=True)
    
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    # NOTE(review): both the British and the American spelling of this field are
    # declared (see awarding_organization below); the retrieval query in this
    # notebook reads the American one — confirm whether both are needed.
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    awarding_organization = StringProperty()
    
    # Required list of floats holding the unit's embedding vector
    embedding = ArrayProperty(FloatProperty(), required=True)
    
    # NOSNode -[:MAPS_TO]-> OFQUALUnit, seen from the unit's side
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

In [14]:
# Configure neomodel
from neomodel import config

# Connection settings come from the environment. Fail fast when one is missing:
# otherwise the URL would silently become "bolt://None:None@None".
_NEO4J_ENV_VARS = ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
_missing_neo4j_vars = [name for name in _NEO4J_ENV_VARS if not os.getenv(name)]
if _missing_neo4j_vars:
    raise RuntimeError(
        "Missing Neo4j settings in the environment: " + ", ".join(_missing_neo4j_vars)
    )

# Format: bolt://<username>:<password>@<host[:port]>
DATABASE_URL = f'bolt://{os.getenv("NEO4J_USERNAME")}:{os.getenv("NEO4J_PASSWORD")}@{os.getenv("NEO4J_URI")}'
config.DATABASE_URL = DATABASE_URL

In [15]:
import re

# Show which server/user the notebook targets without writing the password into
# the saved cell output: everything between "user:" and "@" is masked.
re.sub(r'(?<=://)([^:@/]+):[^@]*@', r'\1:***@', DATABASE_URL)

'bolt://neo4j:***@51.20.45.38'

In [16]:
from neomodel import db
# NOTE(review): the recorded output of this cell is None, not a bolt:// URL.
# Presumably db.url is only populated once a connection has been opened — confirm
# against the neomodel version in use. When it is populated it would include the password.
print(db.url)

None


# Create NOS nodes with embeddings

In [19]:
# Prepare the data for batch creation: one property dict per NOS row, carrying
# the row's embedding under 'embedding'.
NOS_NODE_FIELDS = [
    'nos_id',
    'industry',
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

# The embeddings list is positional (i-th text -> i-th vector), so rows are paired
# with vectors by position rather than by DataFrame index label; a filtered or
# re-indexed frame would otherwise pick the wrong vector or raise.
if len(embeddings_nos) != len(nos_df):
    raise ValueError(
        f"Expected one embedding per NOS row, got {len(embeddings_nos)} embeddings "
        f"for {len(nos_df)} rows"
    )

nos_data = []
for (_, row), embedding in zip(nos_df.iterrows(), embeddings_nos):
    record = {field: row[field] for field in NOS_NODE_FIELDS}
    record['embedding'] = embedding
    nos_data.append(record)


In [22]:
import tqdm

In [23]:
# Number of node dicts written per transaction.
BATCH_SIZE = 1000

# Write the NOS nodes chunk by chunk; each chunk is created/updated inside its
# own transaction and tqdm reports progress per chunk.
nos_batch_starts = range(0, len(nos_data), BATCH_SIZE)
for start in tqdm.tqdm(nos_batch_starts):
    stop = start + BATCH_SIZE
    nos_batch = nos_data[start:stop]
    with db.transaction:
        batch_nodes = NOSNode.create_or_update(*nos_batch)
    

100%|██████████| 15/15 [07:44<00:00, 30.96s/it]


# Create OFQUAL nodes with embeddings

In [24]:
# Similarly for OFQUAL nodes: one property dict per Ofqual row, carrying the
# row's embedding under 'embedding'.
OFQUAL_NODE_FIELDS = [
    'ofqual_id',
    'unit_id',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_type',
    'qualification_level',
    'assessment_methods',
    'sector_subject_area',
    'awarding_organisation',
    'total_credits',
    'guided_learning_hours',
    'total_qualification_time',
    'awarding_organization',
]

# The embeddings list is positional (i-th text -> i-th vector), so rows are paired
# with vectors by position rather than by DataFrame index label; a filtered or
# re-indexed frame would otherwise pick the wrong vector or raise.
if len(embeddings_ofqual) != len(ofqual_df):
    raise ValueError(
        f"Expected one embedding per Ofqual row, got {len(embeddings_ofqual)} embeddings "
        f"for {len(ofqual_df)} rows"
    )

ofqual_data = []
for (_, row), embedding in zip(ofqual_df.iterrows(), embeddings_ofqual):
    record = {field: row[field] for field in OFQUAL_NODE_FIELDS}
    record['embedding'] = embedding
    ofqual_data.append(record)

In [25]:

# Number of node dicts written per transaction.
BATCH_SIZE = 1000

# Write the OFQUAL nodes chunk by chunk; each chunk is created/updated inside its
# own transaction and tqdm reports progress per chunk.
ofqual_batch_starts = range(0, len(ofqual_data), BATCH_SIZE)
for start in tqdm.tqdm(ofqual_batch_starts):
    stop = start + BATCH_SIZE
    ofqual_batch = ofqual_data[start:stop]
    with db.transaction:
        batch_nodes = OFQUALUnit.create_or_update(*ofqual_batch)


100%|██████████| 6/6 [02:09<00:00, 21.61s/it]


In [26]:
# Stack each list of embedding vectors into a 2-D array (one row per document).
nos_embeddings_array, ofqual_embeddings_array = (
    np.vstack(vectors) for vectors in (embeddings_nos, embeddings_ofqual)
)

# Full NOS x Ofqual cosine-similarity matrix: row i holds the similarities of
# NOS document i to every Ofqual unit.
print("Calculating similarities and creating relationships...")
all_similarities = cosine_similarity(X=nos_embeddings_array, Y=ofqual_embeddings_array)

# Running total of relationships, incremented by the relationship-creation loop.
relationship_count = 0

Calculating similarities and creating relationships...


In [27]:
# Minimum cosine similarity for an Ofqual unit to be linked to a NOS document.
threshold = 0.5
# Maximum number of Ofqual units linked to a single NOS document.
top_k = 10

In [None]:
# Link each NOS document to at most `top_k` Ofqual units whose cosine similarity
# is >= `threshold`, by creating MAPS_TO relationships between the stored nodes.

# Position in ofqual_df -> OFQUALUnit node. The same units are selected for many
# NOS documents, so each one is fetched from the database only once.
ofqual_node_cache = {}

for nos_idx, similarities in enumerate(all_similarities):
    # Positions of the Ofqual units that clear the threshold for this NOS document
    above_threshold_indices = np.where(similarities >= threshold)[0]
    if len(above_threshold_indices) == 0:
        continue

    if len(above_threshold_indices) > top_k:
        # More candidates than allowed: keep the top_k highest-scoring ones
        scores = similarities[above_threshold_indices]
        selected_ofqual_indices = above_threshold_indices[np.argsort(scores)[-top_k:]]
    else:
        # Use all matches
        selected_ofqual_indices = above_threshold_indices

    # Get the NOS node
    nos_node = NOSNode.nodes.get(nos_id=nos_df.iloc[nos_idx]['nos_id'])

    # All links of one NOS document are written in a single transaction, in line
    # with how the node batches are written.
    with db.transaction:
        for ofqual_idx in selected_ofqual_indices:
            cache_key = int(ofqual_idx)
            ofqual_node = ofqual_node_cache.get(cache_key)
            if ofqual_node is None:
                unit_id = ofqual_df.iloc[cache_key]['unit_id']
                ofqual_node = OFQUALUnit.nodes.get(unit_id=unit_id)
                ofqual_node_cache[cache_key] = ofqual_node

            # Connect the nodes
            nos_node.ofqual_units.connect(ofqual_node)
            relationship_count += 1

            # Print progress every 1000 relationships
            if relationship_count % 1000 == 0:
                print(f"Created {relationship_count} relationships so far...")

Created 1000 relationships so far...


In [18]:
# Create vector indexes
from neomodel import db

# Length of the vectors stored in the `embedding` property; the index dimension
# must match it.
EMBEDDING_DIMENSIONS = 1536


def create_vector_index(index_name, label, dimensions=EMBEDDING_DIMENSIONS):
    """Create a cosine vector index on the ``embedding`` property of ``label`` nodes.

    The statement uses ``IF NOT EXISTS``, so running it again leaves an existing
    index untouched.

    Args:
        index_name: Name of the index to create. Interpolated into the statement,
            so pass only trusted identifiers.
        label: Node label whose ``embedding`` property is indexed. Also interpolated.
        dimensions: Vector length the index is configured for.

    Returns:
        Whatever ``db.cypher_query`` returns for the statement.
    """
    return db.cypher_query(f"""
CREATE VECTOR INDEX {index_name} IF NOT EXISTS
FOR (n:{label})
ON (n.embedding)
OPTIONS {{
    indexConfig: {{
        `vector.dimensions`: {int(dimensions)},
        `vector.similarity_function`: 'cosine'
    }}
}}
""")


print("Creating vector indexes...")
# Create vector index for NOS nodes
create_vector_index("nos_vector_index", "NOSNode")

# Create vector index for OFQUAL nodes
create_vector_index("ofqual_vector_index", "OFQUALUnit")

Creating vector indexes...


([], [])

#### Number of NOSNode documents with an embedding (eligible for nos_vector_index)

In [62]:
# Count NOSNode nodes whose `embedding` property is set. This checks the node
# property directly; it does not read the vector index itself.
db.cypher_query("MATCH (n:NOSNode) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[0]], ['indexed_documents'])

#### Number of OFQUALUnit documents with an embedding (eligible for ofqual_vector_index)

In [63]:
# Count OFQUALUnit nodes whose `embedding` property is set. This checks the node
# property directly; it does not read the vector index itself.
db.cypher_query("MATCH (n:OFQUALUnit) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[0]], ['indexed_documents'])

## Retrieval

In [52]:
def retrieve_nos(index_name, query_embedding, top_k=5):
    """Find the NOS nodes nearest to ``query_embedding`` and their mapped Ofqual units.

    The vector index is asked for the ``top_k`` nearest nodes; each hit is then
    joined (``OPTIONAL MATCH``) to the Ofqual units it maps to. The result has
    one row per (NOS, Ofqual unit) pair, so it can contain more than ``top_k``
    rows and the same ``nos_id`` can appear several times. A NOS without any
    mapped unit still yields one row, with the Ofqual fields set to ``None``.

    Args:
        index_name: Name of the vector index to query.
        query_embedding: Query vector, passed to the database as a parameter.
        top_k: Number of nearest nodes requested from the index.

    Returns:
        A list of dicts keyed by the returned column names, ordered by
        descending ``score``.
    """
    # The index name is sent as a query parameter, like the other arguments,
    # instead of being formatted into the Cypher text.
    query = """
        CALL db.index.vector.queryNodes($index_name, $top_k, $query_embedding)
        YIELD node, score
        OPTIONAL MATCH (node)-[:MAPS_TO]->(ofqual:OFQUALUnit)
        RETURN
            node.nos_id AS nos_id,
            node.title AS title,
            node.performance_criteria AS performance_criteria,
            node.knowledge_understanding AS knowledge_understanding,
            score,
            ofqual.ofqual_id AS ofqual_id,
            ofqual.unit_id AS unit_id,
            ofqual.unit_title AS unit_title,
            ofqual.unit_description AS unit_description,
            ofqual.unit_learning_outcomes AS unit_learning_outcomes,
            ofqual.qualification_type AS qualification_type,
            ofqual.qualification_level AS qualification_level,
            ofqual.awarding_organization AS awarding_organization,
            ofqual.sector_subject_area AS sector_subject_area
        ORDER BY score DESC
    """
    params = {
        "index_name": index_name,
        "query_embedding": query_embedding,
        "top_k": top_k,
    }

    rows, columns = db.cypher_query(query, params)

    # One dict per result row, keyed by column name
    return [dict(zip(columns, row)) for row in rows]

In [None]:
# Longer passage kept for reference only: it is not embedded.
# NOTE(review): presumably the text the shorter query below was distilled from — confirm.
# It used to be assigned to query_text and immediately overwritten.
source_text = """The Ethics & Compliance function provides assurance that Centrica operates in a manner consistent with its legal and regulatory obligations. 
The Energy Compliance team is responsible for establishing and maintaining a robust compliance framework for energy and ensuring the governance structure within which the framework sits is effective."""

# The query that is actually embedded and used for retrieval.
query_text = "Ethics & Compliance professional in the energy sector, with a focus on establishing and maintaining compliance frameworks, regulatory compliance, and governance structures."
query_embedding = get_batch_openai_embedding([query_text])[0]

In [57]:
# Query the NOS vector index for the 10 nearest nodes to the query embedding.
# retrieve_nos joins each hit to its mapped Ofqual units, so the list holds one
# row per (NOS, Ofqual unit) pair and can be longer than 10.
results = retrieve_nos('nos_vector_index', query_embedding, top_k=10)
results

[{'nos_id': 'CFASAL021',
  'title': 'Ensure compliance with legal, regulatory and ethical requirements',
  'performance_criteria': '- Source relevant information on legal, regulatory, and ethical requirements.  \n- Monitor and evaluate the impact of these requirements on operations.  \n- Develop effective policies and procedures to meet requirements.  \n- Ensure stakeholders understand and implement the policies.  \n- Monitor the application of policies and provide support.  \n- Foster openness regarding compliance issues.  \n- Identify and correct compliance failures.  \n- Analyze reasons for non-compliance and adjust policies accordingly.  \n- Report failures to relevant stakeholders.',
  'knowledge_understanding': '- Legal, regulatory, and ethical requirements in the sales sector.  \n- Sources of information and expertise related to compliance.  \n- Organisational policies concerning compliance.  \n- Procedures for addressing non-compliance.  \n- Current and emerging social concerns

In [None]:
# `results` holds one row per (NOS, Ofqual unit) pair, so a nos_id can repeat.
# Keep only the first occurrence of each id, in retrieval order, so that a
# position in this list is the rank of that NOS.
nos_results = list(dict.fromkeys(res['nos_id'] for res in results))
nos_results

In [None]:
# NOS ids expected to be retrieved for the query, parsed from a comma-separated list.
expected_ids_csv = "CFAGOR3, ASTFM324, LANEM17, CFAGOR5, FSPCOMP4, FSPCOMP16, CFAGOR6, LANEM18"
expected_results = expected_ids_csv.split(', ')

# Retrieved ids that are also expected, in retrieval order.
list(filter(lambda nos_id: nos_id in expected_results, nos_results))

In [None]:
# For every retrieved id that is also expected, print the position of its first
# occurrence in nos_results.
expected_hits = (nos_id for nos_id in nos_results if nos_id in expected_results)
for hit in expected_hits:
    print(nos_results.index(hit))