In [1]:
import os
os.chdir('/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo')

In [2]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
pd.set_option('display.max_columns',500)

In [3]:
from helpers.chat import get_batch_openai_embedding

### Nos Prep

In [4]:
nos_df = pd.read_csv("../docs/rgcn/nos.csv")
print(nos_df.shape)

(14157, 8)


In [None]:
nos_texts = [f"{row['title']} {row['overview']} {row['performance_criteria']} {row['knowledge_understanding']} {row['keywords']} {row['relevant_roles']}" for idx, row in nos_df.iterrows()]

embeddings_nos = get_batch_openai_embedding(nos_texts)

### Ofqual Prep

In [6]:
ofqual_df = pd.read_csv("../docs/rgcn/ofqual_units.csv").drop_duplicates(subset=['unit_id']).reset_index(drop=True)
print(f"Ofqual Units: {ofqual_df.unit_id.nunique()}")
print(f"Ofqual IDs: {ofqual_df.ofqual_id.nunique()}")

Ofqual Units: 14342
Ofqual IDs: 2182


In [7]:
len(ofqual_df['unit_id'].unique())

14343

In [8]:
ofqual_df.shape

(14343, 15)

In [9]:
ofqual_texts = [f"{row['sector_subject_area']} {row['overview']} {row['unit_title']} {row['unit_description']} {row['unit_learning_outcomes']} {row['qualification_level']}" for idx, row in ofqual_df.iterrows()]

embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

### Get top-10 Ofqual Units with Cosine Sim >= t

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty
)

class NOSNode(StructuredNode):
    nos_id   = StringProperty(unique_index=True, required=True)    
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    embedding = ArrayProperty(FloatProperty(), required=True)  # Add embedding vector
    
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    ofqual_id = StringProperty(index=True)
    unit_id = StringProperty(unique_index=True, required=True)
    
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    awarding_organization = StringProperty()
    
    
    embedding = ArrayProperty(FloatProperty(), required=True)  # Add embedding vector
    
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

In [7]:
# Configure neomodel
from neomodel import config
DATABASE_URL = f'bolt://{os.getenv("NEO4J_USERNAME")}:{os.getenv("NEO4J_PASSWORD")}@{os.getenv("NEO4J_URI")}'
# DATABASE_URL = "bolt://neo4j:secretgraph@localhost:7687"
config.DATABASE_URL = DATABASE_URL

In [8]:
DATABASE_URL

'bolt://neo4j:zavmoadmin@51.20.45.38'

In [9]:
from neomodel import db
print(db.url)  # Should print "bolt://neo4j:password@localhost:7687"

None


# Create NOS nodes with embeddings

In [19]:
# Prepare the data for batch creation
nos_data = []
for idx, row in nos_df.iterrows():
    nos_data.append({
        'nos_id': row['nos_id'],
        'industry': row['industry'],
        'title': row['title'],
        'overview': row['overview'],
        'performance_criteria': row['performance_criteria'],
        'knowledge_understanding': row['knowledge_understanding'],
        'keywords': row['keywords'],
        'relevant_roles': row['relevant_roles'],
        'embedding': embeddings_nos[idx]
        })


In [20]:
import tqdm

In [21]:
BATCH_SIZE = 1000

# Process NOS nodes in batches
for i in tqdm.tqdm(range(0, len(nos_data), BATCH_SIZE)):
    batch = nos_data[i:i + BATCH_SIZE]
    with db.transaction:
        batch_nodes = NOSNode.create_or_update(*batch)
    

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [01:52<00:00,  7.50s/it]


# Create OFQUAL nodes with embeddings

In [22]:
# Similarly for OFQUAL nodes
ofqual_data = []
for idx, row in ofqual_df.iterrows():
    ofqual_data.append({
        'ofqual_id': row['ofqual_id'],
        'unit_id': row['unit_id'],
        'overview': row['overview'],
        'unit_title': row['unit_title'],
        'unit_description': row['unit_description'],
        'unit_learning_outcomes': row['unit_learning_outcomes'],
        'qualification_type': row['qualification_type'],
        'qualification_level': row['qualification_level'],
        'assessment_methods': row['assessment_methods'],
        'sector_subject_area': row['sector_subject_area'],
        'awarding_organisation': row['awarding_organisation'],
        'total_credits': row['total_credits'],
        'guided_learning_hours': row['guided_learning_hours'],
        'total_qualification_time': row['total_qualification_time'],
        'awarding_organization': row['awarding_organization'],
        'embedding': embeddings_ofqual[idx]
    })

In [23]:

BATCH_SIZE = 1000

# Process OFQUAL nodes in batches
for i in tqdm.tqdm(range(0, len(ofqual_data), BATCH_SIZE)):
    batch = ofqual_data[i:i + BATCH_SIZE]
    with db.transaction:
        batch_nodes = OFQUALUnit.create_or_update(*batch)


100%|██████████| 15/15 [01:55<00:00,  7.72s/it]


## Connecting NOS to OFQUAL

In [None]:
# Convert all embeddings to numpy arrays at once
nos_embeddings_array = np.vstack(embeddings_nos)
ofqual_embeddings_array = np.vstack(embeddings_ofqual)

# Calculate all cosine similarities at once
print("Calculating similarities and creating relationships...")
all_similarities = cosine_similarity(nos_embeddings_array, ofqual_embeddings_array)

# Track relationship counts
relationship_count = 0

In [16]:
threshold = 0.0
top_k = 5

In [38]:
# Process similarities for each NOS document and create relationships directly
for nos_idx, similarities in enumerate(all_similarities):
    # Get indices where similarity is above threshold

    above_threshold_indices = np.where(similarities >= threshold)[0] 

    
    if len(above_threshold_indices) > 0:
        # Get the similarity scores for these indices
        scores = similarities[above_threshold_indices]
        
        # If we have more matches than top_k, find the indices of top_k highest scores
        if len(above_threshold_indices) > top_k:
            # Get indices of top_k highest scores
            top_indices = np.argsort(scores)[-top_k:]
            # Get the corresponding ofqual indices and scores
            selected_ofqual_indices = above_threshold_indices[top_indices]
        else:
            # Use all matches
            selected_ofqual_indices = above_threshold_indices
        
        # Get the NOS node
        nos_row  = nos_df.iloc[nos_idx]

        nos_node = NOSNode.nodes.get(nos_id=nos_row['nos_id'])
        print(nos_node)
        # Check if the NOS node already has connections to OFQUAL units
        existing_connections = len(list(nos_node.ofqual_units))

        if existing_connections > 0:
            print(f"Existing connections: {existing_connections}")
            continue
        else:
            print(f"Creating new connections for NOS {nos_row['nos_id']}...")

        # Create relationships directly
        for ofqual_idx in selected_ofqual_indices:
            ofqual_row = ofqual_df.iloc[ofqual_idx]
            ofqual_node = OFQUALUnit.nodes.get(unit_id=ofqual_row['unit_id'])
            
            # Connect the nodes
            nos_node.ofqual_units.connect(ofqual_node)
            relationship_count += 1
            
            # Print progress every 1000 relationships
            if relationship_count % 1000 == 0:
                print(f"Created {relationship_count} relationships so far...")

{'nos_id': 'INSML013', 'industry': 'Management and Leadership', 'title': 'Ensure compliance with legal, regulatory, ethical and social requirements', 'overview': 'This standard focuses on ensuring that organizations comply with legal, regulatory, ethical, and social requirements. It encompasses monitoring compliance, developing supporting policies, and rectifying breaches while encouraging a culture of ethical governance.', 'performance_criteria': '- Monitor legal, regulatory, ethical, and social requirements and their impact on the organization.  \n- Evaluate consequences of non-compliance with these requirements.  \n- Develop policies to ensure compliance.  \n- Ensure colleagues understand and apply organizational policies and procedures.  \n- Monitor policy implementation and provide necessary support.  \n- Facilitate reporting of non-compliance concerns.  \n- Identify risks, hazards, and ethical issues; take corrective action.  \n- Encourage knowledge sharing while maintaining conf

In [18]:
# Create vector indexes
from neomodel import db

print("Creating vector indexes...")
# Create vector index for NOS nodes
db.cypher_query("""
CREATE VECTOR INDEX nos_vector_index IF NOT EXISTS
FOR (n:NOSNode)
ON (n.embedding)
OPTIONS {
    indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }
}
""")

# Create vector index for OFQUAL nodes
# db.cypher_query("""
# CREATE VECTOR INDEX ofqual_vector_index IF NOT EXISTS
# FOR (n:OFQUALUnit)
# ON (n.embedding)
# OPTIONS {
#     indexConfig: {
#         `vector.dimensions`: 1536,
#         `vector.similarity_function`: 'cosine'
#     }
# }
# """)

Creating vector indexes...


([], [])

#### Number of documents ingested in nos_vector_index

In [25]:
db.cypher_query("MATCH (n:NOSNode) WHERE n.embedding IS NOT NULL RETURN COUNT(n) AS indexed_documents;")

([[14157]], ['indexed_documents'])

## Retrieval

In [42]:
from typing import List
from openai import OpenAI
import ast


def get_embedding(text: str, model: str = "text-embedding-3-small") -> List[float]:
    """
    Get the embedding of a text using OpenAI's API.
    """
    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
    response = client.embeddings.create(
        input=text,
        model=model
    )
    embedding = response.data[0].embedding
    return embedding

def retrieve_ofquals_from_neo4j(nos_id: str) -> List[str]:
    """Get the ofquals mapped to a nos_id"""
    query = """
    MATCH (n:NOSNode {nos_id: $nos_id})-[:MAPS_TO]->(o:OFQUALUnit)
    RETURN o.unit_id AS unit_id, o.unit_title AS unit_title, 
    o.overview AS overview, 
    o.level AS level, 
    o.qualification_type AS qualification_type, 
    o.qualification_level AS qualification_level, 
    o.awarding_organisation AS awarding_organisation, 
    o.total_credits AS total_credits, 
    o.guided_learning_hours AS guided_learning_hours, 
    o.total_qualification_time AS total_qualification_time, 
    o.unit_learning_outcomes AS learning_outcomes, 
    o.assessment_methods AS assessment_methods,
    o.markscheme AS marksscheme
    """
    
    # Execute the query
    results, _ = db.cypher_query(query, {'nos_id': nos_id})
    
    connected_ofqual_units = [{'unit_id': row[0], 'unit_title': row[1], 'overview': row[2], 'level': row[3], 
                        'qualification_type': row[4], 'qualification_level': row[5], 'awarding_organisation': row[6], 
                        'total_credits': row[7], 'guided_learning_hours': row[8], 'total_qualification_time': row[9], 
                        'learning_outcomes': row[10], 'assessment_methods': row[11], 'marksscheme': ast.literal_eval(row[12])} for row in results]
    
    ## If json item is a string, convert it to a json object
    [res.update({'marksscheme': [json.loads(i) for i in ast.literal_eval(res['marksscheme'])]}) for res in connected_ofqual_units]

    return connected_ofqual_units

def retrieve_nos_from_neo4j(query,index_name='nos_vector_index', top_k=5):
    """Retrieve NOS from Neo4j"""
    query_embedding = get_embedding(query)
    cypher_query = f"""
        CALL db.index.vector.queryNodes('{index_name}', $top_k, $query_embedding) 
            YIELD node, score
            RETURN 
                node.nos_id AS nos_id, 
                node.title AS title, 
                node.performance_criteria AS performance_criteria,
                node.knowledge_understanding AS knowledge_understanding,
                score
            ORDER BY score DESC
        """

    result, columns = db.cypher_query(cypher_query, {"query_embedding": query_embedding, "top_k": top_k})
        
    formatted_results = [dict(zip(columns, row)) for row in result]
        
    return formatted_results[:top_k]

In [33]:
query_text = """The Ethics & Compliance function provides assurance that Centrica operates in a manner consistent with its legal and regulatory obligations. 
The Energy Compliance team is responsible for establishing and maintaining a robust compliance framework for energy and ensuring the governance structure within which the framework sits is effective."""

query_text = "Ethics & Compliance professional in the energy sector, with a focus on establishing and maintaining compliance frameworks, regulatory compliance, and governance structures."

# query_embedding = get_batch_openai_embedding([query_text])[0]

In [34]:
results = retrieve_nos_from_neo4j(query_text, index_name='nos_vector_index', top_k=10)

In [35]:
nos_results = [res['nos_id'] for res in results]
nos_results

['CFASAL021',
 'CCSAPLE12',
 'ASTNDEA1',
 'CFAS7.7',
 'ASTACEA1',
 'ASTH401',
 'CFAMAR10',
 'ASTFM324',
 'CLD YW03',
 'CFAPSU023']

In [43]:
ofqual_results = retrieve_ofquals_from_neo4j('ASTFM405')



In [44]:
ofqual_results

[{'unit_id': 'H/618/8072',
  'unit_title': 'Estimate and Agree Costs with Clients',
  'overview': 'ETCAL Level 3 Diploma in Facilities Management, focusing on establishing a robust framework of assessment principles specific to the facilities management sector.',
  'level': None,
  'qualification_type': 'Vocationally-Related Qualification',
  'qualification_level': 'Level 3',
  'awarding_organisation': '/organisations/RN5135',
  'total_credits': 170.0,
  'guided_learning_hours': 960.0,
  'total_qualification_time': 1700.0,
  'learning_outcomes': '1. Understand how to identify and agree the work to be estimated with relevant others.\r\n  - Explain the procedures for identifying and agreeing the work to be estimated with clients\r\n  - Explain how to identify the required resources from technical and legislative data.',
  'assessment_methods': 'Aural Examination, Coursework, E-assessment, Multiple Choice Examination, Oral Examination, Portfolio of Evidence, Practical Demonstration/Assign

In [45]:
## Convert marksscheme to json object
[res.update({'marksscheme': [json.loads(i) for i in ast.literal_eval(res['marksscheme'])]}) for res in ofqual_results]

[None, None, None, None, None]

In [47]:
criterias = ofqual_results[0]['marksscheme']

In [75]:
level='Remember'
if level:
    level_based_marks_scheme = [item for item in criterias if item['bloom_taxonomy_level'] == level][0]
    criteria     = level_based_marks_scheme['criteria']
    expectations = level_based_marks_scheme['expectations']
    task         = level_based_marks_scheme['task']

    benchmarking_responses = level_based_marks_scheme['benchmarking_responses']
    benchmarking_responses = "\n\n".join([f"   **{item['grade'].upper()}:** {item['example']}" for item in benchmarking_responses])
    
    
    ofqual_based_instructions = (
        # f"- **Assessment Area:** {competency_to_assess['assessment_area']}\n\n"
        f"- **Current Bloom's Taxonomy Level mapped to User facing scale is:** {level} (But While addressing about the level, use the level in User facing scale)\n\n"
        f"- **Criteria:** {criteria}\n\n"
        f"- **Task:** {task}\n\n"
        f"- **Expectations:** {expectations}\n\n"
        f"- **Benchmarking Responses for validation:** \n\n{benchmarking_responses}\n\n"
    )

In [76]:

print(ofqual_based_instructions)

- **Current Bloom's Taxonomy Level mapped to User facing scale is:** Remember (But While addressing about the level, use the level in User facing scale)

- **Criteria:** Recall the procedures for identifying and agreeing the work to be estimated with clients.

- **Task:** List the steps involved in identifying and agreeing costs with clients in facility management.

- **Expectations:** Learners must accurately list the steps involved in the estimation process, demonstrating knowledge of key procedures and terminologies without providing detailed explanations.

- **Benchmarking Responses for validation:** 

   **FAIL:** The steps are not clear.

   **PASS:** 1. Meet with the client 2. Discuss costs 3. Finalize the agreement.

   **MERIT:** 1. Meet with the client to understand their needs 2. Discuss the scope of work 3. Identify resources needed 4. Agree on the costs based on discussions.

   **DISTINCTION:** 1. Schedule an initial meeting with the client to discuss their requirements. 