In [1]:
import os
import tqdm
# Run from the project directory so the relative data paths used later resolve.
# ZAVMO_PROJECT_DIR overrides the default location so the notebook can run on
# machines other than the original author's; when unset, behavior is unchanged.
os.chdir(os.environ.get("ZAVMO_PROJECT_DIR", "/Users/adityachhabra/Github/zavmo/zavmo-api/zavmo"))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load environment variables via python-dotenv; the NEO4J_* values are read
# with os.getenv when the database connection is configured further down.
load_dotenv()
# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns',500)

In [3]:
import pickle
import lzma
import os
from typing import Any

def save_compressed_pickle(data: Any, filepath: str):
    """Pickle ``data`` and write it to ``filepath`` through an LZMA stream.

    Args:
        data: Any picklable object.
        filepath: Destination path; the file is opened in binary write mode.

    The stream uses LZMA preset 9. A confirmation line is printed after the
    file has been closed. Returns ``None``.
    """
    with lzma.open(filepath, 'wb', preset=9) as sink:
        pickle.Pickler(sink).dump(data)

    print(f"Data successfully saved to highly compressed file {filepath}")


def load_compressed_pickle(filepath: str) -> Any:
    """Read an LZMA-compressed pickle file and return the unpickled object.

    Args:
        filepath: Path to the compressed pickle; opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files from
        a trusted source.
    """
    with lzma.open(filepath, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty,
    JSONProperty
)

class NOSNode(StructuredNode):
    """neomodel node for a NOS record, linked to OFQUAL units via MAPS_TO.

    ``nos_id`` is the required, uniquely indexed key. The descriptive fields
    are stored as plain strings, and ``embedding`` is a required array of
    floats.
    """
    # Unique key; the relationship-building Cypher matches NOS nodes on it.
    nos_id   = StringProperty(unique_index=True, required=True)    
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    embedding = ArrayProperty(FloatProperty(), required=True)  # Required embedding vector (array of floats)
    
    # Outgoing MAPS_TO edges to OFQUALUnit nodes.
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """neomodel node for an OFQUAL unit, the target of MAPS_TO edges from NOSNode.

    ``unit_uid`` is the required, uniquely indexed key; ``ofqual_id`` and
    ``unit_id`` are indexed but not unique. ``embedding`` is a required array
    of floats.
    """
    ofqual_id = StringProperty(index=True)
    unit_id = StringProperty(index=True, required=True)  # Indexed but not unique
    unit_uid = StringProperty(unique_index=True, required=True)  # The unique identifier for a unit
    
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    # NOTE(review): near-duplicate of `awarding_organisation` above (US vs UK
    # spelling) - confirm which one the data actually populates before removing either.
    awarding_organization = StringProperty()
    markscheme = JSONProperty()
    
    embedding = ArrayProperty(FloatProperty(), required=True)  # Required embedding vector (array of floats)
    
    # Incoming MAPS_TO edges from NOSNode nodes.
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

### NOS and OFQUAL data prep

In [4]:
# Load the NOS records from the compressed pickle. The path is relative, so it
# resolves against the current working directory. Later cells read the
# 'nos_id' and 'embedding' keys of each record.
nos_data = load_compressed_pickle('../docs/rgcn/nos_data.pkl.xz')

In [5]:
# Load the OFQUAL records from the compressed pickle. The path is relative, so
# it resolves against the current working directory. Later cells read the
# 'unit_uid' and 'embedding' keys of each record.
ofqual_data = load_compressed_pickle('../docs/rgcn/ofqual_data.pkl.xz')

### Connect to the graph database (Neo4j)

In [6]:
# Configure neomodel
from neomodel import config, db

# Fail fast on missing settings: os.getenv returns None for an unset variable,
# which the f-string below would otherwise render as the literal text "None"
# and only surface later as an opaque connection error.
_neo4j_env = {
    name: os.getenv(name)
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
}
_neo4j_missing = [name for name, value in _neo4j_env.items() if not value]
if _neo4j_missing:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_neo4j_missing)}"
    )

# Credentials come only from the environment; never commit them to the notebook.
DATABASE_URL = (
    f'bolt://{_neo4j_env["NEO4J_USERNAME"]}:{_neo4j_env["NEO4J_PASSWORD"]}'
    f'@{_neo4j_env["NEO4J_URI"]}'
)
config.DATABASE_URL = DATABASE_URL

## Connecting NOS to OFQUAL

In [None]:
# Stack the per-record 'embedding' values into one 2-D array per dataset
# (one row per record, assuming each embedding is a flat vector).
nos_embeddings_array = np.vstack([record['embedding'] for record in nos_data])
ofqual_embeddings_array = np.vstack([record['embedding'] for record in ofqual_data])

In [11]:
# Pairwise cosine similarity between every NOS embedding and every OFQUAL
# embedding; rows follow nos_data order and columns follow ofqual_data order.
# Note: this cell only computes similarities - relationships are created later.
print("Calculating similarities and creating relationships...")
all_similarities = cosine_similarity(nos_embeddings_array, ofqual_embeddings_array)

Calculating similarities and creating relationships...


In [15]:
# Build {nos_id: [unit_uid, ...]} holding, for each NOS item, the OFQUAL units
# with the highest similarity scores
top_k = 5
similarity_threshold = 0.0  # Adjust as needed
nos_to_ofqual_map = {}

for nos_idx, nos_item in enumerate(nos_data):
    nos_id = nos_item['nos_id']
    row_scores = all_similarities[nos_idx]

    # Candidate OFQUAL columns: similarity strictly above the threshold
    candidates = np.flatnonzero(row_scores > similarity_threshold)
    if candidates.size == 0:
        # Nothing clears the threshold for this NOS item
        continue

    # Rank candidates by descending similarity and keep at most top_k of them
    descending = np.argsort(-row_scores[candidates])
    top_indices = candidates[descending][:top_k]

    matches = [ofqual_data[idx]['unit_uid'] for idx in top_indices]
    if matches:
        nos_to_ofqual_map[nos_id] = matches

print(f"Found matches for {len(nos_to_ofqual_map)} NOS nodes")

Found matches for 14157 NOS nodes


In [26]:
# Flatten the NOS -> OFQUAL mapping into one {'nos_id', 'unit_uid'} dict per
# relationship, ready to be passed to the Cypher query as a parameter list
params_list = [
    {'nos_id': nos_id, 'unit_uid': unit_uid}
    for nos_id, ofqual_units in nos_to_ofqual_map.items()
    for unit_uid in ofqual_units
]

print(f"List of relationships: {len(params_list)}")

List of relationships: 70785


### Map NOS - OFQUAL

In [27]:
# Attach every (nos_id, unit_uid) pair as a MAPS_TO relationship.
# MATCH drops pairs whose NOS or OFQUAL node does not exist, and MERGE reuses a
# relationship that is already present, so the query returns how many
# relationships it actually ensured rather than assuming one per submitted pair.
query = """
UNWIND $params AS param
MATCH (n:NOSNode {nos_id: param.nos_id})
MATCH (o:OFQUALUnit {unit_uid: param.unit_uid})
MERGE (n)-[r:MAPS_TO]->(o)
RETURN count(r) AS merged
"""

# Execute in batches so no single transaction carries every relationship
batch_size = 1000
# Relationships confirmed by the database (newly created or already present)
total_created = 0

# Number of batches, used for the tqdm progress bar (0 when params_list is empty)
total_batches = (len(params_list) - 1) // batch_size + 1

for i in tqdm.tqdm(range(0, len(params_list), batch_size), total=total_batches, desc="Attaching relationships"):
    batch = params_list[i:i + batch_size]

    with db.transaction:
        rows, _meta = db.cypher_query(query, {'params': batch})

    # Add to the tally only after the transaction block has exited without error
    total_created += rows[0][0] if rows else 0

print(f"Merged {total_created} of {len(params_list)} requested relationships between NOS and OFQUAL units")

# Surface pairs that were silently dropped because a node could not be matched
skipped = len(params_list) - total_created
if skipped:
    print(f"Warning: {skipped} pairs were skipped because a NOS or OFQUAL node was not found")

Attaching relationships: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:58<00:00,  1.21it/s]

Created 70785 relationships between NOS and OFQUAL units



