In [1]:
import os
import tqdm
# Run from the project directory so the relative paths used further down
# (e.g. '../docs/rgcn/ofqual_data.pkl.xz') resolve.
# The directory can be overridden with the ZAVMO_PROJECT_DIR environment variable; it must be
# set in the process environment itself, because load_dotenv() only runs in a later cell.
# The original author's checkout path is kept as the fallback so existing setups keep working.
os.chdir(os.environ.get("ZAVMO_PROJECT_DIR", "/Users/adityachhabra/Github/zavmo/zavmo-api/zavmo"))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably the NEO4J_* settings read further down - confirm against the .env file).
load_dotenv()

# Let pandas display up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

In [3]:
import pickle
import lzma
import os
from typing import Any

def save_compressed_pickle(data: Any, filepath: str):
    """Pickle ``data`` into ``filepath`` through an LZMA stream (preset 9).

    Args:
        data: Any picklable object.
        filepath: Destination path; an existing file is overwritten.

    Prints a confirmation line once the file has been written and closed.
    """
    archive = lzma.open(filepath, mode='wb', preset=9)
    try:
        pickle.dump(data, archive)
    finally:
        # Always release the file handle, even when pickling fails part-way.
        archive.close()

    print(f"Data successfully saved to highly compressed file {filepath}")


def load_compressed_pickle(filepath: str) -> Any:
    """Read back an object from an LZMA-compressed pickle file.

    Args:
        filepath: Path of the compressed pickle file to read.

    Returns:
        The unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    archive = lzma.open(filepath, mode='rb')
    try:
        return pickle.load(archive)
    finally:
        # Close the handle whether or not unpickling succeeded.
        archive.close()

### Ofqual Prep

In [4]:
# Load the OFQUAL records from the compressed pickle (path is relative to the working directory).
ofqual_data = load_compressed_pickle(filepath='../docs/rgcn/ofqual_data.pkl.xz')

### Connect to the graph database (Neo4j)

In [5]:
# Configure neomodel
from neomodel import config, db

# Read the connection settings from the environment and fail fast when one is missing.
# Without this check os.getenv() returns None, the text "None" is interpolated into the URL,
# and the mistake only surfaces later when a query is first run.
_neo4j_env = {
    name: os.getenv(name)
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
}
_missing_env = [name for name, value in _neo4j_env.items() if value is None]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# URL format: bolt://<username>:<password>@<host>:<port>
# Keep real credentials in the environment / .env file, never in the notebook.
DATABASE_URL = (
    f'bolt://{_neo4j_env["NEO4J_USERNAME"]}:{_neo4j_env["NEO4J_PASSWORD"]}'
    f'@{_neo4j_env["NEO4J_URI"]}'
)
config.DATABASE_URL = DATABASE_URL

# Ingestion

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty,
    JSONProperty
)

class NOSNode(StructuredNode):
    """neomodel node model for a NOS record.

    NOTE(review): "NOS" presumably stands for National Occupational Standard - confirm.

    All descriptive fields are stored as strings. ``nos_id`` is the unique key and
    ``embedding`` (a list of floats) is required. Each node can point to
    ``OFQUALUnit`` nodes through outgoing ``MAPS_TO`` relationships.
    """
    # Unique identifier of the record; required and backed by a unique index.
    nos_id   = StringProperty(unique_index=True, required=True)    
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    # Embedding vector for the record; must be supplied (required=True).
    embedding = ArrayProperty(FloatProperty(), required=True)
    
    # Outgoing MAPS_TO relationships to OFQUALUnit nodes.
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """neomodel node model for a single OFQUAL unit.

    ``unit_uid`` is the unique key; ``unit_id`` and ``ofqual_id`` are indexed but
    not unique. ``embedding`` (a list of floats) is required. ``NOSNode`` nodes
    point at this node through ``MAPS_TO`` relationships.

    NOTE(review): both ``awarding_organisation`` and ``awarding_organization`` are
    declared (and both are written by the ingestion cell) - confirm whether the
    two spellings are intentional or one is a leftover.
    """
    # Indexed, non-unique id (presumably of the parent qualification - confirm).
    ofqual_id = StringProperty(index=True)
    # Indexed and required, but no longer unique.
    unit_id = StringProperty(index=True, required=True)
    # The unique identifier of a unit; required and backed by a unique index.
    unit_uid = StringProperty(unique_index=True, required=True)
    
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    # Second spelling of the awarding-organisation field (see class docstring).
    awarding_organization = StringProperty()
    # Declared as a JSON property.
    markscheme = JSONProperty()
    
    # Embedding vector for the unit; must be supplied (required=True).
    embedding = ArrayProperty(FloatProperty(), required=True)
    
    # Incoming MAPS_TO relationships from NOSNode nodes.
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

### Ofqual units (unique index on `unit_uid`)

In [8]:
import json

# Create a unique constraint on unit_uid (IF NOT EXISTS makes this safe to re-run)
db.cypher_query("CREATE CONSTRAINT IF NOT EXISTS FOR (n:OFQUALUnit) REQUIRE n.unit_uid IS UNIQUE")

# Number of units sent to the database per query / transaction
BATCH_SIZE = 100

# Upsert keyed on unit_uid. The same properties are written whether the node was just
# created or already existed, so a single `SET n += param` replaces two identical
# ON CREATE / ON MATCH lists. The text is loop-invariant, so it is built once here.
query = """
UNWIND $params AS param
MERGE (n:OFQUALUnit {unit_uid: param.unit_uid})
SET n += param
"""


def _storable_markscheme(value):
    """Return ``value`` in a form that can be stored as a single node property.

    Neo4j properties cannot hold maps, and ``OFQUALUnit.markscheme`` is declared as a
    ``JSONProperty``, so anything that is not already a string (or None) is serialised
    to JSON text. Strings are passed through unchanged (presumably already JSON - confirm).
    """
    if value is None or isinstance(value, str):
        return value
    return json.dumps(value)


def _ofqual_unit_params(item):
    """Build the Cypher parameter map for one OFQUAL record.

    ``item`` must contain 'unit_id' and 'embedding' (KeyError otherwise); the other
    keys are optional and default to '' (text), 0.0 (numbers) or {} (markscheme).
    When 'unit_uid' is missing or empty it is generated as "<ofqual_id>-<unit_id>"
    and written back onto ``item``, exactly as the original loop did.
    """
    if 'unit_uid' not in item or not item['unit_uid']:
        item['unit_uid'] = f"{item.get('ofqual_id', '')}-{item['unit_id']}"

    return {
        'unit_uid': item['unit_uid'],
        'ofqual_id': item.get('ofqual_id', ''),
        'unit_id': item['unit_id'],
        'overview': item.get('overview', ''),
        'unit_title': item.get('unit_title', ''),
        'unit_description': item.get('unit_description', ''),
        'unit_learning_outcomes': item.get('unit_learning_outcomes', ''),
        'qualification_type': item.get('qualification_type', ''),
        'qualification_level': item.get('qualification_level', ''),
        'assessment_methods': item.get('assessment_methods', ''),
        'sector_subject_area': item.get('sector_subject_area', ''),
        'awarding_organisation': item.get('awarding_organisation', ''),
        'total_credits': item.get('total_credits', 0.0),
        'guided_learning_hours': item.get('guided_learning_hours', 0.0),
        'total_qualification_time': item.get('total_qualification_time', 0.0),
        'awarding_organization': item.get('awarding_organization', ''),
        'markscheme': _storable_markscheme(item.get('markscheme', {})),
        'embedding': item['embedding']
    }


# Process in batches: one UNWIND query per batch, each in its own transaction
for i in tqdm.tqdm(range(0, len(ofqual_data), BATCH_SIZE)):
    batch = ofqual_data[i:i + BATCH_SIZE]
    params_list = [_ofqual_unit_params(item) for item in batch]

    with db.transaction:
        db.cypher_query(query, {'params': params_list})

print(f"Total OFQUAL units processed: {len(ofqual_data)}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 241/241 [21:04<00:00,  5.25s/it]

Total OFQUAL units processed: 24034



