In [1]:
import os
import tqdm
# Run from the project directory so the relative paths used in this notebook
# (e.g. '../docs/rgcn/nos_data.pkl.xz') resolve. Set ZAVMO_PROJECT_DIR to point
# at a different checkout; the original author's path is kept as the default
# so existing setups behave exactly as before.
os.chdir(os.getenv("ZAVMO_PROJECT_DIR", "/Users/adityachhabra/Github/zavmo/zavmo-api/zavmo"))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) so os.getenv() can see them.
load_dotenv()
# Allow pandas to display up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

In [3]:
import pickle
import lzma
import os
from typing import Any

def save_compressed_pickle(data: Any, filepath: str):
    """Pickle ``data`` and write it to ``filepath`` as an LZMA-compressed stream.

    The file is opened in binary write mode with LZMA preset 9 and a
    confirmation message is printed once the stream has been closed.

    Args:
        data: Any picklable object.
        filepath: Destination path of the compressed pickle.
    """
    with lzma.open(filepath, mode='wb', preset=9) as xz_stream:
        pickle.Pickler(xz_stream).dump(data)

    confirmation = f"Data successfully saved to highly compressed file {filepath}"
    print(confirmation)


def load_compressed_pickle(filepath: str) -> Any:
    """Read an LZMA-compressed pickle from ``filepath`` and return the object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.

    Args:
        filepath: Path of the compressed pickle to read.

    Returns:
        The unpickled object.
    """
    with lzma.open(filepath, mode='rb') as xz_stream:
        return pickle.Unpickler(xz_stream).load()

### NOS Data

In [4]:
# NOS records to ingest; the ingestion loop slices this sequence and reads
# 'nos_id' and 'embedding' (plus optional text fields) from each item.
nos_data = load_compressed_pickle(filepath='../docs/rgcn/nos_data.pkl.xz')

### Connect to the graph database (Neo4j)

In [6]:
# Configure neomodel
from neomodel import config, db

# Connection settings are read from the environment. os.getenv() returns None
# for an unset variable, which the f-string below would silently turn into an
# unusable URL such as "bolt://None:None@None" -- so fail fast with a clear
# message instead of failing later on the first query.
_neo4j_settings = {
    name: os.getenv(name)
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
}
_missing_settings = [name for name, value in _neo4j_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing Neo4j environment variable(s): " + ", ".join(_missing_settings)
    )

# NOTE(review): NEO4J_URI is interpolated after "user:password@", so it is
# presumably "host:port" without a scheme -- confirm against the .env file.
DATABASE_URL = f'bolt://{os.getenv("NEO4J_USERNAME")}:{os.getenv("NEO4J_PASSWORD")}@{os.getenv("NEO4J_URI")}'
config.DATABASE_URL = DATABASE_URL

### Delete all existing nodes and relationships

In [7]:
# DESTRUCTIVE: matches every node in the database and deletes it together
# with all of its relationships (DETACH DELETE).
wipe_everything = "MATCH (n) DETACH DELETE n"
db.cypher_query(wipe_everything)

print("All nodes and relationships have been deleted from the database.")

All nodes and relationships have been deleted from the database.


### Delete existing OFQUAL unit nodes

In [8]:
# Remove only the nodes labelled OFQUALUnit, together with their relationships.
wipe_ofqual_units = "MATCH (n:OFQUALUnit) DETACH DELETE n"
db.cypher_query(wipe_ofqual_units)

print("deleted ofqual nodes from the database.")

deleted ofqual nodes from the database.


# Ingestion

In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty,
    JSONProperty
)

class NOSNode(StructuredNode):
    """neomodel schema for a NOS record stored as a graph node.

    NOTE(review): "NOS" presumably stands for National Occupational
    Standard -- confirm.

    ``nos_id`` is the required, uniquely indexed identifier. The descriptive
    fields are optional strings, and ``embedding`` is a required array of
    floats. Each node can point to ``OFQUALUnit`` nodes through outgoing
    ``MAPS_TO`` relationships.
    """
    # Required identifier with a unique index.
    nos_id   = StringProperty(unique_index=True, required=True)    
    # Optional descriptive text fields.
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    embedding = ArrayProperty(FloatProperty(), required=True)  # Required embedding vector (array of floats)
    
    # Outgoing MAPS_TO relationships: (NOSNode)-[:MAPS_TO]->(OFQUALUnit).
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """neomodel schema for an OFQUAL unit stored as a graph node.

    ``unit_uid`` is the required, uniquely indexed identifier; ``unit_id`` is
    required and indexed but not unique, and ``ofqual_id`` is indexed only.
    The remaining fields are optional, except ``embedding`` which is a
    required array of floats. Incoming ``MAPS_TO`` relationships link
    ``NOSNode`` nodes to this unit.
    """
    ofqual_id = StringProperty(index=True)
    unit_id = StringProperty(index=True, required=True)  # Indexed and required, but not unique
    unit_uid = StringProperty(unique_index=True, required=True)  # The unique identifier for a unit
    
    # Optional descriptive fields.
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    awarding_organisation = StringProperty()
    # Optional numeric fields, stored as floats.
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    # NOTE(review): `awarding_organization` differs from `awarding_organisation`
    # above only in spelling; both are separate properties on the node.
    # Confirm which one the ingestion populates and whether both are needed.
    awarding_organization = StringProperty()
    markscheme = JSONProperty()
    
    embedding = ArrayProperty(FloatProperty(), required=True)  # Required embedding vector (array of floats)
    
    # Incoming MAPS_TO relationships: (NOSNode)-[:MAPS_TO]->(OFQUALUnit).
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

### NOS nodes (unique constraint on `nos_id`)

In [11]:
# Create a uniqueness constraint on NOSNode.nos_id before ingesting.
# IF NOT EXISTS makes the statement safe to re-run when the constraint is already there.
db.cypher_query(
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:NOSNode) REQUIRE n.nos_id IS UNIQUE"
)

([], [])

In [12]:
# Upsert the NOS records into Neo4j in batches, using raw Cypher through
# neomodel's db interface.
BATCH_SIZE = 100

# The statement is the same for every batch, so it is built once outside the
# loop. MERGE matches on nos_id; the SET clause then runs whether the node was
# just created or already existed. (The previous ON CREATE SET / ON MATCH SET
# branches assigned exactly the same properties, so a single SET is equivalent.)
query = """
UNWIND $params AS param
MERGE (n:NOSNode {nos_id: param.nos_id})
SET
    n.industry = param.industry,
    n.title = param.title,
    n.overview = param.overview,
    n.performance_criteria = param.performance_criteria,
    n.knowledge_understanding = param.knowledge_understanding,
    n.keywords = param.keywords,
    n.relevant_roles = param.relevant_roles,
    n.embedding = param.embedding
"""

# Process in batches
for i in tqdm.tqdm(range(0, len(nos_data), BATCH_SIZE)):
    batch = nos_data[i:i + BATCH_SIZE]

    # One parameter map per record. 'nos_id' and 'embedding' are mandatory
    # (a missing key raises KeyError); the text fields default to ''.
    params_list = [
        {
            'nos_id': item['nos_id'],
            'industry': item.get('industry', ''),
            'title': item.get('title', ''),
            'overview': item.get('overview', ''),
            'performance_criteria': item.get('performance_criteria', ''),
            'knowledge_understanding': item.get('knowledge_understanding', ''),
            'keywords': item.get('keywords', ''),
            'relevant_roles': item.get('relevant_roles', ''),
            'embedding': item['embedding'],
        }
        for item in batch
    ]

    # Run each batch inside its own neomodel transaction.
    with db.transaction:
        db.cypher_query(query, {'params': params_list})

print(f"Total nodes processed: {len(nos_data)}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 142/142 [08:46<00:00,  3.71s/it]

Total nodes processed: 14157



