In [1]:
import os
# Work from the Zavmo app directory so that `helpers.*` imports and the relative
# "../docs/..." paths used below resolve. Set ZAVMO_PROJECT_DIR to your own checkout;
# the original author's absolute path is kept only as the fallback.
os.chdir(os.getenv('ZAVMO_PROJECT_DIR', '/Users/adityachhabra/Github/zavmo/zavmo-api/zavmo'))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Read a .env file into the process environment (presumably where the NEO4J_*
# settings read further down come from -- confirm against your local setup).
load_dotenv()

# Let wide DataFrames render up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

In [3]:
from helpers.chat import get_batch_openai_embedding

### NOS Prep

In [4]:
nos_df = pd.read_csv("../docs/rgcn/nos.csv")
print(nos_df.shape)

# Columns concatenated (in this order, space separated) into the text embedded for each NOS row.
NOS_TEXT_COLUMNS = [
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

# Empty CSV cells are loaded as NaN; formatting them directly would put the literal
# word "nan" into the text being embedded, so they are rendered as empty strings instead.
nos_texts = [
    " ".join("" if pd.isna(value) else str(value) for value in values)
    for values in nos_df[NOS_TEXT_COLUMNS].itertuples(index=False, name=None)
]

# One embedding per entry of nos_texts, in the same order (later cells pair them with
# nos_df rows by position).
embeddings_nos = get_batch_openai_embedding(nos_texts)

(14157, 8)


### Ofqual Prep

In [5]:
ofqual_df = pd.read_csv("../docs/rgcn/ofqual_units.csv")
print(f"Ofqual Units: {ofqual_df.unit_id.nunique()}")
print(f"Ofqual IDs: {ofqual_df.ofqual_id.nunique()}")

# Columns concatenated (in this order, space separated) into the text embedded for each Ofqual unit.
OFQUAL_TEXT_COLUMNS = [
    'sector_subject_area',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_level',
]

# Empty CSV cells are loaded as NaN; formatting them directly would put the literal
# word "nan" into the text being embedded, so they are rendered as empty strings instead.
ofqual_texts = [
    " ".join("" if pd.isna(value) else str(value) for value in values)
    for values in ofqual_df[OFQUAL_TEXT_COLUMNS].itertuples(index=False, name=None)
]

# One embedding per entry of ofqual_texts, in the same order (later cells pair them with
# ofqual_df rows by position).
embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

Ofqual Units: 14342
Ofqual IDs: 2647


### Get the top-10 Ofqual units per NOS with cosine similarity >= threshold t

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

In [8]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty
)

class NOSNode(StructuredNode):
    """Graph node for one NOS record, keyed by ``nos_id``.

    The text fields mirror the columns of the NOS CSV loaded in this notebook;
    ``embedding`` holds the vector computed from that text. Each node can point at
    ``OFQUALUnit`` nodes through outgoing ``MAPS_TO`` relationships.
    """

    # Identifier: unique-indexed and mandatory.
    nos_id = StringProperty(unique_index=True, required=True)

    # Descriptive text fields (all optional).
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()

    # Embedding vector stored as a list of floats; mandatory.
    embedding = ArrayProperty(FloatProperty(), required=True)

    # Outgoing MAPS_TO edges to the matched Ofqual units.
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """Graph node for one Ofqual unit, keyed by ``unit_id``.

    ``ofqual_id`` is indexed but not unique (several units can share it). The
    remaining fields mirror the columns of the Ofqual units CSV loaded in this
    notebook, and ``embedding`` holds the vector computed from the unit's text.
    ``NOSNode`` nodes reach this node through incoming ``MAPS_TO`` relationships.
    """

    # Identifiers: the qualification id is indexed, the unit id is the unique key.
    ofqual_id = StringProperty(index=True)
    unit_id = StringProperty(unique_index=True, required=True)

    # Descriptive text fields (all optional).
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    # NOTE(review): both the British and American spellings exist as separate
    # properties (see awarding_organization below) -- confirm both are needed.
    awarding_organisation = StringProperty()

    # Numeric attributes of the unit / qualification.
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    awarding_organization = StringProperty()

    # Embedding vector stored as a list of floats; mandatory.
    embedding = ArrayProperty(FloatProperty(), required=True)

    # Incoming MAPS_TO edges from the NOS nodes that map to this unit.
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

In [11]:
# Configure neomodel
from neomodel import config

# Connection details are read from the environment only; credentials and hosts must
# never be written into the notebook (not even in commented-out lines).
_missing_neo4j_vars = [
    name
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
    if not os.getenv(name)
]
if _missing_neo4j_vars:
    # Fail here with a clear message instead of configuring "bolt://None:None@None".
    raise RuntimeError(
        f"Missing Neo4j environment variables: {', '.join(_missing_neo4j_vars)}"
    )

# NEO4J_URI is placed right after "bolt://user:password@", so it is expected to be
# "host:port" without a scheme.
config.DATABASE_URL = f'bolt://{os.getenv("NEO4J_USERNAME")}:{os.getenv("NEO4J_PASSWORD")}@{os.getenv("NEO4J_URI")}'

In [13]:
from neomodel import db
# Show only the part after the credentials ("host:port"): the full URL embeds the
# password and notebook outputs get saved and shared. The recorded output of this
# cell is None, i.e. db.url was not populated yet when it ran.
print(db.url.rsplit('@', 1)[-1] if db.url else db.url)

None


# Create NOS nodes with embeddings

In [None]:
# Prepare the data for batch creation
NOS_NODE_COLUMNS = [
    'nos_id',
    'industry',
    'title',
    'overview',
    'performance_criteria',
    'knowledge_understanding',
    'keywords',
    'relevant_roles',
]

# Rows and embeddings are paired by position, so the counts must agree.
if len(embeddings_nos) != len(nos_df):
    raise ValueError(
        f"Expected one embedding per NOS row, got {len(embeddings_nos)} embeddings "
        f"for {len(nos_df)} rows"
    )

nos_data = []
for values, embedding in zip(
    nos_df[NOS_NODE_COLUMNS].itertuples(index=False, name=None), embeddings_nos
):
    # Empty CSV cells arrive as NaN. Pass None instead so the property is treated as
    # missing rather than being written as NaN (which, as far as neomodel's
    # StringProperty goes, would end up as the text "nan" -- TODO confirm).
    properties = {
        column: (None if pd.isna(value) else value)
        for column, value in zip(NOS_NODE_COLUMNS, values)
    }
    properties['embedding'] = embedding
    nos_data.append(properties)

# Create all NOS nodes in a single transaction; nos_nodes is the list of nodes
# returned by create_or_update.
with db.transaction:
    nos_nodes = NOSNode.create_or_update(*nos_data)

# Create OFQUAL nodes with embeddings

In [None]:
# Prepare the OFQUAL unit data for batch creation
OFQUAL_NODE_COLUMNS = [
    'ofqual_id',
    'unit_id',
    'overview',
    'unit_title',
    'unit_description',
    'unit_learning_outcomes',
    'qualification_type',
    'qualification_level',
    'assessment_methods',
    'sector_subject_area',
    'awarding_organisation',
    'total_credits',
    'guided_learning_hours',
    'total_qualification_time',
    'awarding_organization',
]

# Rows and embeddings are paired by position, so the counts must agree.
if len(embeddings_ofqual) != len(ofqual_df):
    raise ValueError(
        f"Expected one embedding per Ofqual row, got {len(embeddings_ofqual)} embeddings "
        f"for {len(ofqual_df)} rows"
    )

ofqual_data = []
for values, embedding in zip(
    ofqual_df[OFQUAL_NODE_COLUMNS].itertuples(index=False, name=None), embeddings_ofqual
):
    # Empty CSV cells arrive as NaN. Pass None instead so the property is treated as
    # missing rather than being written as NaN (which, as far as neomodel's
    # StringProperty goes, would end up as the text "nan" -- TODO confirm).
    properties = {
        column: (None if pd.isna(value) else value)
        for column, value in zip(OFQUAL_NODE_COLUMNS, values)
    }
    properties['embedding'] = embedding
    ofqual_data.append(properties)

# Create all OFQUAL nodes in a single transaction; ofqual_nodes is the list of nodes
# returned by create_or_update.
with db.transaction:
    ofqual_nodes = OFQUALUnit.create_or_update(*ofqual_data)

In [None]:
# Matching parameters. Neither name was assigned anywhere in this notebook, so the
# cell raised NameError on a fresh kernel.
top_k = 10       # keep at most the 10 best Ofqual units per NOS (per the section heading)
threshold = 0.8  # NOTE(review): placeholder -- the intended cut-off is not recorded in this notebook; confirm before writing relationships

# Start the clock that the "Processing time" line of the summary cell reads.
start_time = time.time()

# Convert all embeddings to numpy arrays at once (one row per document)
nos_embeddings_array = np.vstack(embeddings_nos)
ofqual_embeddings_array = np.vstack(embeddings_ofqual)

# create_or_update() returned plain lists, which cannot be indexed by id.
# Build id -> node lookups so nodes can be fetched by nos_id / unit_id.
nos_node_by_id = {str(node.nos_id): node for node in nos_nodes}
ofqual_node_by_unit_id = {str(node.unit_id): node for node in ofqual_nodes}

# Ids in row order, so that row position i of the similarity matrix maps to an id.
nos_ids = nos_df['nos_id'].astype(str).tolist()
ofqual_unit_ids = ofqual_df['unit_id'].astype(str).tolist()

# Calculate all cosine similarities at once: rows are NOS documents, columns are Ofqual units
print("Calculating similarities and creating relationships...")
all_similarities = cosine_similarity(nos_embeddings_array, ofqual_embeddings_array)

# Track relationship counts
relationship_count = 0

# Process similarities for each NOS document and create relationships directly
for nos_idx, similarities in enumerate(all_similarities):
    # Column positions of the Ofqual units whose similarity reaches the threshold
    above_threshold_indices = np.where(similarities >= threshold)[0]
    if len(above_threshold_indices) == 0:
        continue

    if len(above_threshold_indices) > top_k:
        # argsort is ascending, so the last top_k positions are the highest scores
        scores = similarities[above_threshold_indices]
        selected_ofqual_indices = above_threshold_indices[np.argsort(scores)[-top_k:]]
    else:
        # Few enough matches: keep them all
        selected_ofqual_indices = above_threshold_indices

    nos_node = nos_node_by_id[nos_ids[nos_idx]]

    for ofqual_idx in selected_ofqual_indices:
        ofqual_node = ofqual_node_by_unit_id[ofqual_unit_ids[ofqual_idx]]

        # Connect the nodes with a MAPS_TO relationship
        nos_node.ofqual_units.connect(ofqual_node)
        relationship_count += 1

        # Print progress every 1000 relationships
        if relationship_count % 1000 == 0:
            print(f"Created {relationship_count} relationships so far...")



In [None]:
# Create the vector indexes, then report how the relationship pass went
from neomodel import db

print("Creating vector indexes...")

# Cosine-similarity vector index over NOSNode.embedding.
# Assumes the stored embeddings are 1536-dimensional -- TODO confirm against the
# embedding model behind get_batch_openai_embedding.
nos_index_statement = """
CREATE VECTOR INDEX nos_vector_index IF NOT EXISTS
FOR (n:NOSNode)
ON (n.embedding)
OPTIONS {
    indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }
}
"""

# Same index definition for OFQUALUnit.embedding.
ofqual_index_statement = """
CREATE VECTOR INDEX ofqual_vector_index IF NOT EXISTS
FOR (n:OFQUALUnit)
ON (n.embedding)
OPTIONS {
    indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }
}
"""

# IF NOT EXISTS makes both statements safe to re-run.
for index_statement in (nos_index_statement, ofqual_index_statement):
    db.cypher_query(index_statement)

# Summary statistics. relationship_count comes from the relationship-creation cell;
# start_time is expected to have been captured with time.time() before that pass began.
print(f"Total number of NOS documents: {len(nos_df)}")
print(f"Total number of relationships created: {relationship_count}")
print(f"Average number of relationships per NOS document: {relationship_count/len(nos_df):.2f}")
print(f"Processing time: {time.time() - start_time:.2f} seconds")