In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load environment variables via python-dotenv before anything reads them
# (the Neo4j cell further down looks up NEO4J_* variables).
load_dotenv()

# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [10]:
from helpers.chat import get_batch_openai_embedding

### NOS Prep

In [None]:
# Load the NOS documents and embed one concatenated text per document.
nos_df = pd.read_csv("../docs/rgcn/nos.csv")

# Fields that make up the text to embed, in the order they are concatenated.
nos_text_columns = [
    "title",
    "overview",
    "performance_criteria",
    "knowledge_understanding",
    "keywords",
    "relevant_roles",
]

# Empty CSV cells are read as NaN; formatting NaN directly would put the literal
# word "nan" into the text, so missing fields are skipped instead.
# Rows with no missing fields produce exactly the same string as before.
nos_texts = [
    " ".join(str(row[col]) for col in nos_text_columns if pd.notna(row[col]))
    for _, row in nos_df.iterrows()
]

# NOTE(review): presumably returns one embedding per input text, in input order —
# the matching cell relies on that alignment; confirm in helpers.chat.
embeddings_nos = get_batch_openai_embedding(nos_texts)

### Ofqual Prep

In [None]:
# Load the Ofqual units and embed one concatenated text per unit.
ofqual_df = pd.read_csv("../docs/rgcn/ofqual_units.csv")

# Fields that make up the text to embed, in the order they are concatenated.
ofqual_text_columns = [
    "sector_subject_area",
    "overview",
    "unit_title",
    "unit_description",
    "unit_learning_outcomes",
    "qualification_level",
]

# Empty CSV cells are read as NaN; formatting NaN directly would put the literal
# word "nan" into the text, so missing fields are skipped instead.
# Rows with no missing fields produce exactly the same string as before.
ofqual_texts = [
    " ".join(str(row[col]) for col in ofqual_text_columns if pd.notna(row[col]))
    for _, row in ofqual_df.iterrows()
]

# NOTE(review): presumably returns one embedding per input text, in input order —
# the matching cell relies on that alignment; confirm in helpers.chat.
embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

### Get top-k Ofqual Units with Cosine Sim >= threshold

In [30]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Inspect the most recently processed NOS row.
# NOTE(review): `nos_row` is only assigned inside the matching loop further down
# (`nos_row = nos_df.iloc[nos_idx]`); no earlier assignment is visible in this
# notebook, so this cell depends on out-of-order execution and would raise
# NameError on a fresh top-to-bottom run.
nos_row

nos_id                                                               COGSS13
industry                              Safety Services Oil and Gas Extraction
title                      Contribute to the health and safety of the wor...
overview                   This NOS focuses on the individual's role in c...
performance_criteria       - Correctly select and use relevant Personal P...
knowledge_understanding    - Obtain and interpret safety information.  \n...
keywords                   contribute, health, safety, monitor, pollution...
relevant_roles             - Associate Professionals and Technical Occupa...
Name: 1780, dtype: object

In [40]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time


In [41]:
# Matching configuration -- tune both values to your needs.

# Minimum cosine similarity a NOS/Ofqual pair must reach to count as a match.
threshold = 0.7

# Upper bound on the number of matches kept for a single NOS document.
top_k = 5

In [48]:
# Build `results_df`: for every NOS document, the (at most) `top_k` Ofqual units whose
# cosine similarity to it is >= `threshold`, one output row per (NOS, Ofqual) pair.
# Reads `embeddings_nos`, `embeddings_ofqual`, `nos_df`, `ofqual_df`, `threshold`
# and `top_k` from earlier cells.

# Start timer
start_time = time.time()

# Stack the per-document embeddings into one array per corpus.
# NOTE(review): assumes every embedding is a flat vector of the same length and that
# embedding i belongs to row i of the corresponding DataFrame — confirm against
# get_batch_openai_embedding.
nos_embeddings_array = np.vstack(embeddings_nos)
ofqual_embeddings_array = np.vstack(embeddings_ofqual)

# One call yields the full similarity matrix: row i holds the scores of NOS
# document i against every Ofqual unit.
all_similarities = cosine_similarity(nos_embeddings_array, ofqual_embeddings_array)

# Columns present in BOTH frames (e.g. `overview`). When the two rows are merged
# into one dict the Ofqual value overwrites the NOS value under the shared name,
# so the NOS value is additionally stored under `nos_<column>` to avoid losing it.
# Existing output columns keep their current names and contents.
shared_columns = [col for col in nos_df.columns if col in ofqual_df.columns]

# One dict per (NOS, Ofqual) match
similarity_results = []

for nos_idx, similarities in enumerate(all_similarities):
    # Positions of the Ofqual units that reach the threshold for this NOS document
    above_threshold_indices = np.where(similarities >= threshold)[0]
    if len(above_threshold_indices) == 0:
        continue

    scores = similarities[above_threshold_indices]

    # Rank the qualifying units best-first and keep at most top_k of them.
    # Slicing from the front covers "fewer than top_k matches" as well, and
    # top_k == 0 now keeps nothing (a trailing [-top_k:] slice keeps everything).
    top_indices = np.argsort(scores)[::-1][:top_k]
    selected_ofqual_indices = above_threshold_indices[top_indices]
    selected_scores = scores[top_indices]

    nos_row = nos_df.iloc[nos_idx]
    nos_shared_values = {f"nos_{col}": nos_row[col] for col in shared_columns}
    for ofqual_idx, score in zip(selected_ofqual_indices, selected_scores):
        ofqual_row = ofqual_df.iloc[ofqual_idx]
        similarity_results.append({
            **nos_row,
            **ofqual_row,
            **nos_shared_values,
            'similarity_score': float(score),
        })

# Create final DataFrame
results_df = pd.DataFrame(similarity_results)
# Sort by similarity score. With zero matches the frame has no 'similarity_score'
# column and sort_values would raise KeyError, so sorting is skipped in that case.
if not results_df.empty:
    results_df = results_df.sort_values('similarity_score', ascending=False, ignore_index=True)
# Print summary statistics
print(f"Total number of NOS documents: {len(nos_df)}")
print(f"Total number of matches: {len(results_df)}")
print(f"Average number of matches per NOS document: {len(results_df)/len(nos_df):.2f}")
print(f"Processing time: {time.time() - start_time:.2f} seconds")

Total number of NOS documents: 14157
Total number of matches: 262
Average number of matches per NOS document: 0.02
Processing time: 1.35 seconds


In [50]:
# Write the matches to disk (row index omitted) so they can be imported into Neo4j.
relationships_csv_path = '../docs/rgcn/nos_ofqual_relationships.csv'
results_df.to_csv(relationships_csv_path, index=False)

### `NOTE`: Ingest these with NOS embeddings - for vector search

### Wait for IP to be whitelisted

In [None]:
# Create the Neo4j driver from connection settings held in environment variables.
import os  # required for the lookups below; no other cell shown in this notebook imports it

from neo4j import GraphDatabase

# os.environ[...] raises a KeyError naming the missing variable, instead of
# handing None to the driver when a setting is absent.
driver = GraphDatabase.driver(
    os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
)

In [9]:
# Ask the Neo4j server for its info through the driver created above.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt —
# presumably the call hung because the client IP was not yet whitelisted; confirm.
driver.get_server_info()

KeyboardInterrupt: 

In [7]:
# Check that the driver created above can reach the Neo4j server.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt —
# presumably the call hung because the client IP was not yet whitelisted; confirm.
driver.verify_connectivity()

KeyboardInterrupt: 