In [1]:
import os
import tqdm
# Work from the project directory so that later relative paths (e.g. "../docs/rgcn/nos.csv")
# and the `helpers.chat` import resolve.
# The original author's checkout stays the default; set ZAVMO_PROJECT_DIR to run the
# notebook from a different machine without editing this cell.
os.chdir(os.environ.get("ZAVMO_PROJECT_DIR", "/Users/adityachhabra/Github/zavmo/zavmo-api/zavmo"))

In [2]:
import pandas as pd
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (the NEO4J_* settings are read from there further down).
load_dotenv()

# Allow up to 500 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [30]:
import pickle
import lzma
import os
from typing import Any

def save_compressed_pickle(data: Any, filepath: str):
    """Pickle ``data`` and write it to ``filepath`` as an LZMA-compressed stream.

    Args:
        data: Any picklable object.
        filepath: Destination path; the file is opened in ``'wb'`` mode, so an
            existing file at that path is overwritten.

    Returns:
        None. A confirmation line is printed after the file has been closed.
    """
    # preset=9 asks the compressor for its highest numbered level (smallest output, slowest).
    with lzma.open(filepath, mode='wb', preset=9) as compressed_out:
        pickle.Pickler(compressed_out).dump(data)

    print(f"Data successfully saved to highly compressed file {filepath}")


def load_compressed_pickle(filepath: str) -> Any:
    """Read an LZMA-compressed pickle from ``filepath`` and return the stored object.

    Args:
        filepath: Path to a file containing an LZMA-compressed pickle stream.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only open files from a trusted source.
    with lzma.open(filepath, mode='rb') as compressed_in:
        return pickle.Unpickler(compressed_in).load()

In [3]:
from helpers.chat import get_batch_openai_embedding

### Nos Prep

In [4]:
# Load the NOS table and report its (rows, columns) dimensions.
nos_csv_path = "../docs/rgcn/nos.csv"
nos_df = pd.read_csv(nos_csv_path)
print(nos_df.shape)

(14157, 8)


### Nos Embedding

In [10]:
# Columns whose values are joined (space-separated, in this order) into the text embedded for each NOS row.
NOS_TEXT_COLUMNS = ['title', 'overview', 'performance_criteria', 'knowledge_understanding', 'keywords', 'relevant_roles']

# NOTE(review): a missing cell is presumably read as NaN and would be formatted as the
# literal text "nan" — confirm that is acceptable in the embedded text.
nos_texts = [
    " ".join(f"{record[column]}" for column in NOS_TEXT_COLUMNS)
    for _label, record in nos_df.iterrows()
]

# One call for the whole batch; the result is later indexed per row when building nos_data.
embeddings_nos = get_batch_openai_embedding(nos_texts)

### Ofqual Prep

In [11]:
# Load the gzip-compressed Ofqual mark-scheme table.
ofqual_csv_path = r"../agents-markscheme/ofqual_markscheme.csv.gz"
ofqual_df = pd.read_csv(ofqual_csv_path)

# Report how many distinct unit_id and ofqual_id values the table contains.
unit_count = ofqual_df["unit_id"].nunique()
ofqual_id_count = ofqual_df["ofqual_id"].nunique()
print(f"Ofqual Units: {unit_count}")
print(f"Ofqual IDs: {ofqual_id_count}")

Ofqual Units: 17690
Ofqual IDs: 3151


In [12]:
# Number of distinct unit_id values (cross-check of the count printed when the table was loaded).
len(pd.unique(ofqual_df['unit_id']))

17690

In [13]:
# (rows, columns) of the Ofqual table. In the recorded run there are more rows than
# distinct unit_id values, i.e. some unit_id values repeat across rows.
ofqual_df.shape

(24034, 17)

### Ofqual Embedding

In [14]:
# Columns whose values are joined (space-separated, in this order) into the text embedded for each Ofqual row.
OFQUAL_TEXT_COLUMNS = ['sector_subject_area', 'overview', 'unit_title', 'unit_description', 'unit_learning_outcomes', 'qualification_level']

# NOTE(review): a missing cell is presumably read as NaN and would be formatted as the
# literal text "nan" — confirm that is acceptable in the embedded text.
# Each text ends with a single trailing space, as in the original template.
ofqual_texts = [
    " ".join(f"{record[column]}" for column in OFQUAL_TEXT_COLUMNS) + " "
    for _label, record in ofqual_df.iterrows()
]

# One call for the whole batch; the result is later indexed per row when building ofqual_data.
embeddings_ofqual = get_batch_openai_embedding(ofqual_texts)

### Connect to the graph database (Neo4j)

In [6]:
# Configure neomodel
from neomodel import config, db

# Connection settings come from the environment (populated from .env by load_dotenv()).
# Fail early with a clear message instead of building 'bolt://None:None@None' and
# only discovering the problem on the first query.
_neo4j_settings = {
    name: os.getenv(name)
    for name in ("NEO4J_USERNAME", "NEO4J_PASSWORD", "NEO4J_URI")
}
_missing_settings = [name for name, value in _neo4j_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        f"Missing Neo4j environment variable(s): {', '.join(_missing_settings)}"
    )

# To target a local instance, point the NEO4J_* variables at it in .env rather than
# hard-coding credentials in the notebook.
DATABASE_URL = f'bolt://{_neo4j_settings["NEO4J_USERNAME"]}:{_neo4j_settings["NEO4J_PASSWORD"]}@{_neo4j_settings["NEO4J_URI"]}'
config.DATABASE_URL = DATABASE_URL

In [7]:
# Show which server/user the notebook targets without writing the password into the
# saved cell output (DATABASE_URL itself embeds the credentials).
_credentials, _, _host = DATABASE_URL.rpartition("@")
_user = _credentials.split("://", 1)[-1].split(":", 1)[0]
f"bolt://{_user}:***@{_host}"

'bolt://neo4j:***@51.20.45.38:7687'

In [8]:
# Inspect the URL the neomodel `db` object is currently using.
# In the recorded run this printed None even though config.DATABASE_URL had been set —
# presumably because the connection is only established on first use; confirm.
from neomodel import db
print(db.url) 

None


### Delete all the nodes present

In [15]:
# Clear all nodes and relationships
# WARNING: destructive and unguarded — this matches every node in the configured
# database and detaches/deletes it. It runs on every "Run All".
wipe_all_cypher = "MATCH (n) DETACH DELETE n"
db.cypher_query(wipe_all_cypher)

print("All nodes and relationships have been deleted from the database.")

All nodes and relationships have been deleted from the database.


### Delete Ofqual nodes present

In [16]:
# Delete only the nodes labelled OFQUALUnit, together with their relationships.
delete_ofqual_cypher = "MATCH (n:OFQUALUnit) DETACH DELETE n"
db.cypher_query(delete_ofqual_cypher)

print("deleted ofqual nodes from the database.")

deleted ofqual nodes from the database.


### Get top-10 Ofqual Units with Cosine Sim >= t

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
from neomodel import (
    StructuredNode, 
    StringProperty, 
    FloatProperty,
    RelationshipTo,
    RelationshipFrom,
    ArrayProperty,
    JSONProperty
)

class NOSNode(StructuredNode):
    """Graph node holding one NOS record together with its embedding vector.

    The string properties carry the same names as the keys collected into
    ``nos_data`` from ``nos_df``. Each node can be linked to ``OFQUALUnit``
    nodes through an outgoing ``MAPS_TO`` relationship.
    """
    # Identifier of the NOS record; declared required with a unique index.
    nos_id   = StringProperty(unique_index=True, required=True)    
    industry = StringProperty()
    title = StringProperty()
    overview = StringProperty()
    performance_criteria = StringProperty()
    knowledge_understanding = StringProperty()
    keywords = StringProperty()
    relevant_roles = StringProperty()
    embedding = ArrayProperty(FloatProperty(), required=True)  # required list of floats: the embedding computed for this record's text
    
    # Outgoing MAPS_TO edges to OFQUALUnit nodes (mirrored by OFQUALUnit.nos_items).
    ofqual_units = RelationshipTo('OFQUALUnit', 'MAPS_TO')

class OFQUALUnit(StructuredNode):
    """Graph node holding one Ofqual unit record together with its embedding vector.

    The properties carry the same names as the keys collected into
    ``ofqual_data`` from ``ofqual_df``. ``NOSNode`` nodes point at these nodes
    through ``MAPS_TO`` relationships.
    """
    # Indexed but not unique.
    ofqual_id = StringProperty(index=True)
    # Declared required with a unique index.
    # NOTE(review): the loaded table has more rows than distinct unit_id values, so
    # creating one node per row would presumably collide on this constraint — confirm
    # how duplicates are handled before node creation.
    unit_id = StringProperty(unique_index=True, required=True)
    
    overview = StringProperty()
    unit_title = StringProperty()
    unit_description = StringProperty()
    unit_learning_outcomes = StringProperty()
    qualification_type = StringProperty()
    qualification_level = StringProperty()
    assessment_methods = StringProperty()
    sector_subject_area = StringProperty()
    # NOTE(review): both the British and American spellings of this field are declared
    # (see awarding_organization below) and both are populated from same-named columns
    # when building ofqual_data — confirm whether both are needed.
    awarding_organisation = StringProperty()
    total_credits = FloatProperty()
    guided_learning_hours = FloatProperty()
    total_qualification_time = FloatProperty()
    awarding_organization = StringProperty()
    markscheme = JSONProperty()
    unit_uid = StringProperty()
    
    embedding = ArrayProperty(FloatProperty(), required=True)  # required list of floats: the embedding computed for this unit's text
    
    # Incoming MAPS_TO edges from NOSNode (the other end of NOSNode.ofqual_units).
    nos_items = RelationshipFrom('NOSNode', 'MAPS_TO')

In [35]:
# Calls beep() from the third-party `beepy` package — presumably an audible signal
# that the preceding long-running cells have finished; confirm it is still wanted.
import beepy
beepy.beep()

# Create NOS nodes with embeddings

In [31]:
# Prepare the data for batch creation
# Columns copied verbatim from nos_df into each record, in this order.
NOS_NODE_FIELDS = (
    'nos_id', 'industry', 'title', 'overview', 'performance_criteria',
    'knowledge_understanding', 'keywords', 'relevant_roles',
)

# Rows and embeddings are paired by position. Indexing the embeddings with the
# DataFrame index label is only correct for a default 0..n-1 index and would pick
# the wrong vector (or raise) after any filtering or re-indexing of nos_df.
# Assumes embeddings_nos holds one vector per row, in row order.
if len(embeddings_nos) != len(nos_df):
    raise ValueError(
        f"Expected one embedding per NOS row, got {len(embeddings_nos)} embeddings for {len(nos_df)} rows"
    )

nos_data = []
for (_row_label, row), embedding in zip(nos_df.iterrows(), embeddings_nos):
    record = {field: row[field] for field in NOS_NODE_FIELDS}
    record['embedding'] = embedding
    nos_data.append(record)


# Save the data to a pickle file
save_compressed_pickle(nos_data, '../docs/rgcn/nos_data.pkl.xz')

Data successfully saved to highly compressed file ../docs/rgcn/nos_data.pkl.xz


# Create OFQUAL nodes with embeddings

In [34]:
# Build one record per Ofqual row for batch creation.
# Columns copied verbatim from ofqual_df into each record, in this order.
OFQUAL_NODE_FIELDS = (
    'ofqual_id', 'unit_id', 'overview', 'unit_title', 'unit_description',
    'unit_learning_outcomes', 'qualification_type', 'qualification_level',
    'assessment_methods', 'sector_subject_area', 'awarding_organisation',
    'total_credits', 'guided_learning_hours', 'total_qualification_time',
    'awarding_organization', 'markscheme', 'unit_uid',
)

# Rows and embeddings are paired by position. Indexing the embeddings with the
# DataFrame index label is only correct for a default 0..n-1 index and would pick
# the wrong vector (or raise) after any filtering or re-indexing of ofqual_df.
# Assumes embeddings_ofqual holds one vector per row, in row order.
if len(embeddings_ofqual) != len(ofqual_df):
    raise ValueError(
        f"Expected one embedding per Ofqual row, got {len(embeddings_ofqual)} embeddings for {len(ofqual_df)} rows"
    )

ofqual_data = []
for (_row_label, row), embedding in zip(ofqual_df.iterrows(), embeddings_ofqual):
    record = {field: row[field] for field in OFQUAL_NODE_FIELDS}
    record['embedding'] = embedding
    ofqual_data.append(record)

# Persist the records as a compressed pickle.
save_compressed_pickle(ofqual_data, '../docs/rgcn/ofqual_data.pkl.xz')

Data successfully saved to highly compressed file ../docs/rgcn/ofqual_data.pkl.xz
