In [1]:
# Switch the working directory to the classification module, presumably so
# the project-local `CrossValidation` package imported by later cells resolves.
# NOTE(review): this absolute path is machine-specific; the notebook will not
# run on another machine without editing it.
import os
os.chdir('/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/classification')

In [5]:
from CrossValidation.classifier import *
from CrossValidation.crossval import *
from CrossValidation.evaluation import *

In [18]:
import pandas as pd
import numpy as np
from IPython.display import Markdown
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# The API key is read from the environment rather than hard-coded in the
# notebook (os.getenv yields None when the variable is not set).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [19]:
def get_batch_list(texts, max_batch_length=30000, max_batch_size=2048):
    """
    Split texts into consecutive batches bounded by characters and item count.

    Texts keep their original order. A text longer than ``max_batch_length``
    characters is truncated to that length, so every text fits in a batch on
    its own.

    The default character budget is a conservative stand-in for a token
    budget: OpenAI's text-embedding-3-small accepts up to 8192 tokens per
    input and, at a rough ~4 characters per token, ~32,000 characters should
    be safe. The default item cap mirrors the embeddings endpoint's limit on
    the number of inputs in a single request.

    Args:
        texts (iterable[str]): Texts to group. Items must be strings.
        max_batch_length (int): Maximum total number of characters per batch,
            and the length any single text is truncated to.
        max_batch_size (int): Maximum number of texts per batch.
    Returns:
        list[list[str]]: The batches, in input order. Empty input gives [].
    Raises:
        ValueError: If ``max_batch_length`` or ``max_batch_size`` is not positive.
    """
    if max_batch_length <= 0:
        raise ValueError("max_batch_length must be a positive integer")
    if max_batch_size <= 0:
        raise ValueError("max_batch_size must be a positive integer")

    batches = []
    current_batch = []
    current_batch_length = 0

    for text in texts:
        # If a single text is too long, truncate it so it fits in one batch
        if len(text) > max_batch_length:
            text = text[:max_batch_length]
        text_length = len(text)

        # Start a new batch when this text would push the current one over
        # either limit. An empty batch is never flushed, which avoids
        # emitting empty batches.
        batch_is_full = (
            current_batch_length + text_length > max_batch_length
            or len(current_batch) >= max_batch_size
        )
        if current_batch and batch_is_full:
            batches.append(current_batch)
            current_batch = []
            current_batch_length = 0

        # Add the text to the current batch
        current_batch.append(text)
        current_batch_length += text_length

    # Add the last batch if it's not empty
    if current_batch:
        batches.append(current_batch)

    return batches

def get_batch_openai_embedding(texts: list, model="text-embedding-3-small", **kwargs):
    """
    Embed a list of texts through the OpenAI embeddings endpoint.

    The texts are grouped with ``get_batch_list`` and one request is sent per
    batch through the module-level ``client``. The number of batches is
    printed before the first request.

    Args:
        texts (list): Texts to embed.
        model (str): Name of the embedding model.
        **kwargs: Extra keyword arguments forwarded to
            ``client.embeddings.create``.
    Returns:
        list[list]: One embedding per text, gathered batch by batch in the
        order the API returns them.
    """
    text_batches = get_batch_list(texts)
    print(f"total batches: {len(text_batches)}")

    embeddings = []
    for batch in text_batches:
        response = client.embeddings.create(model=model, input=batch, **kwargs)
        embeddings.extend(item.embedding for item in response.data)
    return embeddings

## Ofqual Structured Data with SSAs and sub-SSAs

In [36]:
ofqual = pd.read_csv("/Users/mumtaz/Documents/projects/zavmo/zavmo-api/docs/nos-ofqual/ofqual.csv")

In [37]:
ofqual.head(3)

Unnamed: 0,ofqual_id,overview,id,title,description,learning_outcomes,sub_SSA,SSA
0,601/0405/3,This is an entry-level qualification that enab...,A/001/0001,Constructing halving joints,Learners will learn how to construct halving j...,"[{""description"": ""Understand the principles of...",Building and construction,"Construction, Planning and the Built Environment"
1,601/0405/3,This is an entry-level qualification that enab...,A/001/0002,Constructing housing joints,Learners will learn how to construct housing j...,"[{""description"": ""Understand the principles of...",Building and construction,"Construction, Planning and the Built Environment"
2,601/0405/3,This is an entry-level qualification that enab...,A/001/0003,Painting techniques,Learners will acquire essential painting techn...,"[{""description"": ""Demonstrate key painting tec...",Building and construction,"Construction, Planning and the Built Environment"


In [38]:
# For every unit `id` that occurs on more than one row, collect the titles,
# qualification numbers and sub-SSAs it appears with (first 10 ids only).
duplicate_ids = (
    ofqual.loc[ofqual.duplicated(subset=['id'], keep=False)]
    .groupby('id')[['title', 'ofqual_id', 'sub_SSA']]
    .agg(list)
    .head(10)
)

In [39]:
# Inspect the first duplicated unit id: the qualifications it belongs to and
# the titles it carries in each of them.
# `duplicate_ids` is indexed by the unit id (strings), so select the first row
# by position with `.iloc[0]`; a bare `[0]` relies on the deprecated fallback
# that treats an integer key as a position on a non-integer index.
print(duplicate_ids['ofqual_id'].iloc[0])
duplicate_ids['title'].iloc[0]

['600/7531/4', '501/0093/2', '600/6880/2', '601/2066/6', '601/2065/4', '600/1624/3', '501/2252/6', '600/1134/8', '600/2221/8', '610/4243/2', '603/4453/2', '603/5681/9', '603/7034/8', '610/2175/1', '610/3586/5', '601/1846/5', '610/3489/7', '603/4248/1']


['Mandatory Core Unit - Exploring Design Ideas',
 'Principles of personal responsibilities and working in a business environment',
 'Working in Engineering',
 'Producing life drawings',
 'Core Unit – Exploring Design Ideas',
 'Customer Payments for Financial Products and Services',
 'Health and Safety in the Workplace',
 'Understand how to organise resources within BSE',
 'Additional Competence Unit',
 'Advanced Nutrition Studies',
 'Assessment of the qualification',
 'Customer Service Practitioner Skills',
 'Management of bleeding and altered levels of consciousness - Additional Details',
 'Unit 1',
 'Unit 1: Title Placeholder',
 'Placeholder Unit 1',
 'Develop and Prepare Resources',
 'Entry Requirements']

In [40]:
# ofqual.drop_duplicates(subset=['id'], inplace=True)
ofqual.shape

(21987, 8)

In [41]:
print("Number of unique OFQUAL IDs: ", len(ofqual['ofqual_id'].unique()))

Number of unique OFQUAL IDs:  2514


In [42]:
print("Number of unique sub-SSAs: ", len(ofqual['sub_SSA'].unique()))

Number of unique sub-SSAs:  42


In [43]:
# Collapse the unit-level rows into one row per qualification (`ofqual_id`).
# NOTE: this rebinds `ofqual`, so running the cell a second time aggregates
# the already-aggregated frame.
unit_aggregations = {
    # Unit titles -> one comma-separated string (missing titles become '')
    'title': lambda titles: ', '.join(titles.fillna('').astype(str)),
    # Unit descriptions -> one space-separated string
    'description': lambda descriptions: ' '.join(descriptions.fillna('').astype(str)),
    # The overview is the same for each ofqual_id, so take the first one
    'overview': 'first',
    # Keep the unit ids and learning outcomes as per-qualification lists
    'id': lambda unit_ids: list(unit_ids),
    'learning_outcomes': lambda outcomes: list(outcomes),
    # sub_SSA and SSA should be the same for each ofqual_id, so take the first
    'sub_SSA': 'first',
    'SSA': 'first',
}
ofqual = ofqual.groupby('ofqual_id').agg(unit_aggregations).reset_index()

In [44]:
ofqual.head()

Unnamed: 0,ofqual_id,title,description,overview,id,learning_outcomes,sub_SSA,SSA
0,500/1411/0,"Aspects of Crime, Criminal Justice System, Cri...","A unit focusing on various aspects of crime, e...",The Ascentis Level 2 Certificate in Introducti...,"[T/500/2458, A/500/2459, T/500/2458, A/500/2459]","[[{""description"": ""Understand the various aspe...",Sociology and social policy,Social Sciences
1,500/3682/8,"Principles of Customer Service, Delivering Goo...",This unit covers the fundamental principles of...,This qualification covers the basic elements o...,"[A/500/3682, B/500/3682, C/500/3682, D/500/3682]","[[{""description"": ""Understand the principles o...",Administration,"Business, Administration and Law"
2,500/4033/9,"Introduction to Construction Work, Introductio...",An introduction to the foundational skills and...,The Ascentis Level 1 Certificate in Skills for...,"[D/504/3389, D/504/3120, H/504/3393, J/501/712...","[[{""description"": ""Understand basic concepts o...",Building and construction,"Construction, Planning and the Built Environment"
3,500/4068/6,"Carrying Out operations, Handling and Storing ...",This unit covers the skills and knowledge nece...,The Ascentis Entry 3 Certificate in Skills for...,"[L/501/7127, F/501/7125, D/504/3389, A/501/712...","[[{""description"": ""Demonstrate the ability to ...",Building and construction,"Construction, Planning and the Built Environment"
4,500/4242/7,"Health and Safety for Motor Vehicle Studies, I...",In this mandatory unit learners explore the re...,The SEG Level 1 Award in Motor Vehicle Studies...,"[H/501/7005, A/501/7009, M/501/7010, T/501/701...","[[{""description"": ""Know health and safety proc...",Transportation operations and maintenance,Engineering and Manufacturing Technologies


## iQual-based classifier

In [14]:
# from classification.CrossValidation import classifier, crossval, evaluation

In [8]:
# Hyper-parameter grids, one per candidate classifier. Each grid is nested
# under the pipeline step name ("Classifier") and lists the values to try for
# every parameter; "model" names the estimator (presumably resolved by the
# project's classifier wrapper — confirm).
LOGISTIC_PARAMS = {
    "Classifier": {
        "model": ["LogisticRegression"],
        "C": [0.01, 0.1, 1, 10, 100],
        "class_weight": ["balanced"]  # weight classes inversely to their frequency
    }
}

RANDOM_FOREST_PARAMS = {
    "Classifier": {
        "model": ["RandomForestClassifier"],
        "n_estimators": [100, 200],
        "max_depth": [5, 10]
    }
}

SGD_PARAMS = {
    "Classifier": {
        "model": ["SGDClassifier"],
        "loss": ["hinge", "log_loss"],  
        "alpha": [0.001, 0.01, 0.1, 1, 10],
    }
}

KNN_PARAMS = {
    "Classifier": {
        "model": ["KNeighborsClassifier"],
        "n_neighbors": [3, 5, 7],  # number of neighbours to try
        "weights": ["uniform", "distance"],  # neighbour weighting schemes to try
    }
}

# XGBoost grid, currently disabled (it is also commented out of
# CLASSIFIER_PARAMS below).
# XGB_PARAMS = {
#     "Classifier": {
#         "model": ["XGBClassifier"],
#         "objective": ["multi:softmax"],
#         "eval_metric": ["mlogloss"],
#         "use_label_encoder": [False],
#         "n_estimators": [100, 300],  # Keep fewer values
#         "max_depth": [4, 8],         # Try shallower vs. deeper trees
#         "learning_rate": [0.1],      # Stick to a reasonable value
#         "subsample": [0.8],          # A single best guess
#         "colsample_bytree": [0.8]    # A single best guess
#     }
# }

SVM_PARAMS = {
    "Classifier": {
        "model": ["SVC"],
        "kernel": ["rbf", "linear"],
        "C": [0.1, 1, 10],
        "probability": [True],  # request probability estimates from SVC
        "class_weight": ["balanced"]  # weight classes inversely to their frequency
    }
}


# All classifier grids that take part in the search (no vectorizer grid is
# included here; the inputs are precomputed embeddings).
CLASSIFIER_PARAMS = [
    LOGISTIC_PARAMS, 
    RANDOM_FOREST_PARAMS, 
    SGD_PARAMS, 
    KNN_PARAMS,
    SVM_PARAMS
    # XGB_PARAMS
]

# Convert to iQual cross-validation parameters: each nested grid is flattened
# into "Classifier__<param>" keys, as the printed result shows.
CV_SEARCH_PARAMS = [crossval.convert_nested_params(params) for params in CLASSIFIER_PARAMS]

print(CV_SEARCH_PARAMS)

[{'Classifier__model': ['LogisticRegression'], 'Classifier__C': [0.01, 0.1, 1, 10, 100], 'Classifier__class_weight': ['balanced']}, {'Classifier__model': ['RandomForestClassifier'], 'Classifier__n_estimators': [100, 200], 'Classifier__max_depth': [5, 10]}, {'Classifier__model': ['SGDClassifier'], 'Classifier__loss': ['hinge', 'log_loss'], 'Classifier__alpha': [0.001, 0.01, 0.1, 1, 10]}, {'Classifier__model': ['KNeighborsClassifier'], 'Classifier__n_neighbors': [3, 5, 7], 'Classifier__weights': ['uniform', 'distance']}, {'Classifier__model': ['SVC'], 'Classifier__kernel': ['rbf', 'linear'], 'Classifier__C': [0.1, 1, 10], 'Classifier__probability': [True], 'Classifier__class_weight': ['balanced']}]


In [9]:
# Metrics computed for every cross-validation fold, keyed by the short names
# used to refer to them ('f1' is macro-averaged F1).
scoring_dict = dict(
    f1=get_scorer('f1_macro'),
    accuracy=get_scorer('accuracy'),
)

In [10]:
scoring_dict

{'f1': make_scorer(f1_score, response_method='predict', pos_label=None, average=macro),
 'accuracy': make_scorer(accuracy_score, response_method='predict')}

## Preparing a Balanced Sample

In [48]:
print("Showing Bias On Few Sub SSAs\n")
ofqual['sub_SSA'].value_counts()

Showing Bias On Few Sub SSAs



sub_SSA
Health and social care                                    357
Building and construction                                 354
Sport, leisure and recreation                             159
Manufacturing technologies                                142
Foundations for learning and life                         138
Public services                                           132
Business management                                       129
Teaching and lecturing                                    126
Service enterprises                                       118
Hospitality and catering                                   94
Administration                                             93
Animal care and veterinary science                         65
Preparation for work                                       61
Engineering                                                59
Child development and well-being                           58
Digital technology (users)                                 39


## Pick up to 10 random samples for each sub-SSA

In [554]:
# Cap every sub-SSA at a fixed number of qualifications so that the large
# categories do not dominate the training sample.
value_counts = ofqual['sub_SSA'].value_counts()
max_samples_allowed_for_each_unique_sub_SSA = 10

# Seeded generator so the same rows are dropped on every run
sampling_rng = np.random.default_rng(42)

# Start by keeping every row, then mark the surplus rows of each oversized
# category for removal.
mask = pd.Series(True, index=ofqual.index)
for category, category_size in value_counts.items():
    surplus = category_size - max_samples_allowed_for_each_unique_sub_SSA
    if surplus > 0:
        category_indices = ofqual.index[ofqual['sub_SSA'] == category].to_numpy()
        # Randomly choose the surplus rows to drop (without replacement), so
        # that exactly the allowed number of rows remains for this category
        indices_to_remove = sampling_rng.choice(category_indices,
                                                size=surplus,
                                                replace=False)
        mask.loc[indices_to_remove] = False

# Filter the sample data
sample_data = ofqual[mask].reset_index(drop=True)
print(f"Count of Unique Sub SSAs after limiting to {max_samples_allowed_for_each_unique_sub_SSA} per category\n")
sample_data['sub_SSA'].value_counts()

Count of Unique Sub SSAs after limiting to 20 per category



sub_SSA
Manufacturing technologies                                10
Business management                                       10
Nursing and subjects and vocations allied to medicine     10
Child development and well-being                          10
Direct learning support                                   10
Retailing and wholesaling                                 10
Environmental conservation                                10
Public services                                           10
Teaching and lecturing                                    10
Accounting and finance                                    10
Digital technology (users)                                10
Crafts, creative arts and design                          10
Transportation operations and maintenance                 10
Travel and tourism                                        10
Law and legal services                                    10
Health and social care                                    10
Media and commun

In [49]:
ofqual.shape

(2514, 8)

In [50]:
sample_data.shape

(347, 8)

In [51]:
print(f"Sample data covers {len(sample_data['sub_SSA'].unique())} unique Sub SSAs")

Sample data covers 42 unique Sub SSAs


## Load the sample data if already present

In [20]:
sample_data = pd.read_csv("/Users/mumtaz/Documents/projects/zavmo/sample_ofqual_classification_data.csv")

### Generate embeddings for the sample data

In [21]:
# One embedding input per qualification: title, description and overview,
# separated by blank lines.
sample_text_for_embeddings = [
    f"{title}\n\n{description}\n\n{overview}"
    for title, description, overview in zip(
        sample_data['title'], sample_data['description'], sample_data['overview']
    )
]

In [22]:
Markdown(sample_text_for_embeddings[2])

The creative design process for florists, Diverse flower and plant care, Plan, assemble and evaluate diverse tied floristry designs, Plan, assemble and evaluate diverse floral designs for weddings, Plan, assemble and evaluate diverse floral designs, Plan, assemble and evaluate diverse funeral designs, Working in the floristry industry, Negotiate and agree terms and conditions for the sale of floristry products and services

Mandatory core unit focusing on the creative design process utilized by florists. Focusing on the care and maintenance of a variety of flowers and plants. Skills needed to create and assess various tied floristry designs. Skills specific to creating floral arrangements for weddings. Focus on the creation and evaluation of various floral designs. Skills needed for creating funeral floral arrangements. Insights and skills applicable within the floristry industry. Skills for negotiating sales and service agreements in floristry.

This qualification covers a very wide range of areas related to the skills and knowledge required for working in the Floristry Industry.

In [23]:
# Embed every sample text. `dimensions=768` is forwarded through **kwargs to
# the embeddings request made inside get_batch_openai_embedding.
X_embeddings = get_batch_openai_embedding(sample_text_for_embeddings, model="text-embedding-3-small", dimensions=768)
# Convert the list of embedding lists into a NumPy array for scikit-learn.
X_embeddings = np.array(X_embeddings)

total batches: 19


In [24]:
# Target labels: the sub-SSA name of each sampled qualification.
y = sample_data['sub_SSA']

# Integer-encoded copy of the labels.
# NOTE(review): the train/test split in this notebook is given `y`, not
# `Y_encoded`, so the encoded labels appear to be unused — confirm.
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(y)

In [25]:
# Train-test split: hold out 20% of the embedded samples for validation, with
# a fixed random_state so the split is repeatable.
# The string labels `y` are used as targets, so the fitted model predicts
# sub-SSA names directly.
# NOTE(review): the split is not stratified, so a rare sub-SSA may end up on
# only one side; `stratify=y` would fix that but raises if any class has a
# single sample — check the class counts first.
X_train, X_val, Y_train, Y_val = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

In [26]:
# Build the model pipeline with an initial classifier and compile it.
# The search grids in CV_SEARCH_PARAMS carry their own `Classifier__model`
# values, so the classifier named here is presumably only a starting point
# for the grid search — confirm against the Model / CrossValidator code.
model = Model()
model.add_classifier(name="RandomForestClassifier") 
# model.add_classifier(name="XGBClassifier", objective="multi:softmax", 
#                                          eval_metric="mlogloss", 
#                                          use_label_encoder=False)
model.compile()

# Alternative (disabled): run the search through the model's own
# cross_validate_fit helper instead of a CrossValidator.
# Run cross-validation
# scores = model.cross_validate_fit(
#     X_train,
#     Y_train,
#     cv_method="GridSearchCV",
#     search_parameters=CV_SEARCH_PARAMS,  # Passing search parameters here
#     scoring="f1_macro",
#     cv_splits=3,
#     refit=True
# )

In [27]:
# Configure a grid search over every classifier grid in CV_SEARCH_PARAMS.
# Assuming CrossValidator forwards these arguments to scikit-learn's
# GridSearchCV (confirm): cv=3 requests 3 folds, both metrics in
# `scoring_dict` are evaluated, refit='f1' names the metric used to select and
# refit the best configuration, and n_jobs=-1 uses all available cores.
cv = CrossValidator(
            model_pipe=model.model,
            cv_method="GridSearchCV",
            search_parameters=CV_SEARCH_PARAMS,
            scoring=scoring_dict,
            cv=3,
            refit='f1',
            #n_iter=100,
            n_jobs=-1
        )

In [28]:
cv.fit(X_train, Y_train)



f1




In [29]:
# Report which estimator is configured on the pipeline and the average
# cross-validation score.
# NOTE(review): this reads `model.model` (the pipeline handed to
# CrossValidator), not `cv.cross_validation.best_estimator_` — confirm that it
# reflects the configuration selected by the search.
params = model.model.get_params()
print(f"Classifier: {params.get('Classifier__model')}")
print(f"Average F1 score: {cv.get_cv_scores()['avg_test_score']:.3f}")

Classifier: LogisticRegression
Average F1 score: 0.616


In [30]:
cv.get_cv_scores()

{'split0_test_f1': 0.555949605949606,
 'split1_test_f1': 0.6029761904761906,
 'split2_test_f1': 0.6887293992557152,
 'avg_test_score': np.float64(0.6158850652271705)}

In [31]:
import joblib

# Save the best model found by the search to disk.
# NOTE(review): the destination is an absolute, machine-specific path.
best_model = cv.cross_validation.best_estimator_
joblib.dump(best_model, "/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/classification/models/SSA_classification_model.pkl")

['/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/classification/models/SSA_classification_model.pkl']

In [32]:
# Load the saved model back from disk (joblib files are pickles: only load
# files from a trusted source, such as the one written by this notebook).
loaded_model = joblib.load("/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/classification/models/SSA_classification_model.pkl")

# Spot-check a single held-out sample: reshape(1, -1) turns the one embedding
# vector into the one-row 2-D array that predict() expects.
predictions = loaded_model.predict(X_val[5].reshape(1, -1))
predictions



array(['Retailing and wholesaling'], dtype=object)

In [33]:
Y_val.to_list()[5]

'Retailing and wholesaling'

In [35]:
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# Predict the sub-SSA for every sample in the held-out validation split.
val_predictions = loaded_model.predict(X_val)

# Accuracy: fraction of validation samples whose prediction matches the label.
accuracy = accuracy_score(Y_val, val_predictions)
print(f"Validation Accuracy: {accuracy:.3f}")

Validation Accuracy: 0.886




In [472]:
# sample_data.to_csv("/Users/mumtaz/Documents/projects/zavmo/ofqual_sample_data_42_unique_sub_SSA.csv", index=False) 

In [76]:
cv.param_df.sort_values(by='param_Classifier__C', ascending=False).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Classifier__C,param_Classifier__class_weight,param_Classifier__model,param_Classifier__max_depth,param_Classifier__n_estimators,param_Classifier__alpha,...,split2_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
2,1.138392,0.071828,0.006319,0.001919,100.0,balanced,LogisticRegression,,,,...,0.926029,0.928068,0.005565,3,0.926537,0.930233,0.924231,0.927,0.002472,3
0,14.645918,0.030716,1.427699,0.161819,10.0,balanced,SVC,,,,...,0.935276,0.933823,0.015673,1,0.925037,0.937734,0.924981,0.929251,0.005999,2
1,26.521944,0.531396,2.076347,0.067503,10.0,balanced,SVC,,,,...,0.939066,0.933621,0.013484,2,0.943778,0.944486,0.943736,0.944,0.000344,1
4,0.708268,0.077907,0.009963,0.004892,10.0,balanced,LogisticRegression,,,,...,0.886978,0.888482,0.004662,5,0.889805,0.874719,0.876219,0.880248,0.006786,10
3,34.449349,2.405184,2.035892,0.035892,1.0,balanced,SVC,,,,...,0.897596,0.889751,0.007487,4,0.874063,0.870218,0.87922,0.8745,0.003688,13
6,24.107807,2.053305,1.628242,0.235413,1.0,balanced,SVC,,,,...,0.882186,0.880721,0.001263,7,0.85907,0.846212,0.857464,0.854249,0.005721,16
11,0.512239,0.085134,0.007018,0.00233,1.0,balanced,LogisticRegression,,,,...,0.820428,0.816262,0.003316,12,0.801349,0.789947,0.814704,0.802,0.010117,18
16,62.971565,1.187377,1.804545,0.101273,0.1,balanced,SVC,,,,...,0.763459,0.768212,0.006789,17,0.727136,0.716429,0.72093,0.721499,0.00439,24
17,76.261541,0.757005,2.372912,0.16932,0.1,balanced,SVC,,,,...,0.742765,0.756836,0.012642,18,0.676162,0.642911,0.611403,0.643492,0.026441,26
20,0.225326,0.034411,0.012022,0.007606,0.1,balanced,LogisticRegression,,,,...,0.606039,0.605303,0.006256,21,0.621439,0.617404,0.633158,0.624001,0.006682,27
