## Setup notebook

In [0]:
import os

# Retrieve API keys from environment variables
prodigy_key = os.getenv("MY_PRODIGY_KEY")
if not prodigy_key:
    raise ValueError("Environment variable MY_PRODIGY_KEY is not set.")

In [0]:
# Install library and versions below
%pip install spacy==3.7.5 spacy-transformers==1.3.5 spacy-loggers==1.0.5 mlflow==2.20.1 > /dev/null 2>&1
%pip install prodigy -f https://{prodigy_key}@download.prodi.gy/
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Looking in links: https://****@download.prodi.gy/
Collecting prodigy
  Downloading https://download.prodi.gy/download/prodigy-1.18.0-py3-none-any.whl (994 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 994.0/994.0 kB 1.9 MB/s eta 0:00:00
Collecting typeguard<4.0,>=3.0.2
  Using cached typeguard-3.0.2-py3-none-any.whl (30 kB)
Collecting aiofiles
  Using cached aiofiles-24.1.0-py3-none-any.whl (15 kB)
Collecting peewee<3.17.0,>=3.12.0
  Using cached peewee-3.16.3-cp310-cp310-linux_x86_64.whl
Collecting pydantic<3.0,>=1.10.8
  Using cached pydantic-2.10.6-py3-none-any.whl (431 kB)
Collecting toolz<1.0.0,>=0.8.2
  Using cached toolz-0.12.1-py3-none-any.wh

In [0]:
# Core Utilities
import os
import sys
import logging
import ast
from pathlib import Path
import warnings

# Data Handling
import json
import pandas as pd

# MLflow Components
import mlflow
import mlflow.spacy

# SpaCy / NLP Components
import spacy
from spacy.language import Language
from spacy.training import Example
from spacy.training.initialize import init_nlp
from spacy.training.loop import train
from spacy.util import load_config
from spacy.cli.train import train

## View the annotated data you've annotated and double-check you've been consistent in your annotation (ESPECIALLY IF MULTIPLE ANNOTATORS DID THE WORK). Changes you should make are:
- To change the benefit category applied for an example, use the 'accept' column
- To change whether you want to include the example in training or not, use the 'answer' column. 'accept' -> Include in training. 'reject' -> Exclude from training

In [0]:
# Much easier to make changes in excel. Export as csv, then re-load into this notebook to convert back to jsonl

annotated_data = "/v2_1_fixed.jsonl"
df_annotated_data = pd.read_json(annotated_data, lines=True)
df_annotated_data.to_csv(annotated_data.replace('.jsonl', '.csv'), index=False)

In [0]:
# Load back in as csv and convert back to jsonl

new_annotated_data = "/v2_1_fixed.csv"
df_new_annotated_data = pd.read_csv(new_annotated_data)


def convert_columns(df):
    # Define columns to convert
    dictionary_to_convert = ['meta', 'config']
    array_to_convert = ['accept', 'options']
    
    # Convert specified dictionary columns
    for column in dictionary_to_convert:
        if column in df.columns:
            try:
                df[column] = df[column].apply(
                    lambda x: ast.literal_eval(x) if pd.notnull(x) else x
                )
                print(f"Converted column '{column}' from string to dictionary.")
            except (ValueError, SyntaxError) as e:
                print(f"Error converting column '{column}': {e}")
        else:
            print(f"Warning: Column '{column}' not found in the DataFrame.")
    
    # Convert specified array columns
    for column in array_to_convert:
        if column in df.columns:
            try:
                df[column] = df[column].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.strip().startswith('[') else x
                )
                print(f"Converted column '{column}' from string to list where applicable.")
            except (ValueError, SyntaxError) as e:
                print(f"Error converting column '{column}': {e}")
        else:
            print(f"Warning: Column '{column}' not found in the DataFrame.")
    
    return df

convert_columns(df_new_annotated_data)

display(df_new_annotated_data)

Converted column 'meta' from string to dictionary.
Converted column 'config' from string to dictionary.
Converted column 'accept' from string to list where applicable.
Converted column 'options' from string to list where applicable.


text,meta,_input_hash,_task_hash,options,_view_id,config,accept,answer,_timestamp,_annotator_id,_session_id
"$1,000 annually in professional development funds (local equivalent)","List(Learning & Development Benefits, Blue State, United Kingdom, 3827033)",-1685902126,-821500820,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Learning_&_Development_Benefits),accept,1728538009,2024-10-09_14-18-30,2024-10-09_14-18-30
"$1,300 flexible benefit spending account","List(['Financial_Wellbeing'], Deloitte, Canada, 11281619)",999297031,1028077684,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Flexible_Benefits_Budget),accept,1729662595,2024-10-16_17-46-00,2024-10-16_17-46-00
$10K Sign on Bonus (subject to eligibility),"List(['Variable_Pay_Programs'], Glencore, Australia, 4469596)",683806270,-582917498,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Variable_Pay_Programs),accept,1730110467,2024-10-24_07-39-25,2024-10-24_07-39-25
$10K Sign on bonus,"List(['Variable_Pay_Programs'], Glencore, Australia, 3951791)",-1383074502,-1183702366,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Variable_Pay_Programs),accept,1730101924,2024-10-24_07-39-25,2024-10-24_07-39-25
$1500 tuition assistance grant,"List([], Sunbelt Rentals, Canada, 2018556)",-1712899008,100444929,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Learning_&_Development_Benefits),accept,1730110820,2024-10-24_07-39-25,2024-10-24_07-39-25
"$4,000 per year for mental health support benefits","List(['Emotional_Wellbeing'], Deloitte, Canada, 11031193)",-1723181462,-1597937166,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Emotional_Wellbeing),accept,1730110173,2024-10-24_07-39-25,2024-10-24_07-39-25
"$5,250 tuition reimbursement per calendar year starting 6 months after the hire date","List(['Perks'], The Sage Group, Canada, 7579714)",310162791,346255136,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Learning_&_Development_Benefits),accept,1730102283,2024-10-24_07-39-25,2024-10-24_07-39-25
"% company-paid Medical, Vision, &amp; Dental coverage for you and your family with low deductibles and low out-of-pocket expenses.</li>","List(Other, Bandwidth, United States, 10596591)",255310763,2136202395,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Healthcare_Benefits),accept,1728549758,2024-10-09_14-18-30,2024-10-09_14-18-30
", religion","List(Other, Apellis Pharmaceuticals, United States, 8460584)",240647422,-888323778,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(),accept,1728549751,2024-10-09_14-18-30,2024-10-09_14-18-30
//Monthly fitness allowance,"List(Other, Allegion, United Kingdom, 7774407)",1632033782,1373562783,"List(List(Adoption_and_Fertility, Adoption_and_Fertility), List(Company_Car_Benefits, Company_Car_Benefits), List(Commuting_Benefits, Commuting_Benefits), List(DE&I_Benefits, DE&I_Benefits), List(Emotional_Wellbeing, Emotional_Wellbeing), List(Financial_Wellbeing, Financial_Wellbeing), List(Flexible_Benefits_Budget, Flexible_Benefits_Budget), List(Flexible_Working_Benefits, Flexible_Working_Benefits), List(Healthcare_Benefits, Healthcare_Benefits), List(Learning_&_Development_Benefits, Learning_&_Development_Benefits), List(Meal_Benefits, Meal_Benefits), List(Maternity_Paternity_and_Parental_Leave, Maternity_Paternity_and_Parental_Leave), List(Paid_Time_Off, Paid_Time_Off), List(Perks, Perks), List(Physical_Wellbeing, Physical_Wellbeing), List(Referral_Programs, Referral_Programs), List(Relocation_&_Housing_Programs, Relocation_&_Housing_Programs), List(Remote_Working_Benefits, Remote_Working_Benefits), List(Retirement, Retirement), List(Risk_Benefits, Risk_Benefits), List(Social_Wellbeing, Social_Wellbeing), List(Special_Leave, Special_Leave), List(Vacation_Purchase_Programs, Vacation_Purchase_Programs), List(Variable_Pay_Programs, Variable_Pay_Programs), List(Other, Other))",choice,List(multiple),List(Physical_Wellbeing),accept,1728549749,2024-10-09_14-18-30,2024-10-09_14-18-30


## Create the 'ground-truth' dataset for this model by combining the previous ground-truth with the additional data you've annotated

In [0]:
old_ground_truth = "/textclass_v2_1.jsonl"
df_old_ground_truth = pd.read_json(old_ground_truth, lines=True)
print(df_old_ground_truth.shape)


(456, 12)


## Some checks before training model

### 1. Check both datasets have the same group of Benefit Categories we're training - by looking at the 'options' column

In [0]:
def serialize_options(option):
    if isinstance(option, dict):
        # Sort the dictionary by keys and recursively serialize its values
        sorted_dict = {k: serialize_options(v) for k, v in sorted(option.items())}
        return json.dumps(sorted_dict, sort_keys=True)
    elif isinstance(option, list):
        # Recursively serialize each item in the list
        serialized_list = [serialize_options(item) for item in option]
        # Sort the serialized list to ensure order-independence
        serialized_list_sorted = sorted(serialized_list)
        return json.dumps(serialized_list_sorted)
    else:
        # For other data types, serialize directly
        return json.dumps(option)

# Apply serialization to 'options' columns
df_old_ground_truth['options_serialized'] = df_old_ground_truth['options'].apply(serialize_options)
df_new_annotated_data['options_serialized'] = df_new_annotated_data['options'].apply(serialize_options)

# Convert serialized 'options' to sets
old_options_set = set(df_old_ground_truth['options_serialized'])
new_options_set = set(df_new_annotated_data['options_serialized'])

# Compare the sets
if old_options_set == new_options_set:
    print("Both DataFrames have the same 'options' values. Continue.")
else:
    # Identify differences
    only_in_old = old_options_set - new_options_set
    only_in_new = new_options_set - old_options_set
    
    # Explain differences
    print("Differences found in the 'options' column:")
    
    if only_in_old:
        print("Values present only in the old DataFrame:")
        for item in only_in_old:
            print(json.loads(item))  # Deserialize for human-readable format
    
    if only_in_new:
        print("Values present only in the new DataFrame:")
        for item in only_in_new:
            print(json.loads(item))  # Deserialize for human-readable format
    
    print("It is crucial to have them the same. Check the differences above and fix.")


Both DataFrames have the same 'options' values. Continue.


### 2. Drop those annotated examples that're 'reject' or 'skip' as they wont be used to train the model

In [0]:
def drop_non_accept_answers(df, df_name='DataFrame'):
    # Identify rows where 'answer' is not 'accept'
    condition = df['answer'] != 'accept'
    rows_to_drop = df[condition]
    count_to_drop = rows_to_drop.shape[0]

    if count_to_drop > 0:
        # Create a filtered DataFrame with the rows to drop
        filtered_df = rows_to_drop.copy()

        # Drop the identified rows
        df.drop(rows_to_drop.index, inplace=True)
        print(f"'{df_name}': Dropped {count_to_drop} row(s) where 'answer' is not 'accept'.")
        print(f"Check 'filtered_df_{df_name}' to confirm these are supposed to be dropped.")

        return filtered_df
    else:
        print(f"'{df_name}': No rows to drop. All 'answer' values are 'accept'.")
        return None
    
    # Apply the function to df_old_ground_truth
filtered_df_df_old_ground_truth = drop_non_accept_answers(df_old_ground_truth, df_name='df_old_ground_truth')

# Apply the function to df_new_annotated_data
filtered_df_df_new_annotated_data = drop_non_accept_answers(df_new_annotated_data, df_name='df_new_annotated_data')

'df_old_ground_truth': No rows to drop. All 'answer' values are 'accept'.
'df_new_annotated_data': No rows to drop. All 'answer' values are 'accept'.


### 3. Final check to make sure columns are all same and in consistent format

In [0]:
def compare_and_clean_columns(df1, df2, df1_name='df1', df2_name='df2'):
    columns_df1 = set(df1.columns)
    columns_df2 = set(df2.columns)

    # Check if columns are the same
    if list(df1.columns) == list(df2.columns):
        print("Both DataFrames have the same column names in the same order.")
    else:
        print("DataFrames have different column names or different column orders.")
        
        # Identify extra and missing columns
        extra_in_df1 = columns_df1 - columns_df2
        extra_in_df2 = columns_df2 - columns_df1
        common_columns = columns_df1 & columns_df2

        if extra_in_df1:
            print(f"\nColumns only in {df1_name}:")
            for col in extra_in_df1:
                print(f" - {col}")
        else:
            print(f"\nNo extra columns in {df1_name}.")

        if extra_in_df2:
            print(f"\nColumns only in {df2_name}:")
            for col in extra_in_df2:
                print(f" - {col}")
        else:
            print(f"\nNo extra columns in {df2_name}.")

        # Check for column order differences
        if common_columns:
            ordered_common_df1 = [col for col in df1.columns if col in common_columns]
            ordered_common_df2 = [col for col in df2.columns if col in common_columns]
            if ordered_common_df1 != ordered_common_df2:
                print("\nCommon columns are in different orders:")
                print(f" - {df1_name} order: {ordered_common_df1}")
                print(f" - {df2_name} order: {ordered_common_df2}")
            else:
                print("\nCommon columns are in the same order.")

compare_and_clean_columns(
    df_old_ground_truth, 
    df_new_annotated_data, 
    df1_name='df_old_ground_truth', 
    df2_name='df_new_annotated_data'
)

Both DataFrames have the same column names in the same order.


## Prep training materials

In [0]:
# Combining datasets to create new_ground_truth
df_new_ground_truth = pd.concat([df_old_ground_truth,df_new_annotated_data])

expected_rows = df_old_ground_truth.shape[0] + df_new_annotated_data.shape[0]
actual_rows = df_new_ground_truth.shape[0]

if expected_rows == actual_rows:
    print("The number of rows in df_new_ground_truth is correct.")
else:
    print("The number of rows in df_new_ground_truth is incorrect.")

The number of rows in df_new_ground_truth is correct.


In [0]:
# Save as jsonl as needed to feed into training

ground_truth_path = "/v2.test_ground_truth.jsonl"

df_new_ground_truth.to_json(ground_truth_path,lines=True,orient='records')

In [0]:
# INSTRUCTION: Name the prodigy session you're creating to house the training data for this model (temporarily) e.g. benefits_textcat_v2.2

prodigy_session_name = "benefits_textcat_v2.2"

!python -m prodigy db-in $prodigy_session_name "$df_new_ground_truth" --overwrite

  _torch_pytree._register_pytree_node(
path does not exist:                                                    text  ...                                 options_serialized
0           Options to Buy, Sell and Carry Annual Leave  ...  [{"id":
/bin/bash: line 3: 1: command not found
/bin/bash: line 5: 3: command not found
/bin/bash: line 7: ...: command not found
/bin/bash: line 8: 3428: command not found
/bin/bash: line 10: 3430: command not found
/bin/bash: line 12: 3432: command not found


In [0]:
# Keep training split at 0.1 (90 training : 10 evaluation) unless want more evaluation data

training_data_path = "/Notebooks"

!python -m prodigy data-to-spacy "$training_data_path" --textcat-multilabel $prodigy_session_name --eval-split 0.1

  _torch_pytree._register_pytree_node(
[38;5;4mℹ Using language 'en'[0m
  _torch_pytree._register_pytree_node(
[1m
Components: textcat_multilabel
Merging training and evaluation data for 1 components
[38;5;1m✘ Can't find 'benefits_textcat_v2.2' in database 'SQLite'[0m


In [0]:
# Create a folder in your DBFS to hold the trained model. Since the model is big, prefer to store in DBFS

dbutils.fs.mkdirs('/temp')

True

In [0]:
# Output of your DBFS folder for the model
output_path = Path("/temp")

# dev.spacy and train'spacy files produced from the above 'data-to-spacy' command you ran
train_path = "/train.spacy"
dev_path = "/dev.spacy"

In [0]:
config_path = "/textcat_config_v2.cfg"
local_model_path = "/jdbert-384"

overrides = {
    "paths.train": train_path,
    "paths.dev": dev_path,
    "paths.ground_truth": ground_truth_path,
    "variables.transformer_model_name": local_model_path
}

In [0]:
# Run to train model. View the experiment and use the 'run_id' to call the best model from this training
train(config_path, output_path, overrides=overrides)

[38;5;4mℹ Saving to output directory:
/dbfs/Workspace/Users/justin.ngam@towerswatson.com/1_Benefits/temp[0m
[38;5;4mℹ Using CPU[0m
[1m


Some weights of RobertaModel were not initialized from the model checkpoint at /Workspace/Users/justin.ngam@towerswatson.com/1_Benefits/1_NER/Data/jdbert-384 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  -------------  ----------  ------
  0       0          30.13           1.22        1.83    0.02


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

  4     200        3977.16          66.68       19.94    0.20


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 10     400         303.21          10.38       62.79    0.63


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 15     600         263.32           5.33       70.61    0.71


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 20     800         181.98           3.17       74.65    0.75


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 25    1000         121.68           1.83       81.89    0.82


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 30    1200          88.17           1.32       83.74    0.84


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

 34    1400          68.43           0.85       83.38    0.83
 39    1600          53.64           0.70       84.28    0.84


Uploading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:730)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:448)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:448)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecutio

## Now we've trained and saved our Benefits Text Categorisation model within MLFlow. All that's needed to use this model in the future is the unique 'run_id'! Easy peasy.