### Use Case 1: Intent Referral - using Bert embeddings & ML with cross validation

#### DC code 9582 (unexpected customer delay)

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import time
import re
import random
import string
from pathlib import Path

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import torch
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertModel

In [28]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 50)

In [29]:
#Read data
base_path = Path("/home/jupyter/deemed_consent")
data_dir = Path("data")
data_path = base_path / data_dir / "deemed_consent_preprocessed_1 1.csv"
embedding_path = base_path / data_dir / "embeddings_9582.npy"

In [8]:
data_df=pd.read_csv(data_path)
top_10_reason_codes=pd.DataFrame({"obs":data_df['REASON_CODE'].value_counts().head(10), "obs_pct": (100 * data_df['REASON_CODE'].value_counts().head(10) / len(data_df)).round(1)})
print(top_10_reason_codes)

               obs  obs_pct
REASON_CODE                
2002         36737     31.2
3007         16576     14.1
9582         14073     11.9
2047          7392      6.3
3005          6025      5.1
2064          4483      3.8
2042          4457      3.8
3010          3416      2.9
3011          3415      2.9
3001          3133      2.7


#### select the dc code to analyse

In [9]:
dc_code = 9582 # top3: 2002, 3007, or, 9582

In [10]:
reduced_df= data_df[data_df['REASON_CODE']==dc_code]

In [11]:
#training_df=reason_df[reason_df['training_data'] == 1]
count_class_1 = (reduced_df['draft_referral_flag'] == 1).sum()
print("Number of rows with class 1:", count_class_1)
count_class_0 = (reduced_df['draft_referral_flag'] == 0).sum()
print("Number of rows with class 0:", count_class_0)

Number of rows with class 1: 2587
Number of rows with class 0: 11486


In [12]:
ratio = count_class_0 / count_class_1
print(f"ratio of intents referred back: {round(ratio,2)}")

ratio of intents referred back: 0.18


### preprocess and clean the delay notes

In [13]:
# look for missing data
print(reduced_df['DELAY_NOTES_CLEANED'].isna().sum())
null_count = reduced_df['DELAY_NOTES_CLEANED'].isna().sum()
print(null_count)

0
0


In [14]:
# Convert to string and analyze
valid_notes = reduced_df['DELAY_NOTES_CLEANED'].fillna('')
empty_strings = (valid_notes.str.strip() == '').sum()
total_rows = len(reduced_df)
print(f"Null values: {null_count}")
print(f"Empty strings: {empty_strings}")
print(f"Valid notes: {total_rows - null_count - empty_strings}")

Null values: 0
Empty strings: 0
Valid notes: 14073


In [15]:
def count_words_safely(text):
    """
    Safely counts words in text, handling null values and non-string inputs
    """
    if pd.isna(text):  # Handle null values
        return 0
    try:
        return len(str(text).split())
    except AttributeError:
        print(f"Unexpected data type: {type(text)}")
        return 0

# word counting code
no_eng_words_arr = []
for text in reduced_df['DELAY_NOTES_CLEANED']:
    no_words = count_words_safely(text)
    no_eng_words_arr.append(no_words)


In [16]:
# replace Nan with ""
reduced_df['DELAY_NOTES_CLEANED'] = reduced_df['DELAY_NOTES_CLEANED'].fillna("")

In [17]:
print(f"mean words: {round(np.mean(no_eng_words_arr),1)}, max {np.max(no_eng_words_arr)}, min {np.min(no_eng_words_arr)}")

mean words: 79.8, max 269, min 2


#### create bert embeddings

In [21]:
# Load bert-base-uncased
tokenizer = BertTokenizer.from_pretrained("/home/jupyter/deemed_consent/bert-base-uncased")
model = BertModel.from_pretrained("/home/jupyter/deemed_consent/bert-base-uncased")

def get_bert_embeddings(texts, max_length=250):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    
    embeddings = []
    batch_size = 32
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding='max_length', truncation=True, 
                          max_length=max_length, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get CLS token embeddings
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
    
    return np.vstack(embeddings)

In [22]:
X = reduced_df['DELAY_NOTES_CLEANED']
y = reduced_df['draft_referral_flag']

In [23]:
# Convert X data to lists
X_list = X.tolist()

In [24]:
# Generate BERT embeddings
X_emb = get_bert_embeddings(X_list)

100%|██████████| 440/440 [1:25:03<00:00, 11.60s/it]


#### save the embeddings

In [30]:
# Save to a .npy file
np.save(embedding_path, X_emb)

#### load embeddings (i.e. short cut if embeddings already exist)

In [31]:
# Load the embeddings back
X_loaded_emb = np.load(embedding_path)

#### cross validate with xgb

In [32]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [33]:
model = XGBClassifier(scale_pos_weight=ratio, n_estimators=100)

In [34]:
cv_results = cross_validate(model, X_loaded_emb, y, cv=cv, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'neg_log_loss'])

In [35]:
print(cv_results)

{'fit_time': array([15.63823891,  9.09886622,  9.09165359,  9.45264077,  9.82465291]), 'score_time': array([0.05791235, 0.03008819, 0.03036141, 0.02866292, 0.0289371 ]), 'test_accuracy': array([0.81740675, 0.8348135 , 0.81882771, 0.82658138, 0.83759773]), 'test_precision': array([0.55725191, 0.65354331, 0.62162162, 0.61290323, 0.67741935]), 'test_recall': array([0.13799622, 0.16468254, 0.12849162, 0.14728682, 0.16766467]), 'test_f1': array([0.22121212, 0.26307448, 0.21296296, 0.2375    , 0.2688    ]), 'test_roc_auc': array([0.75783184, 0.78902918, 0.79451126, 0.78457633, 0.78314491]), 'test_neg_log_loss': array([-0.60130259, -0.51430726, -0.55105206, -0.54394054, -0.52492579])}


In [43]:
print(f"Mean accuracy: {round(cv_results['test_accuracy'].mean(),3)}")

Mean accuracy: 0.827


In [45]:
print(f"Mean precision: {round(cv_results['test_precision'].mean(),3)}")

Mean precision: 0.625


In [46]:
print(f"Mean recall: {round(cv_results['test_recall'].mean(),3)}")

Mean recall: 0.149
