## Prediction pipeline 

In [2]:
!pip install pandas
!pip install numpy
!pip install torch
!pip install scikit-learn
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://artifactory.alight.com/artifactory/api/pypi/python-pypi-remote/simple


In [9]:
import pandas as pd
import numpy as np
import torch
import re
import pickle
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertForSequenceClassification, BertTokenizer
import spacy
import en_core_web_sm
import string
nlp = spacy.load("en_core_web_sm")

### text preprocess

In [10]:
def count_intnt_entits(text):
    doc = nlp(text)
    intents = [token.text for token in doc if token.pos_ == 'VERB']
    entities = [token.text for token in doc if token.pos_ in {'NOUN', 'PROPN', 'ADJ', 'NUM', 'ADV'}]
    return len(intents), len(entities)

def extract_ner_entities(sentence):
    doc = nlp(sentence)
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities

def length_entities(list_entities):
    if (list_entities==np.nan or list_entities==None or list_entities==''):
        return 0
    else:
        return len(list_entities)
    
def filter_named_entities(text):
    # Process the text using Spacy
    doc = nlp(text)
    # Filter out named entities (ORG, PERSON, and GPE tags)
    filtered_words = [token.text for token in doc if token.ent_type_ not in ['ORG', 'PERSON', 'GPE', "LOC", "FAC"]]
    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)
    return filtered_text

list_1 = ['ira','RMD','HRdirect','livechat','what is my hsa','P45','Payslip?',
    'sps','F80.2','ub','What is YSA','Paystub please','Sh','mfv','C-128','ax','no is hsa','FormL564','HIS','cif','GreT','YSACard',
    'Heli','RxPCN','403(b)','Hsa yes or no','ypr','Gv','ONA?','What is UHC?','HC-2','uo','what is 4DX?','osh','what is my hsa?',
    'sPRAVATO','sdr','RMD’s','coverage?How','This is for my hsa','pto?','A&DD','childcareplus','fs','mbi','Is that my lowesbenefit.com',
    'hra yes','mri?']
list_2 = [word.lower() for word in list_1]

def text_preprocess(col, list_2):
    df = pd.DataFrame({ 'text': col })
    df = df.drop_duplicates()
    df[['no_of_intents', 'no_of_entities']] = df.apply(lambda x: pd.Series(count_intnt_entits(x['text'])), axis=1)  

    df['ner_enities'] = ''
    df.loc[df['text']!='', 'ner_enities'] = df.loc[df['text']!='', 'text'].apply(extract_ner_entities)
    df['len_ner_enities'] = df['ner_enities'].apply(length_entities)
    df3 = df[df['len_ner_enities']>0]
    df3['text'] = df3['text'].apply(filter_named_entities)
    df6 = pd.concat([df[df['len_ner_enities']==0], df3], axis = 0)
    df6 = df6.drop(['no_of_intents','no_of_entities','ner_enities','len_ner_enities'], axis=1)

    df6['text'] = df6['text'].str.strip()
    
    return df6['text'].to_list()


In [55]:
def clean_text(text_list):
    # Clean the text
    text_list = text_preprocess(text_list, list_2)
    text_list = [text for text in text_list if text.strip() and not set(text).issubset(set(string.punctuation + string.whitespace))]
    text_list = [x.lower() for x in text_list]
    # Define a translation table to replace punctuation and special characters with empty string
    translator = str.maketrans(string.punctuation + "_", " " * len(string.punctuation + "_"))
    # Loop through each text in the list and clean it
    cleaned_list = []
    for text in text_list:
        # Replace punctuation and special characters with empty string
        cleaned_text = text.translate(translator)
        # Remove any remaining special characters, punctuation, or whitespaces
        cleaned_text = ' '.join(cleaned_text.split())
        cleaned_list.append(cleaned_text)
    
    return cleaned_list

In [56]:
def predict_category(list_of_texts, model_path, category_name):
#     cleaned_texts = [clean_text(text) for text in list_of_texts]
    list_of_texts = [x.lower() for x in list_of_texts]
    cleaned_texts = clean_text(list_of_texts)
    # Set the random seed for NumPy
    np.random.seed(42)

    # Set the random seed for PyTorch
    torch.manual_seed(42)

    # Load the model and configuration from the pickle file
    with open(model_path, "rb") as f:
        config, state_dict = torch.load(f)

    # Initialize a new model object with the loaded configuration
    model = BertForSequenceClassification(config)

    # Load the saved state dictionary into the model
    model.load_state_dict(state_dict)

    # Load the tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Prepare the input data
    encodings = tokenizer(cleaned_texts, truncation=True, padding=True)

    # Create the dataset
    dataset = torch.utils.data.TensorDataset(torch.tensor(encodings['input_ids']), torch.tensor(encodings['attention_mask']))

    # Create the DataLoader for the new data
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

    predictions = []
    probabilities = []

    with torch.no_grad():
        model.eval()

        for batch in loader:
            input_ids, attention_mask = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            probabilities.extend(probs[:, 1].tolist())
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    # Create a new DataFrame with the predictions and probabilities
    new_data = pd.DataFrame()
    new_data['text'] = cleaned_texts
    new_data['predictions'] = pd.Series(predictions).apply(lambda x: category_name if x==1 else 'Other').values
    new_data['probability'] = probabilities

    return new_data

## Elder care prediction

In [57]:
list_of_texts = ["HmCstmElderCarePlusLandingPageOpen", "senior-citizen?care expense reimbursement", 
                 "Grey generation care home", "elderly*care plus","golden(agers care service required",
                 "aging care home", "retirees care reimbirsement", "third age population care community home",
                 "third age community care home","age related care home","care for old","care for older people",
                 "senior assistance required","Daycare@expense reimbursement","adult daycare required",
                 "baby care licensed","olders care home required", "oldsters care home","geriatric care home",
                 "contentPage 2023 Eldercare!!!!!!!!!!!!!****@_Subsidy","Elder statesmen care",
                 "Elder women care","Silver generation care",
                 "contentPage {}[]/\|?><,.;:!@#+\t\n\r\f\v 2023 Elder care Subsidy","gerontology care",
                 "Elderly Care Plus Information", "care for older people","age related care" ]

len(list_of_texts)

28

In [58]:
df = predict_category(list_of_texts, model_path="EC_model_outer_combined_texts_data_v5/EC_model.pth", category_name='Elder care')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['text'] = df3['text'].apply(filter_named_entities)


In [59]:
df

Unnamed: 0,text,predictions,probability
0,hmcstmeldercarepluslandingpageopen,Other,7.6e-05
1,senior citizen care expense reimbursement,Elder care,0.999992
2,elderly care plus,Elder care,0.999992
3,golden agers care service required,Other,0.000124
4,aging care home,Elder care,0.999991
5,retirees care reimbirsement,Elder care,0.999963
6,age related care home,Elder care,0.999798
7,care for old,Elder care,0.998361
8,care for older people,Elder care,0.999992
9,senior assistance required,Elder care,0.999971


In [60]:
df['text'].to_list()

['hmcstmeldercarepluslandingpageopen',
 'senior citizen care expense reimbursement',
 'elderly care plus',
 'golden agers care service required',
 'aging care home',
 'retirees care reimbirsement',
 'age related care home',
 'care for old',
 'care for older people',
 'senior assistance required',
 'daycare expense reimbursement',
 'adult daycare required',
 'baby care licensed',
 'olders care home required',
 'oldsters care home',
 'elder statesmen care',
 'elder women care',
 'silver generation care',
 'gerontology care',
 'elderly care plus information',
 'age related care',
 'third age population care community home',
 'third age community care home',
 'contentpage 2023 eldercare subsidy',
 'contentpage 2023 elder care subsidy']

### Child care prediction

In [61]:
list_of_texts_cc = ["!@#$%^&*()_+"," ","_","","","HmCstmChildCarePlusLandingPageOpen", 
                    "foster care expense reimbursement"
                    ,"pre school",
                 "foster care reimbursement","require nursery school for child","want after school care",
                "infant care plus","want info about day nursery",
                 "day care reimbirsement", "kindergarten facility required","looking for infant school",
                "Day care reimbursement", "Daycare reimbursement", "Daycare expense reimbursement", 
                    "baby care licensed",
                  "@#^&*contentPage 2023 children care", "!!!!!!!!!!!!!****@_looking for creche",
                 "want childcarer","require info about playgroup","want someone for child minding",
                 "looking for baby sitter", "{}[]/\|?><,.;:!@#+\t\n\r\f\v women for nanny care", 
                    "someone require for child supervision",
                 "want someone for toddler care"]
len(list_of_texts_cc2)

29

In [62]:
df2 = predict_category(list_of_texts_cc2, model_path="CC_model_outer_combined_texts_data_v3/CC_model.pth", category_name='Child care')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['text'] = df3['text'].apply(filter_named_entities)


In [63]:
df2

Unnamed: 0,text,predictions,probability
0,hmcstmchildcarepluslandingpageopen,Child care,0.910009
1,foster care expense reimbursement,Child care,0.999994
2,pre school,Child care,0.999932
3,foster care reimbursement,Child care,0.999995
4,require nursery school for child,Child care,0.999987
5,want after school care,Child care,0.999994
6,infant care plus,Child care,0.959637
7,day care reimbirsement,Child care,0.999995
8,kindergarten facility required,Child care,0.999962
9,looking for infant school,Other,0.000182


In [64]:
df2['text'].to_list()

['hmcstmchildcarepluslandingpageopen',
 'foster care expense reimbursement',
 'pre school',
 'foster care reimbursement',
 'require nursery school for child',
 'want after school care',
 'infant care plus',
 'day care reimbirsement',
 'kindergarten facility required',
 'looking for infant school',
 'day care reimbursement',
 'baby care licensed',
 'looking for creche',
 'want childcarer',
 'require info about playgroup',
 'want someone for child minding',
 'looking for baby sitter',
 'women for nanny care',
 'someone require for child supervision',
 'want someone for toddler care',
 'want info about day nursery',
 'reimbursement',
 'expense reimbursement',
 'contentpage 2023 children care']