In [50]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier, DMatrix
from sklearn.metrics import classification_report
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from collections import defaultdict
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from tqdm import tqdm
from spacy.tokens import DocBin

In [4]:
def merge_columns(row):
    return ' '.join(str(cell) for cell in row if not pd.isna(cell))

In [3]:
NPS_CATEGORIES = pd.read_csv('/content/Categorization_NPS_DATA.csv', header = 1)
CORP_ENROLEES_RESPONSES = pd.read_csv('/content/CORP_ENROLLEES_NPS.csv')

In [5]:
NPS_CATEGORIES_COI = NPS_CATEGORIES[['Response ID', 'Broad Category', 'Sub-driver']]
CORP_ENROLEES_RESPONSES_COI = CORP_ENROLEES_RESPONSES[['Response ID', 'We are sorry. Please tell us why you chose that rating.',
       '   Amazing. Please, tell us why you chose that rating.',
       '   Please tell us why you chose that rating.']]

CORP_ENROLLEES_NPS = pd.merge(CORP_ENROLEES_RESPONSES_COI, NPS_CATEGORIES_COI,
                              left_on  = 'Response ID', right_on = 'Response ID')

In [6]:
CORP_ENROLLEES_NPS['CUSTOMER_RESPONSE'] = CORP_ENROLLEES_NPS[['We are sorry. Please tell us why you chose that rating.',
       '   Amazing. Please, tell us why you chose that rating.',
       '   Please tell us why you chose that rating.']].apply(merge_columns, axis=1)
CORP_ENROLLEES_NPS.drop(columns=['We are sorry. Please tell us why you chose that rating.',
       '   Amazing. Please, tell us why you chose that rating.',
       '   Please tell us why you chose that rating.'], inplace=True)

In [7]:
CORP_ENROLLEES_NPS = CORP_ENROLLEES_NPS[CORP_ENROLLEES_NPS['Broad Category'].notna()]

In [8]:
CORP_ENROLLEES_NPS['Broad Category'].unique()

array(['Poor RCC Quality', 'Good RCC Quality',
       'Adequate provider network', 'Great Service',
       'Poor Provider Quality', 'Good Provider Quality',
       'Delayed Medication Pickup', 'Limited Health Coverage',
       'Good consultation attitude', 'Poor customer education',
       'Limited Provider Network', 'Long Provider Wait Time',
       'Good listening skills', 'Delayed Medication Delivery',
       'Great Telemedicine', 'Care Denial', 'Test result delay',
       'Tariff issue', 'Quick medication delivery', 'Good staff attitude',
       'Delayed medication delivery', 'Poor provider quality',
       'Refund issue', 'No medication delivery', 'Telemedicine',
       'Receipt of feedback', 'Limited health coverage', 'Care denial',
       'Clean environment', 'Poor RCC quality',
       'Poor provider onboarding process', 'Sufficient Health Coverage',
       'Long provider wait time', 'Affordable care', 'Delayed refund',
       'Broad Provider Network', 'Onboarding issue',
      

# Mapping data to Topics.
===========================

### Merging Rationale:
- Healthcare Service Quality: Merges 'Quality of Care' and 'Service Aspects' as both relate to the patient's experience and satisfaction with the healthcare service's quality and responsiveness.

- Healthcare Access and Infrastructure: Combines 'Provider Network' and 'Coverage' as both are crucial for ensuring access to healthcare services and can often intersect in discussions about what services are accessible and under what conditions.

- Healthcare Administration: Keeps as a separate category since administrative issues might span various aspects, from paperwork to processing times, that are not directly related to the quality or access but to the efficiency and experience of healthcare administration.

- Telemedicine: Keeps as a separted category as we need to segment the issues related to our telemedicine services.

- Medication Related issues: This is related to the medication quality and medication delievery.

By merging these labels, we focus on broader, more impactful categories that can help in analyzing the data more effectively, especially if the original labels are too granular or if there's considerable overlap in the context they represent

In [9]:

# Mapping of labels to new categories
label_mapping = {
    **dict.fromkeys(['Good RCC Quality', 'Good Provider Quality', 'Good consultation attitude', 'Good listening skills', 'Good staff attitude'], 'Quality of Care'),
    **dict.fromkeys(['Poor RCC Quality', 'Poor RCC quality', 'Poor Provider Quality', 'Poor customer education', 'Poor provider quality', 'Poor service'], 'Quality of Care'),
    **dict.fromkeys(['Great Service', 'Great Telemedicine', 'Great telemedicine', 'Clean environment', 'Receipt of feedback'], 'Service Aspects'),
    **dict.fromkeys(['Care Denial', 'Care denial', 'Delayed Refund', 'Poor verification'], 'Service Aspects'),
    **dict.fromkeys(['Adequate provider network', 'Broad Provider Network'], 'Provider Network'),
    **dict.fromkeys(['Limited Provider Network', 'Limited provider network'], 'Provider Network'),
    **dict.fromkeys(['Delayed Medication Pickup', 'Delayed Medication Delivery', 'Delayed medication delivery', 'Quick medication delivery', 'No medication delivery'], 'Medication-Related Issues'),
    'Medication Quality': 'Medication-Related Issues',
    **dict.fromkeys(['Telemedicine', 'Great Telemedicine', 'Great telemedicine'], 'Telemedicine'),
    **dict.fromkeys(['Sufficient Health Coverage', 'Affordable care'], 'Coverage'),
    **dict.fromkeys(['Limited Health Coverage', 'Limited health coverage'], 'Coverage'),
    **dict.fromkeys(['Long Provider Wait Time', 'Long provider wait time', 'Short Provider Wait Time', 'Poor provider onboarding process', 'Onboarding issue', 'Onboarding family issue', 'Receipt of feedback', 'Test result delay'], 'Administrative and Process Issues'),
    **dict.fromkeys(['Tariff issue', 'Refund issue', 'Delayed refund'], 'Administrative and Process Issues'),
    'App issues': 'Service Aspects'
}

# Replacing the original labels with the new categories in the DataFrame
CORP_ENROLLEES_NPS['Main Topics'] = CORP_ENROLLEES_NPS['Broad Category'].replace(label_mapping)

In [10]:
label_mapping = {
    'Administrative and Process Issues': 'Healthcare Administration',
    'Coverage': 'Healthcare Access and Infrastructure',
    'Medication-Related Issues': 'Medication-Related Issues',
    'Provider Network': 'Healthcare Access and Infrastructure',
    'Quality of Care': 'Healthcare Service Quality',
    'Service Aspects': 'Healthcare Service Quality',
    'Telemedicine': 'Telemedicine'
}

# Apply the mapping to create a new column with merged labels
CORP_ENROLLEES_NPS['Merged Label'] = CORP_ENROLLEES_NPS['Main Topics'].map(label_mapping)

In [12]:
CORP_ENROLLEES_NPS['Merged Label'].unique()

array(['Healthcare Service Quality',
       'Healthcare Access and Infrastructure',
       'Medication-Related Issues', 'Healthcare Administration',
       'Telemedicine'], dtype=object)

In [14]:
nltk.download('stopwords')
nltk.download('wordnet')
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = text.split()
    # Remove stopwords and lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [15]:
CORP_ENROLLEES_NPS['Clean_Response'] = [clean_text(text) for text in CORP_ENROLLEES_NPS['CUSTOMER_RESPONSE']]

#Modelling

In [16]:
label_encoder = LabelEncoder()
CORP_ENROLLEES_NPS['category_encoded'] = label_encoder.fit_transform(CORP_ENROLLEES_NPS['Merged Label'])
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
clean_vectors = vectorizer.fit_transform(CORP_ENROLLEES_NPS['Clean_Response'])

In [18]:
xgb_model = XGBClassifier(use_label_encoder=False)
param_grid = {'objective': 'multi:softprob', 'num_class': 5, 'min_child_weight': 5, 'max_depth': 7, 'learning_rate': 0.01, 'lambda': 1, 'gamma': 0, 'eval_metric': 'mlogloss', 'colsample_bytree': 0.7, 'alpha': 0}

In [19]:
X_train, X_test, y_train, y_test = train_test_split(clean_vectors, CORP_ENROLLEES_NPS['category_encoded'], test_size=0.2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=0.2)

dtrain = DMatrix(X_train2, label=y_train2)
dtest = DMatrix(X_test2, label=y_test2)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

dtrain = DMatrix(X_train2, label=y_train2)
dtest = DMatrix(X_test2, label=y_test2)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

In [20]:
trained_model = xgb.train(
                param_grid
                ,dtrain
                ,evals = evallist
                ,num_boost_round=2000
                ,early_stopping_rounds=10)

[0]	train-mlogloss:1.59411	eval-mlogloss:1.59521
[1]	train-mlogloss:1.57831	eval-mlogloss:1.58102
[2]	train-mlogloss:1.56314	eval-mlogloss:1.56720
[3]	train-mlogloss:1.54804	eval-mlogloss:1.55327
[4]	train-mlogloss:1.53503	eval-mlogloss:1.54121
[5]	train-mlogloss:1.52162	eval-mlogloss:1.52912
[6]	train-mlogloss:1.50731	eval-mlogloss:1.51590
[7]	train-mlogloss:1.49421	eval-mlogloss:1.50401
[8]	train-mlogloss:1.48055	eval-mlogloss:1.49138
[9]	train-mlogloss:1.46793	eval-mlogloss:1.47983
[10]	train-mlogloss:1.45516	eval-mlogloss:1.46811
[11]	train-mlogloss:1.44260	eval-mlogloss:1.45645
[12]	train-mlogloss:1.43015	eval-mlogloss:1.44488
[13]	train-mlogloss:1.41833	eval-mlogloss:1.43394
[14]	train-mlogloss:1.40672	eval-mlogloss:1.42347
[15]	train-mlogloss:1.39440	eval-mlogloss:1.41228
[16]	train-mlogloss:1.38266	eval-mlogloss:1.40171
[17]	train-mlogloss:1.37214	eval-mlogloss:1.39227
[18]	train-mlogloss:1.36099	eval-mlogloss:1.38230
[19]	train-mlogloss:1.34994	eval-mlogloss:1.37224
[20]	train

In [22]:
y_pred = trained_model.predict(DMatrix(X_test)).argmax(axis=1)
y_pred_train = trained_model.predict(DMatrix(X_train)).argmax(axis=1)

print("XGBoost Classification Report with Evaluation Set:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\n\nXGBoost Classification Report for training set:")
print(classification_report(y_train, y_pred_train, target_names=label_encoder.classes_))

XGBoost Classification Report with Evaluation Set:
                                      precision    recall  f1-score   support

Healthcare Access and Infrastructure       0.00      0.00      0.00        13
           Healthcare Administration       0.50      0.28      0.36        25
          Healthcare Service Quality       0.84      0.98      0.90       194
           Medication-Related Issues       0.52      0.58      0.55        24
                        Telemedicine       1.00      0.07      0.13        14

                            accuracy                           0.79       270
                           macro avg       0.57      0.38      0.39       270
                        weighted avg       0.75      0.79      0.74       270



XGBoost Classification Report for training set:
                                      precision    recall  f1-score   support

Healthcare Access and Infrastructure       0.79      0.27      0.41        55
           Healthcare Administration 

#Sentiment Analysis

In [23]:
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [24]:
def classify_sentiment(sentence):
    sentiment_scores = analyzer.polarity_scores(sentence)
    if sentiment_scores['compound'] >= 0.05:
        return 'Positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'Negative'
CORP_ENROLLEES_NPS['SENTIMENT_CLASSIFICATION'] = CORP_ENROLLEES_NPS['CUSTOMER_RESPONSE'].apply(classify_sentiment)

In [25]:
CORP_ENROLLEES_NPS['SENTIMENT_CLASSIFICATION'].value_counts()

Positive    749
Negative    209
Name: SENTIMENT_CLASSIFICATION, dtype: int64

##Subtopics Classification

In [30]:
CORP_ENROLLEES_NPS = CORP_ENROLLEES_NPS[CORP_ENROLLEES_NPS['Sub-driver'].notna()]

In [31]:
def clean_and_group_subdrivers(text):
    cleaned_text = text.lower().replace('-', ' ').replace('/', ' ').replace('_', ' ')
    cleaned_text = ' '.join(cleaned_text.split())

    if (("inadequate health benefits" in cleaned_text) or ("limited understanding of health benefits" in cleaned_text) or ("limited health benefit" in cleaned_text) or ("adequate health benefits" in cleaned_text) or ("sufficient health benefits" in cleaned_text) or ('consultation time too short' in cleaned_text)):
        return "health benefits"
    elif ("pa code" in cleaned_text):
        return "pa code"
    elif (("short wait time" in cleaned_text ) or ('long wait time' in cleaned_text)):
      return "waiting time"
    elif (('swift response' in cleaned_text) or ('not responsive' in cleaned_text) or ('delayed response' in cleaned_text)):
      return "Responsivenss"
    elif(('cheap medication' in cleaned_text) or ('quality medication' in cleaned_text) or ('delayed medication pickup' in cleaned_text) or ('medication pickup issue' in cleaned_text) or ('quick medication delivery'in cleaned_text) or ('delayed medication delivery' in cleaned_text) or ('substandard medication' in cleaned_text)):
      return 'medication'
    elif(('delayed feedback' in cleaned_text) or ('poor feedback tat' in cleaned_text) or ('lack of feedback' in cleaned_text)):
      return "feedback"
    elif (( 'poor issue resolution' in cleaned_text) or ('poor resolution process' in cleaned_text)):
      return "resolution"
    elif(('lack of knowledge of delivery status' in cleaned_text) or  ('lack of delivery status' in cleaned_text)):
      return "delivery status"
    elif(('poor staff attitude' in cleaned_text)  or ('poor treatment' in cleaned_text) or ('good staff attitude' in cleaned_text)):
      return "staff attitude"
    else:
      return cleaned_text
# Kolade's feedback: Merge PA_CODE with waiting time.
# Apply the function to the 'Sub-driver' column and create a new 'Grouped_Sub-driver' column
CORP_ENROLLEES_NPS['Grouped_Sub-driver'] = CORP_ENROLLEES_NPS['Sub-driver'].apply(clean_and_group_subdrivers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CORP_ENROLLEES_NPS['Grouped_Sub-driver'] = CORP_ENROLLEES_NPS['Sub-driver'].apply(clean_and_group_subdrivers)


In [34]:
class_counts = CORP_ENROLLEES_NPS['Grouped_Sub-driver'].value_counts()
classes_to_keep = class_counts[class_counts > 20].index.tolist()
filtered_NPS_data = CORP_ENROLLEES_NPS[CORP_ENROLLEES_NPS['Grouped_Sub-driver'].isin(classes_to_keep)]

In [40]:
clean_vectors = vectorizer.fit_transform(filtered_NPS_data['CUSTOMER_RESPONSE'])
label_encoder = LabelEncoder()
filtered_NPS_data['filtered_category_encoded'] = label_encoder.fit_transform(filtered_NPS_data['Grouped_Sub-driver'])
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_NPS_data['filtered_category_encoded'] = label_encoder.fit_transform(filtered_NPS_data['Grouped_Sub-driver'])


In [41]:
param_grid = {'objective': 'multi:softprob', 'num_class': 7, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'lambda': 1, 'gamma': 0, 'eval_metric': 'mlogloss', 'colsample_bytree': 0.7, 'alpha': 0}

In [42]:
X_train, X_test, y_train, y_test = train_test_split(clean_vectors, filtered_NPS_data['filtered_category_encoded'], test_size=0.2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=0.2)

dtrain = DMatrix(X_train2, label=y_train2)
dtest = DMatrix(X_test2, label=y_test2)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

dtrain = DMatrix(X_train2, label=y_train2)
dtest = DMatrix(X_test2, label=y_test2)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

In [43]:
trained_model = xgb.train(
                param_grid
                ,dtrain
                ,evals = evallist
                ,num_boost_round=2000
                ,early_stopping_rounds=10)

[0]	train-mlogloss:1.92734	eval-mlogloss:1.92976
[1]	train-mlogloss:1.90939	eval-mlogloss:1.91409
[2]	train-mlogloss:1.89515	eval-mlogloss:1.90229
[3]	train-mlogloss:1.87992	eval-mlogloss:1.88863
[4]	train-mlogloss:1.86454	eval-mlogloss:1.87468
[5]	train-mlogloss:1.84973	eval-mlogloss:1.86334
[6]	train-mlogloss:1.83449	eval-mlogloss:1.85063
[7]	train-mlogloss:1.81952	eval-mlogloss:1.83859
[8]	train-mlogloss:1.80447	eval-mlogloss:1.82626
[9]	train-mlogloss:1.79141	eval-mlogloss:1.81617
[10]	train-mlogloss:1.77920	eval-mlogloss:1.80693
[11]	train-mlogloss:1.76453	eval-mlogloss:1.79433
[12]	train-mlogloss:1.75233	eval-mlogloss:1.78340
[13]	train-mlogloss:1.73789	eval-mlogloss:1.77088
[14]	train-mlogloss:1.72584	eval-mlogloss:1.76223
[15]	train-mlogloss:1.71225	eval-mlogloss:1.75126
[16]	train-mlogloss:1.69958	eval-mlogloss:1.74072
[17]	train-mlogloss:1.68726	eval-mlogloss:1.73065
[18]	train-mlogloss:1.67468	eval-mlogloss:1.71966
[19]	train-mlogloss:1.66216	eval-mlogloss:1.70983
[20]	train

In [44]:
y_pred_train = trained_model.predict(DMatrix(X_train)).argmax(axis=1)
print("XGBoost Classification Report for training set:")
print(classification_report(y_train, y_pred_train, target_names=label_encoder.classes_))

XGBoost Classification Report for training set:
                 precision    recall  f1-score   support

  Responsivenss       0.91      0.97      0.94       276
       feedback       0.92      0.48      0.63        25
health benefits       0.80      0.43      0.56        28
     medication       0.81      0.89      0.85       120
        pa code       0.80      0.83      0.81        76
 staff attitude       0.95      0.78      0.86        27
   waiting time       0.87      0.68      0.76        19

       accuracy                           0.87       571
      macro avg       0.87      0.72      0.77       571
   weighted avg       0.87      0.87      0.86       571



In [47]:
y_pred = trained_model.predict(DMatrix(X_test)).argmax(axis=1)

# Print classification report
print("XGBoost Classification Report with Evaluation Set:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

XGBoost Classification Report with Evaluation Set:
                 precision    recall  f1-score   support

  Responsivenss       0.84      0.97      0.90        74
       feedback       0.50      0.14      0.22         7
health benefits       0.50      0.14      0.22         7
     medication       0.89      0.78      0.83        32
        pa code       0.52      0.69      0.59        16
 staff attitude       0.50      0.50      0.50         4
   waiting time       0.00      0.00      0.00         3

       accuracy                           0.78       143
      macro avg       0.54      0.46      0.47       143
   weighted avg       0.75      0.78      0.75       143



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# Identify the indices where predictions do not match actual labels
incorrect_indices = np.where(y_pred != y_test)[0]

# Print out the misclassified sentences along with predicted and actual labels
print(f"\nTotal incorrect predictions: {len(incorrect_indices)}")
for index in incorrect_indices:
    print(f"Sentence: '{responses_test.iloc[index]}'")
    print(f"Predicted: '{label_encoder.inverse_transform([y_pred[index]])[0]}', Actual: '{label_encoder.inverse_transform([y_test.iloc[index]])[0]}'\n")


## NPS wrong answers analyzing

In [49]:


label_encoder = LabelEncoder()
filtered_NPS_data['filtered_category_encoded'] = label_encoder.fit_transform(filtered_NPS_data['Grouped_Sub-driver'])
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
clean_vectors = vectorizer.fit_transform(filtered_NPS_data['CUSTOMER_RESPONSE'])
X_train, X_test, y_train, y_test, responses_train, responses_test = train_test_split(
    clean_vectors,
    filtered_NPS_data['filtered_category_encoded'],
    filtered_NPS_data['CUSTOMER_RESPONSE'],
    test_size=0.2,
    random_state=42
)

X_train2, X_test2, y_train2, y_test2, responses_train2, responses_test2 = train_test_split(
    X_train,
    y_train,
    responses_train,
    test_size=0.2,
    random_state=42
)

dtrain = DMatrix(X_train2, label=y_train2)
dtest = DMatrix(X_test2, label=y_test2)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

# Training the XGBoost model
trained_model = xgb.train(
    param_grid,
    dtrain,
    evals=evallist,
    num_boost_round=2000,
    early_stopping_rounds=10
)

# Calculate predictions for the test set
y_pred = trained_model.predict(DMatrix(X_test)).argmax(axis=1)

# Print classification report
print("XGBoost Classification Report with Evaluation Set:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Identify the indices where predictions do not match actual labels
incorrect_indices = np.where(y_pred != y_test)[0]

# Print out the misclassified sentences along with predicted and actual labels
print(f"\nTotal incorrect predictions: {len(incorrect_indices)}")
for index in incorrect_indices:
    print(f"Sentence: '{responses_test.iloc[index]}'")
    print(f"Predicted: '{label_encoder.inverse_transform([y_pred[index]])[0]}', Actual: '{label_encoder.inverse_transform([y_test.iloc[index]])[0]}'\n")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_NPS_data['filtered_category_encoded'] = label_encoder.fit_transform(filtered_NPS_data['Grouped_Sub-driver'])


[0]	train-mlogloss:1.92758	eval-mlogloss:1.92826
[1]	train-mlogloss:1.90983	eval-mlogloss:1.91130
[2]	train-mlogloss:1.89684	eval-mlogloss:1.89980
[3]	train-mlogloss:1.88176	eval-mlogloss:1.88588
[4]	train-mlogloss:1.86562	eval-mlogloss:1.87170
[5]	train-mlogloss:1.85123	eval-mlogloss:1.85948
[6]	train-mlogloss:1.83623	eval-mlogloss:1.84531
[7]	train-mlogloss:1.82182	eval-mlogloss:1.83229
[8]	train-mlogloss:1.80701	eval-mlogloss:1.81846
[9]	train-mlogloss:1.79421	eval-mlogloss:1.80807
[10]	train-mlogloss:1.78231	eval-mlogloss:1.79802
[11]	train-mlogloss:1.76754	eval-mlogloss:1.78423
[12]	train-mlogloss:1.75517	eval-mlogloss:1.77305
[13]	train-mlogloss:1.74079	eval-mlogloss:1.76009
[14]	train-mlogloss:1.72933	eval-mlogloss:1.75092
[15]	train-mlogloss:1.71601	eval-mlogloss:1.73889
[16]	train-mlogloss:1.70357	eval-mlogloss:1.72714
[17]	train-mlogloss:1.69121	eval-mlogloss:1.71568
[18]	train-mlogloss:1.67895	eval-mlogloss:1.70412
[19]	train-mlogloss:1.66695	eval-mlogloss:1.69310
[20]	train

# Analyzing the wrong classied responses.

### Problem 1:  data falls between two categories.

It's mixed between both classes, thus the op should be the highest two categories.. not only the first.

In [None]:
# Sentence: 'I choose that because when my family go to hospital and see doctor and when prescription comes out, the hospital will start saying that HOM is not fully covered and sometimes low grade drugs were given to them'
# Predicted: 'medication', Actual: 'health benefits'


# Sentence: 'I’ve been an advocate of Reliance HMO, however over the last couple of visits to the hospital, the time it takes for your officers to respond nor provide authorization codes is quite alarming. We’ve had to spent close to an hour waiting for codes, even after several personal phone calls to the customer service personnel.

# In addition to the above, there has been issues regarding reliance HMo and the hospital negotiating over the price of drugs, which has seen us spend about 50mins waiting for the needless negotiations putting into consideration that the patient was in a dire situation. On one of those occasions we had to pay out of pocket to get the drugs sorted.

# Furthermore, IT glitches on your mobile app, this app was one of the season why I convinced my organization to join amongst other benefits, however the recent glitches is becoming one too many which has affected the way I view your company.'
# Predicted: 'medication', Actual: 'feedback'

# Sentence: '* Late response from your Doctor and support agent
# * Wrong prisciption of drugs
# * Long delay in medication pick up sms'
# Predicted: 'Responsivenss', Actual: 'waiting time'

# Sentence: 'Bcs I went to the hospital 2 days ago, and you promise to get back to me on the prescription prescribed to me by the doctor.till now, hv not gotten any information about it.

# Thank you'
# Predicted: 'waiting time', Actual: 'feedback'

### Problem 2: incorrectly labelled data.

In [None]:

# Sentence: 'Quick service received since I have been using Reliance HMO'
# Predicted: 'Responsivenss', Actual: 'pa code'

# Sentence: 'My first experience with your HMO wasn't encouraging. I picked a Pharmacy to pick-up my prescribed medication. On getting there, I was told they had informed the HMO that the drugs were not available but could be made available before my arrival if only they sent the amount. Up to the closing time of the day, the pharmacy was not contacted again, meanwhile I was sent a message to pick up the drugs there. I was very embarrassing that day.
# At another time, I was told my drug would be delivered to me at my work station, I waited for more than 24hours before getting it after engaging the chat room over and over again.

# As at this moment, the drug prescription by a doctor yesterday which I requested before 12 noon has not yet been delivered to me till now.

# You can verify all my claims please'
# Predicted: 'feedback', Actual: 'medication'


# Sentence: 'i dont know if it applies to my package only but multi-vitamins are not been giving.also, delay in granting authorization'
# Predicted: 'medication', Actual: 'pa code'

# Sentence: 'The response to the Drs or the hospital is not as faster as to the need of a patient,also response and delivery of medications is not as prompt as expected'
# Predicted: 'Responsivenss', Actual: 'medication'



# Sentence: 'Slow and delayed response, delay delivery'
# Predicted: 'Responsivenss', Actual: 'medication'

# Sentence: 'you are bad at your service for your client who requested for a dermatologist got prescription without knowing how critical it took one month no response till date.'
# Predicted: 'Responsivenss', Actual: 'feedback'

# Sentence: 'never had any issues at the hospital and response time is super fast'
# Predicted: 'Responsivenss', Actual: 'pa code'

# Sentence: 'The response time on consult a doctor on the app or web takes a long time waiting for the doctor'
# Predicted: 'Responsivenss', Actual: 'waiting time'

# Sentence: 'The delay in response'
# Predicted: 'Responsivenss', Actual: 'medication'

# Sentence: 'The customer service is quite slow in terms of responding to enquries.I have said I wanted to go to a ear doctor but nothing done yet done'
# Predicted: 'waiting time', Actual: 'feedback'

# Sentence: 'deliver was slow'
# Predicted: 'Responsivenss', Actual: 'medication'



# Sentence: 'Because a times your doctors are not patience enough to listen and show Empathy,rather some can't wait to end the session with you,and also how come you don't prescribe sex enhancing drugs for your clients with such problem'
# Predicted: 'medication', Actual: 'waiting time'


# Sentence: 'Consultation service was on fast and drug pick was fast too'
# Predicted: 'Responsivenss', Actual: 'pa code'


# Sentence: 'Very slow response time'
# Predicted: 'Responsivenss', Actual: 'feedback'


### Problem 3: Bad prediction


In [None]:
# Sentence: 'Because your responding is too slow'
# Predicted: 'feedback', Actual: 'Responsivenss'

# Sentence: 'Because I have not gotten feed back based on my mail yesterday'
# Predicted: 'medication', Actual: 'feedback'

# Sentence: 'Your consultant online doctors are not nice nor efficient in their service delivery . The response time before they begin to chat with you is too long a time.
# Then also , it’s takes too long a time to deliver drugs . I didn’t get a drug until after 2 or 3 days of request ( not over the weekend o)'
# Predicted: 'medication', Actual: 'waiting time'



# Sentence: 'I like the service, but was disappointed when the clinic R.Jolad could not give 500mg. Vit. C because HMO Policy.'
# Predicted: 'medication', Actual: 'health benefits'

# Sentence: 'Delays sometimes. Not sending the code early enough.'
# Predicted: 'medication', Actual: 'pa code'


# Sentence: 'because i have been on an onboarding process with  a gym which you claim could take up to 3 months but its about 6 months now with no  result. you guys are just tossing me back and forth whenever i ask for update.'
# Predicted: 'pa code', Actual: 'feedback'

# All basic care for children has been deleted from the benefits.'
# Predicted: 'Responsivenss', Actual: 'health benefits'

# Sentence: 'Staffs are accommodating and always willing to help. Customer service and follow up is good as well'
# Predicted: 'Responsivenss', Actual: 'staff attitude'

# Sentence: 'I chose that because when something is good you comment that is good. reliance hmo is one of the best HMO so far, when you call they answer as fast as possible, so i want to use this opportunity to say a very big thank you to all the staffs of reliance hmo thanks and God bless you all amen.'
# Predicted: 'health benefits', Actual: 'staff attitude'


# Sentence: 'My son was ill and ended up paying all the bills'
# Predicted: 'medication', Actual: 'health benefits'

# Sentence: 'I was recommended for a PTA and tympomometry test. Shockingly i was told this was not covered by Reliance.'
# Predicted: 'medication', Actual: 'health benefits'

# Sentence: 'The experience I had with you guys when I was admitted was terrible. I was in excruciating pain and for four days Reliance did not approve Oral Morphine drug for the hospital to manage the postop pain I was having. I had to resort to verbally abusing your customer care representative which was unlike me.'
# Predicted: 'feedback', Actual: 'health benefits'


# Sentence: 'I CHOSE IT BECAUSE ANY TIME I GO FOR TREATMENT OR TAKE ANY OF MY FAMILY THERE FOR TREATMENT, THE RECEPTION IS ALWAYS GREAT AND THEY ARE ALWAYS QUICK TO ATTEND TO US WITHOUT ANY DELAY.'
# Predicted: 'health benefits', Actual: 'staff attitude'

# Sentence: 'Your customer service and feedback/update is poor'
# Predicted: 'Responsivenss', Actual: 'feedback'

# Sentence: 'Please do include dental and eye care into your area of concern.'
# Predicted: 'medication', Actual: 'health benefits'


# Sentence: 'Just refund me my money for the MMR covered by my organization’s package'
# Predicted: 'health benefits', Actual: 'feedback'

# Sentence: 'Efficiency and the timely responses of the customer care staff.  Also available of wide range of hospital for customers access'
# Predicted: 'medication', Actual: 'Responsivenss'


# Sentence: 'The team doesn't have enough secondary health care providers. The sole focus is on medical issues that I really do not need. Your facilities provide low quality eye glasses and might need to pay as much as 50K in addition to get what suits you.
# What I'm about to say now is outside the scope of the services you provide, but I would love if  you could partner up with sport facilities like swimming centres not just gyms.

# Thank you.
# My regards'
# Predicted: 'medication', Actual: 'health benefits'

### Problem 4: bad prediction and bad labelling!

In [None]:
# Sentence: 'I am really not happy today, because I spoke to an online Doctor to refer me to another hospital for my test instead of Jolad but no response from any of the Doctor I chatted online. I went to R. Jolad hospital today since 9:00 am from my office and they took almost all my time and yet no drugs was given to me because of the delay in response with the approval. I had to get back to work by 4:00pm, I am not just happy. I am not happy, just know it.'
# Predicted: 'waiting time', Actual: 'pa code'

# Sentence: 'The time use in dispensing drugs is too long.
# The drugs some pharmacy do give a times is not encouraging'
# Predicted: 'medication', Actual: 'pa code'

# Sentence: 'Because of the service responses and delivery'
# Predicted: 'medication', Actual: 'Responsivenss'

# Sentence: 'You don’t have good hospitals, spa or gym in my location. I have sent names of the ones close by, none have been onboarded.'
# Predicted: 'medication', Actual: 'pa code'

# Sentence: 'Your parameters for determining what tests you pay for are ridiculous. If a Doctor from a hospital in your database prescribes a test, why would you say the diagnosis and the test do not match? Are you saying you don't trust the Doctor? If you don't, why then would you allow me to be treated by that Doctor? It just shows me that you don't really care about my well-being. You guys just do lip service. Ridiculous.


# Another thing is that your pharmacy operations are slow and unreliable. Sometimes, you don't have the drug. Other times, you deliver four days after the drug was prescribed. It's like your customers' health is a joke to you. Just pay the hospital to get the drugs for patients immediately. But no, you won't do that. Only God knows why. Again, ridiculous.

# My hope is to be free from your service soon.'
# Predicted: 'waiting time', Actual: 'health benefits'

# Sentence: 'I recently had an unsatisfactory experience with my health insurance provider that compelled me to share my concerns. The overall service has been subpar, particularly in terms of medication delivery.One of the major grievances I encountered was the consistent delay in receiving my prescribed medications. Despite assurances of timely delivery, the reality fell far short of expectations. On multiple occasions, I found myself waiting for crucial medications, causing unnecessary stress and disruptions to my health management.Moreover, the customer service offered by the insurance company was disappointingly inadequate. Attempts to address these delays and seek clarification were met with generic responses and a lack of proactive communication. It became evident that the company was not prioritizing customer satisfaction or the urgency of healthcare needs.'
# Predicted: 'medication', Actual: 'pa code'


## NER_TUNING

In [51]:
nlp = spacy.load("en_core_web_sm")

In [53]:
LABELS = ["feedback", "waiting time", "Responsiveness", "medication", "health benefits", "staff attitude", "pa code"]
TRAIN_DATA = [
    ("Because your responding is too slow", {"entities": [(0, 7, "Responsivenss")]}),
    ("Because I have not gotten feed back based on my mail yesterday", {"entities": [(34, 41, "feedback")]}),
    ("Your consultant online doctors are not nice nor efficient in their service delivery . The response time before they begin to chat with you is too long a time. Then also , it’s takes too long a time to deliver drugs . I didn’t get a drug until after 2 or 3 days of request ( not over the weekend o)", {"entities": [(199, 210, "waiting time"), (246, 254, "medication"), (48, 62, "Responsivenss"), (139, 146, "Responsivenss")]}),
    ("I like the service, but was disappointed when the clinic R.Jolad could not give 500mg. Vit. C because HMO Policy.", {"entities": [(0, 43, "medication"), (73, 85, "health benefits")]}),
    ("Delays sometimes. Not sending the code early enough.", {"entities": [(0, 6, "medication"), (15, 38, "pa code")]}),
    ("because i have been on an onboarding process with  a gym which you claim could take up to 3 months but its about 6 months now with no  result. you guys are just tossing me back and forth whenever i ask for update.", {"entities": [(87, 95, "feedback"), (50, 57, "pa code")]}),
    ("All basic care for children has been deleted from the benefits.", {"entities": [(0, 12, "Responsivenss"), (35, 49, "health benefits")]}),
    ("Staffs are accommodating and always willing to help. Customer service and follow up is good as well", {"entities": [(0, 13, "staff attitude"), (59, 75, "Responsivenss")]}),
    ("I chose that because when something is good you comment that is good. reliance hmo is one of the best HMO so far, when you call they answer as fast as possible, so i want to use this opportunity to say a very big thank you to all the staffs of reliance hmo thanks and God bless you all amen.", {"entities": [(0, 13, "staff attitude"), (135, 148, "health benefits")]}),
    ("My son was ill and ended up paying all the bills", {"entities": [(0, 19, "medication"), (44, 59, "health benefits")]}),
    ("I was recommended for a PTA and tympomometry test. Shockingly i was told this was not covered by Reliance.", {"entities": [(25, 43, "medication"), (92, 104, "health benefits")]}),
    ("The experience I had with you guys when I was admitted was terrible. I was in excruciating pain and for four days Reliance did not approve Oral Morphine drug for the hospital to manage the postop pain I was having. I had to resort to verbally abusing your customer care representative which was unlike me.", {"entities": [(105, 112, "feedback"), (147, 160, "health benefits")]}),
    ("I CHOSE IT BECAUSE ANY TIME I GO FOR TREATMENT OR TAKE ANY OF MY FAMILY THERE FOR TREATMENT, THE RECEPTION IS ALWAYS GREAT AND THEY ARE ALWAYS QUICK TO ATTEND TO US WITHOUT ANY DELAY.", {"entities": [(112, 122, "staff attitude"), (146, 160, "health benefits")]}),
    ("Your customer service and feedback/update is poor", {"entities": [(5, 22, "Responsivenss"), (27, 34, "feedback")]}),
    ("Please do include dental and eye care into your area of concern.", {"entities": [(22, 29, "medication"), (0, 21, "health benefits")]}),
    ("Just refund me my money for the MMR covered by my organization’s package", {"entities": [(0, 17, "health benefits"), (25, 41, "feedback")]}),
    ("Efficiency and the timely responses of the customer care staff. Also available of wide range of hospital for customers access", {"entities": [(0, 11, "medication"), (49, 65, "Responsivenss")]}),
    ("The team doesn't have enough secondary health care providers. The sole focus is on medical issues that I really do not need. Your facilities provide low quality eye glasses and might need to pay as much as 50K in addition to get what suits you. What I'm about to say now is outside the scope of the services you provide, but I would love if  you could partner up with sport facilities like swimming centres not just gyms. Thank you. My regards", {"entities": [(97, 111, "medication"), (0, 60, "health benefits")]}),
]

In [55]:

db = DocBin() # create a DocBin object

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

100%|██████████| 18/18 [00:00<00:00, 1455.43it/s]

Skipping entity
Skipping entity
Skipping entity





In [56]:
db.to_disk("./train.spacy") # save the docbin object

In [58]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [59]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.80    4.72    3.19    9.09    0.05
 33     200        472.79   2199.61   98.46  100.00   96.97    0.98
 74     400          0.34      2.36   98.46  100.00   96.97    0.98
124     600          0.00      0.00   98.46  100.00   96.97    0.98
187     800          0.00      0.00   98.46  100.00   96.97    0.98
262    1000          0.00      0.00   98.46  100.00   96.97    0.98
362    1200          0.00      0.00   98.46  100.00   96.97    0.98
462    1400          0.00      0.00   98.46  100.00   96.97    0.98
613    1600          0.00      0.00   98.46  100.00   96.97    0.9

In [61]:
test_data = [
    ("Your responding is too slow", {"entities": [(5, 14, "Responsiveness")]}),
    ("I haven't received any feedback yet.", {"entities": [(19, 27, "Feedback")]}),
    ("The waiting time for appointments is excessive.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("The medication prescribed was ineffective.", {"entities": [(4, 14, "Medication")]}),
    ("The health benefits provided by the plan are comprehensive.", {"entities": [(4, 17, "Health_Benefits")]}),
    ("The staff attitude was very welcoming.", {"entities": [(4, 16, "Staff_Attitude")]}),
    ("The PA code for the appointment is XYZ123.", {"entities": [(4, 11, "PA_Code")]}),
    ("The responsiveness of the customer service team was commendable.", {"entities": [(4, 17, "Responsiveness")]}),
    ("The staff were attentive and caring.", {"entities": [(4, 8, "Staff_Attitude")]}),
    ("The medication provided did not address the issue.", {"entities": [(4, 14, "Medication")]}),
    ("I appreciate the comprehensive health benefits offered.", {"entities": [(23, 36, "Health_Benefits")]}),
    ("The waiting time at the clinic was minimal.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("The feedback received was constructive.", {"entities": [(4, 11, "Feedback")]}),
    ("The PA code for the appointment was provided promptly.", {"entities": [(4, 11, "PA_Code")]}),
    ("The responsiveness of the support team was impressive.", {"entities": [(4, 17, "Responsiveness")]}),
    ("The staff demonstrated excellent attitude towards patients.", {"entities": [(4, 18, "Staff_Attitude")]}),
    ("The medication prescribed was effective in alleviating symptoms.", {"entities": [(4, 14, "Medication")]}),
    ("I'm satisfied with the health benefits provided by the insurance plan.", {"entities": [(16, 29, "Health_Benefits")]}),
    ("The waiting time for appointments needs improvement.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("I appreciate the prompt feedback on my query.", {"entities": [(23, 30, "Feedback")]}),
    ("The PA code for the appointment was not provided.", {"entities": [(4, 11, "PA_Code")]}),
    ("The responsiveness of the team needs to be enhanced.", {"entities": [(4, 17, "Responsiveness")]}),
    ("The staff exhibited a positive attitude towards patients.", {"entities": [(4, 18, "Staff_Attitude")]}),
    ("The prescribed medication had adverse side effects.", {"entities": [(4, 14, "Medication")]}),
    ("The health benefits offered by the plan are inadequate.", {"entities": [(4, 17, "Health_Benefits")]}),
    ("The waiting time at the hospital was excessive.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("The feedback provided was helpful in improving the service.", {"entities": [(4, 11, "Feedback")]}),
    ("The PA code for the appointment was incorrect.", {"entities": [(4, 11, "PA_Code")]}),
    ("The responsiveness of the team was unsatisfactory.", {"entities": [(4, 17, "Responsiveness")]}),
    ("The staff need to improve their attitude towards patients.", {"entities": [(4, 18, "Staff_Attitude")]}),
    ("The prescribed medication did not address the issue effectively.", {"entities": [(4, 14, "Medication")]}),
    ("The health benefits provided by the plan are satisfactory.", {"entities": [(4, 17, "Health_Benefits")]}),
    ("The waiting time for appointments is reasonable.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("I received prompt feedback on my query.", {"entities": [(16, 23, "Feedback")]}),
    ("The PA code for the appointment was provided without delay.", {"entities": [(4, 11, "PA_Code")]}),
    ("The team's responsiveness needs improvement.", {"entities": [(6, 19, "Responsiveness")]}),
    ("The staff's attitude towards patients needs to be more welcoming.", {"entities": [(6, 18, "Staff_Attitude")]}),
    ("The prescribed medication caused allergic reactions.", {"entities": [(4, 14, "Medication")]}),
    ("The health benefits provided by the plan are subpar.", {"entities": [(4, 17, "Health_Benefits")]}),
    ("The waiting time for appointments is too long.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("The feedback received was valuable.", {"entities": [(4, 11, "Feedback")]}),
    ("The PA code for the appointment was not provided on time.", {"entities": [(4, 11, "PA_Code")]}),
    ("The responsiveness of the team needs urgent attention.", {"entities": [(4, 17, "Responsiveness")]}),
    ("The staff's attitude towards patients was unacceptable.", {"entities": [(6, 18, "Staff_Attitude")]}),
    ("The prescribed medication did not produce the desired results.", {"entities": [(4, 14, "Medication")]}),
    ("The health benefits provided by the plan are excellent.", {"entities": [(4, 17, "Health_Benefits")]}),
    ("The waiting time for appointments is too short.", {"entities": [(4, 15, "Waiting_Time")]}),
    ("I received timely feedback on my complaint.", {"entities": [(16, 23, "Feedback")]}),
    ("The PA code for the appointment was provided promptly.", {"entities": [(4, 11, "PA_Code")]}),
    ("The team's responsiveness exceeded my expectations.", {"entities": [(6, 19, "Responsiveness")]}),
    ("The staff's attitude towards patients was commendable.", {"entities": [(6, 18, "Staff_Attitude")]}),
    ("The prescribed medication effectively treated the condition.", {"entities": [(4, 14, "Medication")]})
]


In [62]:
test_texts = [data[0] for data in test_data]


In [63]:
nlp = spacy.load("/content/output/model-best")

# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Compare extracted entities with actual entities
for text, annotations in test_data:
    print("Text:", text)
    print("Actual Entities:", annotations['entities'])
    extracted_entities = extract_entities(text)
    print("Extracted Entities:", extracted_entities)
    print()


Text: Your responding is too slow
Actual Entities: [(5, 14, 'Responsiveness')]
Extracted Entities: []

Text: I haven't received any feedback yet.
Actual Entities: [(19, 27, 'Feedback')]
Extracted Entities: []

Text: The waiting time for appointments is excessive.
Actual Entities: [(4, 15, 'Waiting_Time')]
Extracted Entities: []

Text: The medication prescribed was ineffective.
Actual Entities: [(4, 14, 'Medication')]
Extracted Entities: [('The medication', 'health benefits')]

Text: The health benefits provided by the plan are comprehensive.
Actual Entities: [(4, 17, 'Health_Benefits')]
Extracted Entities: [('The health benefits', 'health benefits')]

Text: The staff attitude was very welcoming.
Actual Entities: [(4, 16, 'Staff_Attitude')]
Extracted Entities: [('The staff attitude', 'health benefits')]

Text: The PA code for the appointment is XYZ123.
Actual Entities: [(4, 11, 'PA_Code')]
Extracted Entities: [('The PA code for', 'health benefits')]

Text: The responsiveness of the cust