In [1]:
from sentence_transformers import SentenceTransformer


In [2]:
### Load all relevant libraries

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import KFold
from iteration_utilities import deepflatten

In [3]:
### Load the sentence transformer model that transforms text into a numerical representation 
sbert_model = SentenceTransformer('nli-bert-large-max-pooling')


In [4]:
### Load data labelled for training
# data_positive=pd.read_csv('/Users/johanna/Library/CloudStorage/GoogleDrive-johanna@limbic.ai/Shared drives/05_Research/Data/Clinical/IncreasedAccess/Access_feedback_labelled_by_Johanna/Themed_Patient Feedback Comments_mixedClinics - Helpful.csv')
# data_neutral=pd.read_csv("/Users/johanna/Library/CloudStorage/GoogleDrive-johanna@limbic.ai/Shared drives/05_Research/Data/Clinical/IncreasedAccess/Access_feedback_labelled_by_Johanna/Themed_Patient Feedback Comments_mixedClinics - Needed More.csv")
# data_negative=pd.read_csv("/Users/johanna/Library/CloudStorage/GoogleDrive-johanna@limbic.ai/Shared drives/05_Research/Data/Clinical/IncreasedAccess/Access_feedback_labelled_by_Johanna/Themed_Patient Feedback Comments_mixedClinics - Not Helpful.csv")

# this is not made available due to privacy and consent of the participants!

# here a dataset with positive feedback is loaded
data_positive=data_positive[data_positive['themes'].isna()==False]
data_positive.reset_index(inplace=True)
data_positive['feedback_type'] = 'positive'

# here a dataset with neutral feedback is loaded
data_neutral=data_neutral[data_neutral['themes'].isna()==False]
data_neutral.reset_index(inplace=True)
data_neutral['themes'][data_neutral['themes']=="undefined"]="neutral undefined"
data_neutral['feedback_type'] = 'neutral'

# here a dataset with negative feedback is loaded
data_negative=data_negative[data_negative['themes'].isna()==False]
data_negative.reset_index(inplace=True)
data_negative['themes'][data_negative['themes']=="undefined"]="negative undefined"
data_negative['feedback_type'] = 'negative'

data=pd.concat([data_positive, data_neutral,data_negative], axis=0)
data.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_neutral['themes'][data_neutral['themes']=="undefined"]="neutral undefined"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_negative['themes'][data_negative['themes']=="undefined"]="negative undefined"


In [5]:
### Check frequency of each label category
data['themes'].value_counts()

Help at the time of need that was given fast and easy to use                 139
A hopeful first-step to care that was thoughtfully designed                  121
Provided a self realisation of their current situation                        90
A friendly companion that removes anxiety of talking to humans                86
They felt that they were in crisis and needed immediate attention.            75
negative undefined                                                            70
As they needed to talk about specific illnesses or other mental disorders     61
neutral undefined                                                             59
As they expected to start therapy faster                                      51
Name: themes, dtype: int64

In [6]:
### Transform text into numerical representation (i.e. apply the sentence embedding)

vec_text=sbert_model.encode(data['body'], show_progress_bar=True)


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

In [7]:
### Reduce dimensionality of sentence embedding (i.e. from 1024 to 50 dimensions)

pca = PCA(n_components=100, random_state=42)
pca_embeddings = pca.fit_transform(vec_text)


In [8]:
### add the feedback type to the input data

def one_hot_encode(data): 
    data=pd.get_dummies(data, columns=['feedback_type'])
    return data

data=one_hot_encode(data)
input_data=np.concatenate([pca_embeddings, data[['feedback_type_negative', 'feedback_type_neutral', 'feedback_type_positive']].values], axis=1)



In [9]:
### Map categorical labels (i.e. text) into numerical labels

mapper={a:i for i, a in enumerate(data['themes'].unique())}
labels=data['themes'].map(lambda x: mapper[x]).values


In [10]:
### Define the number of slpits for the crossvalidation (here 100)
kf=KFold(n_splits=100)

### Define the classification model used for the multi-class classification (here a simple logistic regression)
estimator = LogisticRegression(multi_class="multinomial", random_state=42, max_iter=1000)

y_pred=[]
ground_truth=[]
f1_score_fold=[]
### Split the data into training and testing data set
for i, (train_index, test_index) in enumerate(kf.split(input_data)):
    X_train, X_test = input_data[train_index], input_data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    ### Fit the model 
    estimator.fit(X_train,y_train)

    ### get model prediction for the test data 
    y_hat=estimator.predict(X_test)

    ### Save data
    y_pred.append(y_hat)
    ground_truth.append(y_test)
    f1_score_fold.append(f1_score(y_test,y_hat, average='micro'))


### bring saved data into the right format
y_pred=np.array(list(deepflatten(y_pred)))
ground_truth=np.array(list(deepflatten(ground_truth)))
f1_score_fold=np.array(f1_score_fold)


In [11]:
### Evaluate model performance

print(f1_score_fold.mean()) # micro-average F1 score

print(f"F1-score: {f1_score(ground_truth,y_pred, average=None)}")
print(f"Recall-score: {recall_score(ground_truth,y_pred, average=None)}")
print(f"Precision-score: {precision_score(ground_truth,y_pred, average=None)}")


0.6623214285714285
F1-score: [0.65853659 0.67455621 0.74860335 0.61754386 0.57142857 0.72897196
 0.71666667 0.59090909 0.67532468]
Recall-score: [0.66942149 0.6627907  0.74444444 0.63309353 0.54237288 0.76470588
 0.70491803 0.55714286 0.69333333]
Precision-score: [0.648      0.68674699 0.75280899 0.60273973 0.60377358 0.69642857
 0.72881356 0.62903226 0.65822785]


In [12]:
### Fit final model based on all available data (this model us used for predicting labels in a new data set)

model_use = LogisticRegression(multi_class="multinomial", random_state=42, max_iter=1000)
model_use.fit(input_data,labels)


LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42)

## Load feedback data, transform text into numberical, pca, and then apply the model

In [15]:
### Load user feedback

# here a dataset with all feedback is loaded

df = pd.read_csv('/Users/johanna/Library/CloudStorage/GoogleDrive-johanna@limbic.ai/Shared drives/05_Research/Data/Clinical/IncreasedAccess/referral_feedback_from_database_16_03_2023.csv')

# change the feedback into positive, neutral and negative

mapper_feedback_type={'Yes': 'positive',
             'Needed more': 'neutral',
             "No":'negative'}

df['feedback_type']=df['isHelpful'].map(lambda x: mapper_feedback_type[x]).values

In [16]:
### Transform text into numerical representation (i.e. apply the sentence embedding)

vec_text_feedback=sbert_model.encode(df['feedback'], show_progress_bar=True)

Batches:   0%|          | 0/1443 [00:00<?, ?it/s]

In [17]:
### apply the same pca as before 

pca_embeddings_feedback = pca.transform(vec_text_feedback)


## add the feedback type
df=one_hot_encode(df)
input_data_real=np.concatenate([pca_embeddings_feedback, df[['feedback_type_negative', 'feedback_type_neutral', 'feedback_type_positive']].values], axis=1)


### predict label based on the input data

predicted_labels = model_use.predict(input_data_real)

df['label_nr'] = predicted_labels
theme_mapping = {index: name for index, name in enumerate(data['themes'].unique())}
df['label_name'] = df['label_nr'].map(lambda x: theme_mapping[x]).values

In [18]:
df['label_name'].value_counts()


Help at the time of need that was given fast and easy to use                 20733
A hopeful first-step to care that was thoughtfully designed                  11843
Provided a self realisation of their current situation                        4531
A friendly companion that removes anxiety of talking to humans                3832
neutral undefined                                                             1778
As they needed to talk about specific illnesses or other mental disorders     1705
As they expected to start therapy faster                                       674
They felt that they were in crisis and needed immediate attention.             623
negative undefined                                                             446
Name: label_name, dtype: int64

In [19]:
## save the dataset with labels

df.to_csv('/Users/johanna/Library/CloudStorage/GoogleDrive-johanna@limbic.ai/Shared drives/05_Research/Data/Clinical/IncreasedAccess/referral_feedback_from_database_16_03_2023_with_predicted_labels_new.csv')
