In [1]:
# SVM TRAINING

In [1]:
import pandas as pd
import os
import numpy as np

import nltk.data
from nltk.tokenize import word_tokenize
import sklearn
import unidecode
import re
from gensim.models.doc2vec import TaggedDocument
from itertools import islice
from gensim.models.doc2vec import Doc2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import os
from scipy import stats


import gensim
from gensim.models import Doc2Vec

import joblib
from sklearn.metrics import roc_auc_score, roc_curve

import scipy.stats

from sklearn import svm
import joblib
from sklearn.model_selection import GridSearchCV, GroupKFold

from nltk.corpus.reader import conll

In [2]:
# Define tokenization procedure
sent_tokenizer = nltk.data.load("nltk_data/tokenizers/punkt/dutch.pickle")

def tokenize(text):
    for sentence in sent_tokenizer.tokenize(text):
        yield word_tokenize(sentence)

# Read stopwords
with open('nltk_data/corpora/stopwords/dutch') as f:
    dutch_stopwords = set(f.read().splitlines())

# Initialize stemmer (using package nltk)
stemmer = nltk.stem.snowball.DutchStemmer()

In [3]:
#################################FUNCTIONS ####################################################################
# Preprocessing for text
def text_to_words(text, filter_stopwords=True, stemming=False, filter_periods=False):

    # Lowercase and remove special characters (ë => e, etc)
    text = text.lower()
    text = unidecode.unidecode(text)

    # Remove all non space, period, lowercase
    text = re.sub(r'([^\sa-z\.]|_)+', ' ', text)

    # Remove obsolete periods
    text = re.sub(r'\s\.\s', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub('\t', ' ', text)
    text = re.sub(r' +', ' ', text)

    # Tokenize
    words = [word for sentence in tokenize(text) for word in sentence]

    # Filter stopwords
    if filter_stopwords:
        words = [word for word in words if word not in dutch_stopwords]

    # Stemming
    if stemming:
        words = [stemmer.stem(w) for w in words]

    # Filter periods
    if filter_periods:
        words = [word for word in words if word != "."]

    # Return
    return words

######################################INFERENCE ##############################################
# Convert a dataframe with texts in the 'text_column' column to a numpy array with vector representations,
# based on a paragraph2vec_model and a specified number of repetitions.
def text_to_vectors(notes_df, text_column, paragraph2vec_model, no_reps=10):

    # Output is a matrix with rows equal to number of notes, and columns equal to paragraph2vec model size
    note_vectors = np.zeros((len(notes_df), paragraph2vec_model.vector_size))

    # Iterate over all notes
    for i in notes_df.index:

        # Words are in the 'text_preprocessed' column split by whitespaces
        note_words = notes_df.loc[i, text_column].split(" ")

        # Initialize an empty vector of length paragraph2vec model size
        note_vec = np.zeros((paragraph2vec_model.vector_size))

        # Iterate over number of repetitions to cancel out inaccuracies
        for _ in range(no_reps):
            note_vec += paragraph2vec_model.infer_vector(note_words)

        # Add to note_vectors after normalizing for number of repetitions
        note_vectors[i] = (note_vec / no_reps)

    # Return output
    return note_vectors

In [4]:
paragraph2vec_model = Doc2Vec.load("models/paragraph2vec_model_all")

TypeError: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given

In [None]:
engineered_notes_train = pd.read_csv('hpc_space/MBERT5/train_data_augmented_5.csv')

# Convert text to words
engineered_notes_train['words_stemmed'] = engineered_notes_train['text'].apply(lambda x : text_to_words(x,
                                                                                    filter_stopwords=True,
                                                                                    stemming=True,
                                                                                    filter_periods=True
                                                                                    ))

# Join with whitespace
engineered_notes_train['words_stemmed'] = engineered_notes_train['words_stemmed'].apply(lambda x : ' '.join(x))

# Convert text to notes
note_vectors = text_to_vectors(engineered_notes_train, 'words_stemmed', paragraph2vec_model, 10)

# Concatenate to original dataframe
engineered_notes_train = pd.concat([engineered_notes_train, pd.DataFrame(note_vectors)], axis=1)

In [7]:
engineered_notes_train.to_csv("engineered_notes_train_augmented_5.csv")

In [8]:
engineered_notes_train = pd.read_csv("engineered_notes_train_augmented_5.csv")
engineered_notes_test = pd.read_csv("engineered_notes_test_all_trained_5.csv")

In [9]:
# Subset data from engineered_notes_train for training
X_train = engineered_notes_train[[str(a) for a in range(300)]].values
y_train = engineered_notes_train[['outcome', 'Geslacht']]

# Subset data from engineered_notes_test for testing
X_test = engineered_notes_test[[str(a) for a in range(300)]].values
y_test = engineered_notes_test[['outcome', 'Geslacht']]

In [10]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svm_params = [{
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['rbf'],

}]

pipeline = Pipeline(steps=[("preprocesser", StandardScaler()), ("clf", SVC(random_state=0, probability=True))])
grid = GridSearchCV(pipeline, svm_params, scoring='f1_macro', verbose=1, cv=3)

grid.fit(X_train, y_train['outcome'])

svm_model = grid.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [11]:
# Store predictions and true labels in a DataFrame for testing data
predictions = pd.DataFrame({
    'probability': svm_model.predict_proba(X_test)[:, 1],
    'pred_label': svm_model.predict(X_test),
    'true_label': y_test['outcome'],
    'Geslacht': y_test['Geslacht'],
    'fold_number': 1
})

predictions.to_csv(f"predictions_augmentedtrain_5.csv", sep=";", index=False)

model_filename = f"augmentedtrain_5.joblib"
joblib.dump(svm_model, model_filename)

['augmentedtrain_5.joblib']

In [12]:
final_df = pd.read_csv("predictions_augmentedtrain_5.csv", sep=";")

In [13]:
# EVALUATION

In [14]:
final_df = final_df.rename(columns={"pred_label": "final_prediction"})
final_df = final_df.rename(columns={"true_label": "label"})

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Calculate accuracy for the positive class (label = 1)
accuracy_positive = accuracy_score(final_df['label'], final_df['final_prediction'])

# Calculate precision for the positive class (label = 1)
precision_positive = precision_score(final_df['label'], final_df['final_prediction'], pos_label=1)

# Calculate recall for the positive class (label = 1)
recall_positive = recall_score(final_df['label'], final_df['final_prediction'], pos_label=1)

# Calculate F1 score for the positive class (label = 1)
f1_positive = f1_score(final_df['label'], final_df['final_prediction'], pos_label=1)

# Calculate precision for the negative class (label = 0)
precision_negative = precision_score(final_df['label'], final_df['final_prediction'], pos_label=0)

# Calculate recall for the negative class (label = 0)
recall_negative = recall_score(final_df['label'], final_df['final_prediction'], pos_label=0)

# Calculate F1 score for the negative class (label = 0)
f1_negative = f1_score(final_df['label'], final_df['final_prediction'], pos_label=0)

# Calculate the AUC
auc = roc_auc_score(final_df['label'], final_df['final_prediction'])

# Calculate the AUC for males (Geslacht = 1)
auc_male = roc_auc_score(final_df[final_df['Geslacht'] == 1]['label'], final_df[final_df['Geslacht'] == 1]['final_prediction'])

# Calculate the AUC for females (Geslacht = 0)
auc_female = roc_auc_score(final_df[final_df['Geslacht'] == 0]['label'], final_df[final_df['Geslacht'] == 0]['final_prediction'])


# Print the calculated metrics separately for both classes
print(f"Accuracy (Overall): {accuracy_positive:.4f}")
print(f"Precision (Positive): {precision_positive:.4f}")
print(f"Recall (Positive): {recall_positive:.4f}")
print(f"F1 Score (Positive): {f1_positive:.4f}")
print(f"Precision (Negative): {precision_negative:.4f}")
print(f"Recall (Negative): {recall_negative:.4f}")
print(f"F1 Score (Negative): {f1_negative:.4f}")
print(f"AUC: {auc:.4f}")
print(f"AUC (Male): {auc_male:.4f}")
print(f"AUC (Female): {auc_female:.4f}")

Accuracy (Overall): 0.9446
Precision (Positive): 0.8947
Recall (Positive): 0.4789
F1 Score (Positive): 0.6239
Precision (Negative): 0.9473
Recall (Negative): 0.9940
F1 Score (Negative): 0.9701
AUC: 0.7364
AUC (Male): 0.7509
AUC (Female): 0.6759


In [16]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix for the entire dataset
cm = confusion_matrix(final_df['label'], final_df['final_prediction'])

# Calculate TPR and FPR for male (Geslacht = 1)
male_indices = final_df['Geslacht'] == 1
cm_male = confusion_matrix(final_df[male_indices]['label'], final_df[male_indices]['final_prediction'])

tpr_male = cm_male[1, 1] / (cm_male[1, 0] + cm_male[1, 1])
fpr_male = cm_male[0, 1] / (cm_male[0, 0] + cm_male[0, 1])

# Calculate TPR and FPR for female (Geslacht = 0)
female_indices = final_df['Geslacht'] == 0
cm_female = confusion_matrix(final_df[female_indices]['label'], final_df[female_indices]['final_prediction'])

tpr_female = cm_female[1, 1] / (cm_female[1, 0] + cm_female[1, 1])
fpr_female = cm_female[0, 1] / (cm_female[0, 0] + cm_female[0, 1])

# Print the calculated metrics separately for both classes
print(f"TPR (Male): {tpr_male:.4f}")
print(f"TPR (Female): {tpr_female:.4f}")
print(f"FPR (Male): {fpr_male:.4f}")
print(f"FPR (Female): {fpr_female:.4f}")

TPR (Male): 0.5088
TPR (Female): 0.3571
FPR (Male): 0.0069
FPR (Female): 0.0052


In [17]:
final_df['combined'] = final_df['Geslacht'].astype(str) + '_' + final_df['label'].astype(str) + '_' + final_df['final_prediction'].astype(str)

# Get the count of combinations
combination_counts = final_df['combined'].value_counts()

# Print the counts
print("Combined Counts:")
print(combination_counts)

Combined Counts:
combined
0_0_0    379
1_0_0    286
1_1_1     29
1_1_0     28
0_1_0      9
0_1_1      5
1_0_1      2
0_0_1      2
Name: count, dtype: int64
