In [None]:
# DOC2VEC TRAINING

In [None]:
import pandas as pd

In [None]:
notes = pd.read_csv('hpc_space/Masterthesisdata/merged.csv')

In [None]:
import os

In [None]:
import numpy as np

import nltk.data
from nltk.tokenize import word_tokenize

import unidecode
import re

In [None]:
# nltk.download('punkt')

In [None]:
# Sentence splitter: the Dutch Punkt model, loaded from the local nltk_data folder
sent_tokenizer = nltk.data.load("nltk_data/tokenizers/punkt/dutch.pickle")


def tokenize(text):
    """Lazily yield one list of word tokens for every sentence found in `text`."""
    yield from map(word_tokenize, sent_tokenizer.tokenize(text))

In [None]:
# nltk.download('stopwords')

In [None]:
# Load the stopword file (one entry per line) into a set for fast membership tests
with open('nltk_data/corpora/stopwords/dutch') as stopword_file:
    dutch_stopwords = {entry for entry in stopword_file.read().splitlines()}

In [None]:
# Initialize the Dutch Snowball stemmer (from nltk); text_to_words uses it when stemming=True
stemmer = nltk.stem.snowball.DutchStemmer()

In [None]:
def text_to_words(text, filter_stopwords=True, stemming=False, filter_periods=False):
    """Normalize a raw note and split it into a flat list of word tokens.

    Parameters
    ----------
    text : str
        Raw note text. Anything that is not a string (e.g. the NaN/None that
        pandas produces for an empty CSV cell) is treated as an empty note.
    filter_stopwords : bool
        Drop tokens that occur in `dutch_stopwords` (applied before stemming).
    stemming : bool
        Replace every remaining token by its `stemmer.stem(...)` form.
    filter_periods : bool
        Drop tokens that are exactly "." (applied last).

    Returns
    -------
    list of str
        The tokens of all sentences, concatenated in order; [] for a missing note.
    """
    # A missing note is a float NaN or None, neither of which has .lower();
    # return an empty token list so callers can skip it instead of crashing.
    if not isinstance(text, str):
        return []

    # Lowercase and transliterate special characters to ASCII (ë => e, etc)
    text = unidecode.unidecode(text.lower())

    # Replace every run of characters other than whitespace, a-z and period by one space
    text = re.sub(r'([^\sa-z\.]|_)+', ' ', text)

    # Remove periods that stand alone between whitespace, then collapse spaces and tabs
    text = re.sub(r'\s\.\s', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub('\t', ' ', text)
    text = re.sub(r' +', ' ', text)

    # Tokenize sentence by sentence and flatten into one list
    words = [word for sentence in tokenize(text) for word in sentence]

    # Filter stopwords
    if filter_stopwords:
        words = [word for word in words if word not in dutch_stopwords]

    # Stemming
    if stemming:
        words = [stemmer.stem(w) for w in words]

    # Filter periods (only tokens that are exactly a single period)
    if filter_periods:
        words = [word for word in words if word != "."]

    return words

In [None]:
def text_to_vectors(notes_df, text_column, paragraph2vec_model, no_reps=10):
    """Infer one document vector per row of `notes_df`.

    Parameters
    ----------
    notes_df : pandas.DataFrame
        Frame holding the preprocessed notes.
    text_column : str
        Column whose cells are strings of tokens separated by single spaces.
    paragraph2vec_model
        Object exposing `vector_size` and `infer_vector(list_of_words)`.
    no_reps : int
        Number of times each note is inferred; the results are averaged
        (the repetitions are meant to cancel out inaccuracies between runs).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(notes_df), vector_size); row k belongs to the k-th
        row of `notes_df` in positional order.
    """
    vector_size = paragraph2vec_model.vector_size

    # Output matrix: one row per note, one column per model dimension
    note_vectors = np.zeros((len(notes_df), vector_size))

    # Fill rows by position rather than by index label, so a frame whose index
    # is not exactly 0..n-1 (filtered, shuffled, duplicated labels) still works.
    for row, note_text in enumerate(notes_df[text_column]):

        # Tokens are separated by single spaces
        note_words = note_text.split(" ")

        # Accumulate the repeated inferences for this note
        note_vec = np.zeros(vector_size)
        for _ in range(no_reps):
            note_vec += paragraph2vec_model.infer_vector(note_words)

        # Store the mean over all repetitions
        note_vectors[row] = note_vec / no_reps

    return note_vectors

In [None]:
# Processed notes and labels are written to two separate files; line k of the
# label file belongs to line k of the notes file.
notes_file_path = os.path.join('hpc_space', 'Masterthesisdata', 'processed_notes', 'notesAll.txt')
label_file_path = os.path.join('hpc_space', 'Masterthesisdata', 'processed_notes', 'labelsAll.txt')

# Make sure the output folder exists before opening the files
os.makedirs(os.path.dirname(notes_file_path), exist_ok=True)

# Open in write mode: re-running this cell rebuilds both files from scratch
# instead of appending a second copy of every note to the training corpus.
with open(notes_file_path, 'w') as notes_file, open(label_file_path, 'w') as label_file:

    # Iterate over records (== notes), pairing each text with its label
    for text, label in zip(notes['text'], notes['outcome']):

        # Convert text to stemmed words without stopwords and periods
        words = text_to_words(text,
                              filter_stopwords=True,
                              stemming=True,
                              filter_periods=True
                             )

        # Only texts with at least 2 words
        if len(words) <= 1:
            continue

        # One note per line, tokens separated by single spaces
        notes_file.write("{}\n".format(' '.join(words)))
        label_file.write("{}\n".format(label))

In [None]:
import gensim

In [None]:
from gensim.models.doc2vec import TaggedDocument
from itertools import islice

class TaggedDocumentStream(object):
    """Iterable of TaggedDocuments read from line-aligned note and label files.

    Every iteration re-opens the files, so the stream can be consumed more
    than once. Line k of a note file is paired with line k of the matching
    label file; the note is split on single spaces into words and the label
    line becomes the document's only tag.
    """

    def __init__(self, note_files, label_files, maxrows=None):
        """Store the file lists and the optional per-file row limit.

        note_files / label_files: lists of paths, paired by position.
        maxrows: maximum number of lines read from each file pair
                 (None means no limit).
        """
        self.note_files = note_files
        self.label_files = label_files
        self.maxrows = maxrows

    def yield_td(self, note_file, label_file):
        """Yield one TaggedDocument per line pair of the two given files."""

        # Distinct names for the handles keep the path arguments intact
        with open(note_file) as notes_fh, open(label_file) as labels_fh:

            # zip stops at the shorter file; row_number counts the pairs read so far
            for row_number, (note, label) in enumerate(zip(notes_fh, labels_fh)):

                # Stop once maxrows pairs were yielded (never true when maxrows is None)
                if row_number == self.maxrows:
                    break

                # Strip only the line terminator: slicing off the last character
                # would eat a real character when the final line has no newline.
                yield TaggedDocument(note.rstrip("\n").split(" "), [label.rstrip("\n")])

    def __iter__(self):
        """Iterate over all note/label file pairs in order."""
        for note_file, label_file in zip(self.note_files, self.label_files):
            yield from self.yield_td(note_file, label_file)

In [None]:
from gensim.models.doc2vec import Doc2Vec

In [None]:
note_file = "hpc_space/Masterthesisdata/processed_notes/notesAll.txt"
label_file = "hpc_space/Masterthesisdata/processed_notes/labelsAll.txt"
model_path = "models/paragraph2vec_model_all"

# Create the output folder up front, so a missing directory cannot make the
# save fail only after the whole training run has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Define TaggedDocumentStream over the single preprocessed notes/labels file pair
notes_stream = TaggedDocumentStream(note_files=[note_file], label_files=[label_file])

# Train paragraph2vec model (300-dimensional vectors, 20 epochs)
paragraph2vec_model = Doc2Vec(notes_stream,
                             epochs=20,
                             min_count=20,
                             dm=1,
                             sample=1e-3,
                             vector_size=300,
                             window=2)

# Save model to disk
paragraph2vec_model.save(model_path)

In [None]:
# SVM TRAINING

In [None]:
import pandas as pd
import os
import numpy as np

import nltk.data
from nltk.tokenize import word_tokenize
import sklearn
import unidecode
import re
from gensim.models.doc2vec import TaggedDocument
from itertools import islice
from gensim.models.doc2vec import Doc2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import os
from scipy import stats


import gensim
from gensim.models import Doc2Vec

import joblib
from sklearn.metrics import roc_auc_score, roc_curve

import scipy.stats

from sklearn import svm
import joblib
from sklearn.model_selection import GridSearchCV, GroupKFold

from nltk.corpus.reader import conll

In [None]:
# Sentence splitter: the Dutch Punkt model, loaded from the local nltk_data folder
sent_tokenizer = nltk.data.load("nltk_data/tokenizers/punkt/dutch.pickle")


def tokenize(text):
    """Lazily yield one list of word tokens for every sentence found in `text`."""
    yield from map(word_tokenize, sent_tokenizer.tokenize(text))


# Load the stopword file (one entry per line) into a set for fast membership tests
with open('nltk_data/corpora/stopwords/dutch') as stopword_file:
    dutch_stopwords = {entry for entry in stopword_file.read().splitlines()}

# Dutch Snowball stemmer (from nltk), used by text_to_words when stemming=True
stemmer = nltk.stem.snowball.DutchStemmer()

In [None]:
#################################FUNCTIONS ####################################################################
def text_to_words(text, filter_stopwords=True, stemming=False, filter_periods=False):
    """Normalize a raw note and split it into a flat list of word tokens.

    Parameters
    ----------
    text : str
        Raw note text. Anything that is not a string (e.g. the NaN/None that
        pandas produces for an empty CSV cell) is treated as an empty note.
    filter_stopwords : bool
        Drop tokens that occur in `dutch_stopwords` (applied before stemming).
    stemming : bool
        Replace every remaining token by its `stemmer.stem(...)` form.
    filter_periods : bool
        Drop tokens that are exactly "." (applied last).

    Returns
    -------
    list of str
        The tokens of all sentences, concatenated in order; [] for a missing note.
    """
    # A missing note is a float NaN or None, neither of which has .lower();
    # return an empty token list so callers can skip it instead of crashing.
    if not isinstance(text, str):
        return []

    # Lowercase and transliterate special characters to ASCII (ë => e, etc)
    text = unidecode.unidecode(text.lower())

    # Replace every run of characters other than whitespace, a-z and period by one space
    text = re.sub(r'([^\sa-z\.]|_)+', ' ', text)

    # Remove periods that stand alone between whitespace, then collapse spaces and tabs
    text = re.sub(r'\s\.\s', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub('\t', ' ', text)
    text = re.sub(r' +', ' ', text)

    # Tokenize sentence by sentence and flatten into one list
    words = [word for sentence in tokenize(text) for word in sentence]

    # Filter stopwords
    if filter_stopwords:
        words = [word for word in words if word not in dutch_stopwords]

    # Stemming
    if stemming:
        words = [stemmer.stem(w) for w in words]

    # Filter periods (only tokens that are exactly a single period)
    if filter_periods:
        words = [word for word in words if word != "."]

    return words

######################################INFERENCE ##############################################
def text_to_vectors(notes_df, text_column, paragraph2vec_model, no_reps=10):
    """Infer one document vector per row of `notes_df`.

    Parameters
    ----------
    notes_df : pandas.DataFrame
        Frame holding the preprocessed notes.
    text_column : str
        Column whose cells are strings of tokens separated by single spaces.
    paragraph2vec_model
        Object exposing `vector_size` and `infer_vector(list_of_words)`.
    no_reps : int
        Number of times each note is inferred; the results are averaged
        (the repetitions are meant to cancel out inaccuracies between runs).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(notes_df), vector_size); row k belongs to the k-th
        row of `notes_df` in positional order.
    """
    vector_size = paragraph2vec_model.vector_size

    # Output matrix: one row per note, one column per model dimension
    note_vectors = np.zeros((len(notes_df), vector_size))

    # Fill rows by position rather than by index label, so a frame whose index
    # is not exactly 0..n-1 (filtered, shuffled, duplicated labels) still works.
    for row, note_text in enumerate(notes_df[text_column]):

        # Tokens are separated by single spaces
        note_words = note_text.split(" ")

        # Accumulate the repeated inferences for this note
        note_vec = np.zeros(vector_size)
        for _ in range(no_reps):
            note_vec += paragraph2vec_model.infer_vector(note_words)

        # Store the mean over all repetitions
        note_vectors[row] = note_vec / no_reps

    return note_vectors

In [None]:
# Load the Doc2Vec model from the path the DOC2VEC TRAINING section saved it to
paragraph2vec_model = Doc2Vec.load("models/paragraph2vec_model_all")

In [None]:
engineered_notes_train = pd.read_csv('hpc_space/MBERT1/train_data_augmented_1.csv')

# Preprocess every note (stopwords and periods removed, tokens stemmed) and
# store the result as one space-joined string per row
engineered_notes_train['words_stemmed'] = [
    ' '.join(text_to_words(raw_text, filter_stopwords=True, stemming=True, filter_periods=True))
    for raw_text in engineered_notes_train['text']
]

# Infer a document vector for each preprocessed note (10 repetitions, averaged)
note_vectors = text_to_vectors(engineered_notes_train, 'words_stemmed', paragraph2vec_model, no_reps=10)

# Append the vector dimensions as extra columns next to the original ones
engineered_notes_train = pd.concat([engineered_notes_train, pd.DataFrame(note_vectors)], axis=1)

In [None]:
engineered_notes_test = pd.read_csv("hpc_space/MBERT1/test_data_mid_1.csv")

# Preprocess every note (stopwords and periods removed, tokens stemmed) and
# store the result as one space-joined string per row
engineered_notes_test['words_stemmed'] = [
    ' '.join(text_to_words(raw_text, filter_stopwords=True, stemming=True, filter_periods=True))
    for raw_text in engineered_notes_test['text']
]

# Infer a document vector for each preprocessed note (10 repetitions, averaged)
note_vectors_test = text_to_vectors(engineered_notes_test, 'words_stemmed', paragraph2vec_model, no_reps=10)

# Append the vector dimensions as extra columns next to the original ones
engineered_notes_test = pd.concat([engineered_notes_test, pd.DataFrame(note_vectors_test)], axis=1)

In [None]:
# Persist the engineered features. index=False leaves out the positional row
# index, which would otherwise come back as a meaningless "Unnamed: 0" column
# on reload; this matches how the predictions file is written further down.
engineered_notes_train.to_csv("engineered_notes_train_all_trained_5.csv", index=False)
engineered_notes_test.to_csv("engineered_notes_test_all_trained_5.csv", index=False)

In [None]:
# Reload the engineered features from disk. After the CSV round trip the
# vector columns are labelled with strings ('0', '1', ...), which is the form
# the feature selection below looks them up by.
engineered_notes_train = pd.read_csv("engineered_notes_train_all_trained_5.csv")
engineered_notes_test = pd.read_csv("engineered_notes_test_all_trained_5.csv")

In [None]:
# Column labels of the 300 document-vector dimensions ('0' ... '299') and of
# the two columns kept alongside them (outcome label and 'Geslacht')
vector_columns = list(map(str, range(300)))
target_columns = ['outcome', 'Geslacht']

# Training features (plain numpy array) and targets (DataFrame)
X_train = engineered_notes_train[vector_columns].values
y_train = engineered_notes_train[target_columns]

# Testing features and targets, selected the same way
X_test = engineered_notes_test[vector_columns].values
y_test = engineered_notes_test[target_columns]

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Hyperparameter grid: three values of C for an RBF-kernel SVM. The 'clf__'
# prefix routes each parameter to the pipeline step named "clf".
svm_params = [{'clf__C': [0.1, 1, 10], 'clf__kernel': ['rbf']}]

# Standardize the features, then classify with an SVC that can output probabilities
pipeline_steps = [
    ("preprocesser", StandardScaler()),
    ("clf", SVC(probability=True, random_state=0)),
]
pipeline = Pipeline(steps=pipeline_steps)

# 3-fold cross-validated search, selecting on macro-averaged F1.
# NOTE(review): the training file name suggests augmented data; if augmented
# copies of one note can land in different folds, the CV score is optimistic
# -- confirm whether a grouped split is needed.
grid = GridSearchCV(estimator=pipeline, param_grid=svm_params, scoring='f1_macro', cv=3, verbose=1)
grid.fit(X_train, y_train['outcome'])

# Keep the best pipeline found by the search
svm_model = grid.best_estimator_

In [None]:
# Score the test set: column 1 of predict_proba (presumably the positive
# class, label 1 -- verify against svm_model.classes_) and the hard labels
positive_probability = svm_model.predict_proba(X_test)[:, 1]
predicted_label = svm_model.predict(X_test)

# Collect predictions, true labels and 'Geslacht' per test note
predictions = pd.DataFrame({
    'probability': positive_probability,
    'pred_label': predicted_label,
    'true_label': y_test['outcome'],
    'Geslacht': y_test['Geslacht'],
    'fold_number': 1
})

# Write the predictions as a semicolon-separated file without the row index
predictions.to_csv("predictions_alltrained_5.csv", sep=";", index=False)

# Persist the fitted pipeline next to the predictions
model_filename = "alltrained_model5.joblib"
joblib.dump(svm_model, model_filename)

In [None]:
# Reload the semicolon-separated predictions written by the previous cell
final_df = pd.read_csv("predictions_alltrained_5.csv", sep=";")

In [None]:
# EVALUATION

In [None]:
# Use the column names the evaluation cells below expect
final_df = final_df.rename(columns={
    "pred_label": "final_prediction",
    "true_label": "label",
})

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Columns used by every metric below
y_true = final_df['label']
y_pred = final_df['final_prediction']
y_score = final_df['probability']

# Overall accuracy (over both classes; the variable name is kept for compatibility)
accuracy_positive = accuracy_score(y_true, y_pred)

# Precision, recall and F1 for the positive class (label = 1)
precision_positive = precision_score(y_true, y_pred, pos_label=1)
recall_positive = recall_score(y_true, y_pred, pos_label=1)
f1_positive = f1_score(y_true, y_pred, pos_label=1)

# Precision, recall and F1 for the negative class (label = 0)
precision_negative = precision_score(y_true, y_pred, pos_label=0)
recall_negative = recall_score(y_true, y_pred, pos_label=0)
f1_negative = f1_score(y_true, y_pred, pos_label=0)

# AUC must be computed from the continuous score, not from the hard 0/1
# prediction: with hard labels the ROC curve has a single operating point and
# the "AUC" no longer measures ranking quality.
auc = roc_auc_score(y_true, y_score)

# AUC for males (Geslacht = 1)
male_mask = final_df['Geslacht'] == 1
auc_male = roc_auc_score(y_true[male_mask], y_score[male_mask])

# AUC for females (Geslacht = 0)
female_mask = final_df['Geslacht'] == 0
auc_female = roc_auc_score(y_true[female_mask], y_score[female_mask])


# Print the calculated metrics separately for both classes
print(f"Accuracy (Overall): {accuracy_positive:.4f}")
print(f"Precision (Positive): {precision_positive:.4f}")
print(f"Recall (Positive): {recall_positive:.4f}")
print(f"F1 Score (Positive): {f1_positive:.4f}")
print(f"Precision (Negative): {precision_negative:.4f}")
print(f"Recall (Negative): {recall_negative:.4f}")
print(f"F1 Score (Negative): {f1_negative:.4f}")
print(f"AUC: {auc:.4f}")
print(f"AUC (Male): {auc_male:.4f}")
print(f"AUC (Female): {auc_female:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order to [0, 1] in every confusion matrix: without it a
# subgroup that lacks one of the classes yields a smaller matrix and the
# [row, col] lookups below fail. Rows are true labels, columns predictions.
class_labels = [0, 1]

# Confusion matrix for the entire dataset
cm = confusion_matrix(final_df['label'], final_df['final_prediction'], labels=class_labels)

# Calculate TPR and FPR for male (Geslacht = 1)
male_indices = final_df['Geslacht'] == 1
cm_male = confusion_matrix(final_df.loc[male_indices, 'label'],
                           final_df.loc[male_indices, 'final_prediction'],
                           labels=class_labels)

# TPR = TP / (FN + TP), FPR = FP / (TN + FP)
tpr_male = cm_male[1, 1] / (cm_male[1, 0] + cm_male[1, 1])
fpr_male = cm_male[0, 1] / (cm_male[0, 0] + cm_male[0, 1])

# Calculate TPR and FPR for female (Geslacht = 0)
female_indices = final_df['Geslacht'] == 0
cm_female = confusion_matrix(final_df.loc[female_indices, 'label'],
                             final_df.loc[female_indices, 'final_prediction'],
                             labels=class_labels)

tpr_female = cm_female[1, 1] / (cm_female[1, 0] + cm_female[1, 1])
fpr_female = cm_female[0, 1] / (cm_female[0, 0] + cm_female[0, 1])

# Print the calculated metrics separately for both classes
print(f"TPR (Male): {tpr_male:.4f}")
print(f"TPR (Female): {tpr_female:.4f}")
print(f"FPR (Male): {fpr_male:.4f}")
print(f"FPR (Female): {fpr_female:.4f}")

In [None]:
# Build a "<Geslacht>_<label>_<final_prediction>" key for every row
sex_text, label_text, prediction_text = (
    final_df[column].astype(str)
    for column in ('Geslacht', 'label', 'final_prediction')
)
final_df['combined'] = sex_text + '_' + label_text + '_' + prediction_text

# Count how often each combination occurs
combination_counts = final_df['combined'].value_counts()

# Show the counts
print("Combined Counts:")
print(combination_counts)