In [1]:
# SVM LOADING

In [2]:
import pandas as pd
import os
import numpy as np

import nltk.data
from nltk.tokenize import word_tokenize
import sklearn
import unidecode
import re
from gensim.models.doc2vec import TaggedDocument
from itertools import islice
from gensim.models.doc2vec import Doc2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import os
from scipy import stats


import gensim
from gensim.models import Doc2Vec

import joblib
from sklearn.metrics import roc_auc_score, roc_curve

import scipy.stats

from sklearn import svm
import joblib
from sklearn.model_selection import GridSearchCV, GroupKFold

from nltk.corpus.reader import conll

In [3]:
# Define tokenization procedure
# Sentence splitter loaded from the local nltk_data folder (Punkt model for Dutch, per the path).
# NOTE(review): the model is a pickle file; only load it from a trusted location.
sent_tokenizer = nltk.data.load("nltk_data/tokenizers/punkt/dutch.pickle")

def tokenize(text):
    """Lazily yield one list of word tokens per sentence found in `text`.

    The text is first split into sentences with `sent_tokenizer`; every
    sentence is then split into words with nltk's `word_tokenize`.
    """
    for sentence in sent_tokenizer.tokenize(text):
        yield word_tokenize(sentence)

# Read stopwords (one word per line). The encoding is passed explicitly so that
# the result does not depend on the platform's default locale encoding.
with open('nltk_data/corpora/stopwords/dutch', encoding='utf-8') as f:
    dutch_stopwords = set(f.read().splitlines())

# Initialize stemmer (using package nltk)
stemmer = nltk.stem.snowball.DutchStemmer()

In [4]:
#################################FUNCTIONS ####################################################################
# Preprocessing for text
def text_to_words(text, filter_stopwords=True, stemming=False, filter_periods=False):
    """Normalise a raw note and return it as a flat list of tokens.

    Parameters
    ----------
    text : str
        Raw note text.
    filter_stopwords : bool, default True
        Drop tokens that occur in `dutch_stopwords`.
    stemming : bool, default False
        Replace every token by its stem (via `stemmer`).
    filter_periods : bool, default False
        Drop tokens that consist of a single period.

    Returns
    -------
    list of str
        Tokens of all sentences, in order of appearance.
    """
    # Fold to lowercase ASCII (e.g. "ë" becomes "e")
    normalized = unidecode.unidecode(text.lower())

    # Ordered clean-up passes; every match is replaced by a single space
    cleanup_patterns = (
        r'([^\sa-z\.]|_)+',  # anything that is not whitespace, a-z or a period
        r'\s\.\s',           # a period standing alone between whitespace
        r' +',               # runs of spaces
        '\t',                # tabs
        r' +',               # runs of spaces produced by the tab replacement
    )
    for pattern in cleanup_patterns:
        normalized = re.sub(pattern, ' ', normalized)

    # Sentence-split and word-tokenize, flattening into one token list
    tokens = []
    for sentence_tokens in tokenize(normalized):
        tokens.extend(sentence_tokens)

    # Stopwords are removed before stemming, so unstemmed forms are matched
    if filter_stopwords:
        tokens = [token for token in tokens if token not in dutch_stopwords]

    if stemming:
        tokens = list(map(stemmer.stem, tokens))

    if filter_periods:
        tokens = [token for token in tokens if token != "."]

    return tokens

######################################INFERENCE ##############################################
def text_to_vectors(notes_df, text_column, paragraph2vec_model, no_reps=10):
    """Convert the texts in a dataframe column to averaged paragraph vectors.

    Parameters
    ----------
    notes_df : pandas.DataFrame
        Dataframe holding one note per row.
    text_column : str
        Name of the column with the preprocessed text; words are expected to
        be separated by single spaces.
    paragraph2vec_model : object
        Model exposing `vector_size` and `infer_vector(words)`.
    no_reps : int, default 10
        Number of inferences averaged per note to cancel out inaccuracies of
        a single inference.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(notes_df), paragraph2vec_model.vector_size); row k
        belongs to the k-th row of `notes_df` (positional order).

    Raises
    ------
    ValueError
        If `no_reps` is smaller than 1 (the average would be undefined).
    """
    if no_reps < 1:
        raise ValueError("no_reps must be at least 1")

    vector_size = paragraph2vec_model.vector_size

    # Output is a matrix with rows equal to number of notes, and columns equal to paragraph2vec model size
    note_vectors = np.zeros((len(notes_df), vector_size))

    # Iterate by position rather than by index label: the output array is
    # positional, so labels of a filtered, shuffled or otherwise non-0..n-1
    # index must not be used to address its rows.
    for row_pos, note_text in enumerate(notes_df[text_column]):

        # Words are split by whitespaces
        note_words = note_text.split(" ")

        # Accumulate the repeated inferences for this note
        note_vec = np.zeros(vector_size)
        for _ in range(no_reps):
            note_vec += paragraph2vec_model.infer_vector(note_words)

        # Store the mean over all repetitions
        note_vectors[row_pos] = note_vec / no_reps

    # Return output
    return note_vectors

In [None]:
# Load the saved gensim Doc2Vec model from disk; it is passed to text_to_vectors
# below to infer a vector for every note.
paragraph2vec_model = Doc2Vec.load("models/paragraph2vec_model_all")

In [None]:
# Read the augmented test notes
engineered_notes_test = pd.read_csv("hpc_space/test_data_5_augmented.csv")

# Preprocess every note (stopwords removed, stemmed, periods dropped) and store
# the resulting tokens as one whitespace-joined string per note
engineered_notes_test['words_stemmed'] = (
    engineered_notes_test['text']
    .apply(text_to_words, filter_stopwords=True, stemming=True, filter_periods=True)
    .apply(' '.join)
)

# Infer one vector per preprocessed note, averaged over 10 repetitions
note_vectors_test = text_to_vectors(engineered_notes_test, 'words_stemmed', paragraph2vec_model, 10)

# Append the vector components as extra columns next to the original columns
engineered_notes_test = pd.concat(
    [engineered_notes_test, pd.DataFrame(note_vectors_test)],
    axis=1,
)

In [None]:
# Persist the engineered test notes (original columns plus vector columns).
# With the default index=True the row index is written as an extra, unnamed
# first column.
engineered_notes_test.to_csv("hpc_space/SVM Doc2Vec/New_engineered_test_and_val/engineered_notes_newaugmentedtest_5.csv")

In [None]:
# Read the validation notes
engineered_notes_val = pd.read_csv("hpc_space/val_data_5.csv")

# Preprocess every note (stopwords removed, stemmed, periods dropped) and store
# the resulting tokens as one whitespace-joined string per note
engineered_notes_val['words_stemmed'] = (
    engineered_notes_val['text']
    .apply(text_to_words, filter_stopwords=True, stemming=True, filter_periods=True)
    .apply(' '.join)
)

# Infer one vector per preprocessed note, averaged over 10 repetitions
note_vectors_val = text_to_vectors(engineered_notes_val, 'words_stemmed', paragraph2vec_model, 10)

# Append the vector components as extra columns next to the original columns
engineered_notes_val = pd.concat(
    [engineered_notes_val, pd.DataFrame(note_vectors_val)],
    axis=1,
)

In [None]:
# Persist the engineered validation notes (original columns plus vector columns).
# With the default index=True the row index is written as an extra, unnamed
# first column.
engineered_notes_val.to_csv("hpc_space/SVM Doc2Vec/New_engineered_test_and_val/engineered_notes_newval_5.csv")

In [82]:
# Reload engineered notes from disk for scoring.
# NOTE(review): the file read here is the *validation* export
# (engineered_notes_newval_5.csv) although it is bound to the name
# engineered_notes_test; the cells below therefore score the validation set.
engineered_notes_test = pd.read_csv("hpc_space/SVM Doc2Vec/New_engineered_test_and_val/engineered_notes_newval_5.csv")

In [83]:
# Feature matrix: the columns named '0' .. '299' (after the CSV round trip the
# vector columns carry string names; 300 is presumably the Doc2Vec vector size)
vector_columns = list(map(str, range(300)))
X_test = engineered_notes_test[vector_columns].values

# Targets for evaluation: the outcome label together with the 'Geslacht' column
y_test = engineered_notes_test[['outcome', 'Geslacht']]

In [84]:
import joblib

# Load the previously trained model from a joblib file in the working directory.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading; only load model files from a trusted source.
model_filename = "alltrained_neutralized_5.joblib"
svm_model = joblib.load(model_filename)

In [85]:
# Score the evaluation set: positive-class probability and hard prediction per row
positive_class_proba = svm_model.predict_proba(X_test)[:, 1]
hard_predictions = svm_model.predict(X_test)

# Collect scores, true labels and the 'Geslacht' column in one frame
predictions = pd.DataFrame({
    'predicted_probabilities': positive_class_proba,
    'final_prediction': hard_predictions,
    'label': y_test['outcome'],
    'Geslacht': y_test['Geslacht'],
    'fold_number': 1
})

# Write the predictions as a semicolon-separated file without the row index
predictions_path = "hpc_space/Results/NEW RESULTS/NEW RESULTS/Doc2Vec SVM/Genderneutral/SVMDoc2Vec_genderneutral5_val_predictions.csv"
predictions.to_csv(predictions_path, sep=";", index=False)

In [87]:
# Load stored predictions (semicolon-separated) for evaluation.
# NOTE(review): this reads the "genderneutral3" predictions, while the cell above
# writes "genderneutral5" — confirm that this is the file meant to be evaluated.
final_df = pd.read_csv("hpc_space/Results/NEW RESULTS/NEW RESULTS/Doc2Vec SVM/Genderneutral/SVMDoc2Vec_genderneutral3_val_predictions.csv", sep=";")

FileNotFoundError: [Errno 2] No such file or directory: 'hpc_space/Results/NEW RESULTS/NEW RESULTS/Doc2Vec SVM/Orig/hpc_space/Results/NEW RESULTS/NEW RESULTS/Doc2Vec SVM/Genderneutral/SVMDoc2Vec_genderneutral5_val_predictions.csv'

In [None]:
# EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Classification metrics for the predictions stored in final_df.
# Threshold-based metrics (accuracy, precision, recall, F1) use the hard
# predictions; AUC is a ranking metric and is computed from the predicted
# positive-class probabilities instead.
# Assumes final_df has the columns written by the prediction cell
# ('predicted_probabilities', 'final_prediction', 'label', 'Geslacht').
y_true = final_df['label']
y_pred = final_df['final_prediction']
y_score = final_df['predicted_probabilities']

# Overall accuracy (not class-specific; the variable name is kept for compatibility)
accuracy_positive = accuracy_score(y_true, y_pred)

# Precision, recall and F1 score for the positive class (label = 1)
precision_positive = precision_score(y_true, y_pred, pos_label=1)
recall_positive = recall_score(y_true, y_pred, pos_label=1)
f1_positive = f1_score(y_true, y_pred, pos_label=1)

# Precision, recall and F1 score for the negative class (label = 0)
precision_negative = precision_score(y_true, y_pred, pos_label=0)
recall_negative = recall_score(y_true, y_pred, pos_label=0)
f1_negative = f1_score(y_true, y_pred, pos_label=0)

# Calculate the AUC from the continuous scores. Passing hard 0/1 predictions
# would reduce the ROC curve to a single operating point and misreport the
# ranking quality of the model.
auc = roc_auc_score(y_true, y_score)

# Calculate the AUC for males (Geslacht = 1)
is_male = final_df['Geslacht'] == 1
auc_male = roc_auc_score(y_true[is_male], y_score[is_male])

# Calculate the AUC for females (Geslacht = 0)
is_female = final_df['Geslacht'] == 0
auc_female = roc_auc_score(y_true[is_female], y_score[is_female])


# Print the calculated metrics separately for both classes
print(f"Accuracy (Overall): {accuracy_positive:.4f}")
print(f"Precision (Positive): {precision_positive:.4f}")
print(f"Recall (Positive): {recall_positive:.4f}")
print(f"F1 Score (Positive): {f1_positive:.4f}")
print(f"Precision (Negative): {precision_negative:.4f}")
print(f"Recall (Negative): {recall_negative:.4f}")
print(f"F1 Score (Negative): {f1_negative:.4f}")
print(f"AUC: {auc:.4f}")
print(f"AUC (Male): {auc_male:.4f}")
print(f"AUC (Female): {auc_female:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrices are built with labels=[0, 1] so they are always 2x2, even
# when a subgroup contains only one class; without it the [1, 1] / [0, 1]
# indexing below raises an IndexError on a smaller matrix.
# Layout: rows are true labels, columns are predicted labels.

# Calculate the confusion matrix for the entire dataset
cm = confusion_matrix(final_df['label'], final_df['final_prediction'], labels=[0, 1])

# Calculate TPR and FPR for male (Geslacht = 1)
male_indices = final_df['Geslacht'] == 1
cm_male = confusion_matrix(
    final_df.loc[male_indices, 'label'],
    final_df.loc[male_indices, 'final_prediction'],
    labels=[0, 1],
)

# TPR = TP / (FN + TP), FPR = FP / (TN + FP)
tpr_male = cm_male[1, 1] / (cm_male[1, 0] + cm_male[1, 1])
fpr_male = cm_male[0, 1] / (cm_male[0, 0] + cm_male[0, 1])

# Calculate TPR and FPR for female (Geslacht = 0)
female_indices = final_df['Geslacht'] == 0
cm_female = confusion_matrix(
    final_df.loc[female_indices, 'label'],
    final_df.loc[female_indices, 'final_prediction'],
    labels=[0, 1],
)

tpr_female = cm_female[1, 1] / (cm_female[1, 0] + cm_female[1, 1])
fpr_female = cm_female[0, 1] / (cm_female[0, 0] + cm_female[0, 1])

# Print the calculated metrics separately for both classes
print(f"TPR (Male): {tpr_male:.4f}")
print(f"TPR (Female): {tpr_female:.4f}")
print(f"FPR (Male): {fpr_male:.4f}")
print(f"FPR (Female): {fpr_female:.4f}")

In [None]:
# Build a "<Geslacht>_<label>_<prediction>" key per row, e.g. "1_0_1"
final_df['combined'] = final_df['Geslacht'].astype(str) + '_' + final_df['label'].astype(str) + '_' + final_df['final_prediction'].astype(str)

# Get the count of combinations
combination_counts = final_df['combined'].value_counts()

# Define the desired order of combinations
desired_order = ['0_0_0', '1_0_0', '0_0_1', '1_0_1', '0_1_0', '1_1_0', '0_1_1', '1_1_1']

# Order the counts by the custom order. reindex (rather than .loc) is used so
# that a combination that never occurs is reported as 0 instead of raising a
# KeyError.
sorted_combination_counts = combination_counts.reindex(desired_order, fill_value=0)

# Print the sorted counts
print("Combined Counts:")
print(sorted_combination_counts)

# # Print the counts
# print("Combined Counts:")
# print(combination_counts)

In [None]:
# Split the DataFrame into first_half and second_half.
# NOTE(review): the row-wise comparison below presumably relies on row k of the
# second half being the counterpart of row k of the first half — confirm against
# how the file was assembled.
half_len = len(final_df) // 2

# reset_index(drop=True) returns fresh frames; resetting the iloc slices in place
# would operate on slices of final_df and can trigger SettingWithCopyWarning
# when columns are added to the halves later.
first_half = final_df.iloc[:half_len].reset_index(drop=True)
second_half = final_df.iloc[half_len:].reset_index(drop=True)

# The halves can only be compared row by row when they have equal length
if len(first_half) != len(second_half):
    raise ValueError(
        f"final_df has an odd number of rows ({len(final_df)}); "
        "cannot compare two equally sized halves"
    )

# Compare the 'final_prediction' values in the two halves and calculate value counts
mismatch_counts = (first_half['final_prediction'] != second_half['final_prediction']).value_counts()

# Create a DataFrame to display the mismatch counts
mismatches = pd.DataFrame({'Mismatches': mismatch_counts})
print(mismatches)

In [None]:
# From here on final_df refers to the first half of the predictions only.
# NOTE(review): this rebinds final_df, so re-running the split cell above after
# this one would halve the data again; reload final_df before re-running.
final_df = first_half

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Classification metrics for the predictions stored in first_half.
# Threshold-based metrics (accuracy, precision, recall, F1) use the hard
# predictions; AUC is a ranking metric and is computed from the predicted
# positive-class probabilities instead.
# Assumes first_half has the columns written by the prediction cell
# ('predicted_probabilities', 'final_prediction', 'label', 'Geslacht').
y_true = first_half['label']
y_pred = first_half['final_prediction']
y_score = first_half['predicted_probabilities']

# Overall accuracy (not class-specific; the variable name is kept for compatibility)
accuracy_positive = accuracy_score(y_true, y_pred)

# Precision, recall and F1 score for the positive class (label = 1)
precision_positive = precision_score(y_true, y_pred, pos_label=1)
recall_positive = recall_score(y_true, y_pred, pos_label=1)
f1_positive = f1_score(y_true, y_pred, pos_label=1)

# Precision, recall and F1 score for the negative class (label = 0)
precision_negative = precision_score(y_true, y_pred, pos_label=0)
recall_negative = recall_score(y_true, y_pred, pos_label=0)
f1_negative = f1_score(y_true, y_pred, pos_label=0)

# Calculate the AUC from the continuous scores. Passing hard 0/1 predictions
# would reduce the ROC curve to a single operating point and misreport the
# ranking quality of the model.
auc = roc_auc_score(y_true, y_score)

# Calculate the AUC for males (Geslacht = 1)
is_male = first_half['Geslacht'] == 1
auc_male = roc_auc_score(y_true[is_male], y_score[is_male])

# Calculate the AUC for females (Geslacht = 0)
is_female = first_half['Geslacht'] == 0
auc_female = roc_auc_score(y_true[is_female], y_score[is_female])


# Print the calculated metrics separately for both classes
print(f"Accuracy (Overall): {accuracy_positive:.4f}")
print(f"Precision (Positive): {precision_positive:.4f}")
print(f"Recall (Positive): {recall_positive:.4f}")
print(f"F1 Score (Positive): {f1_positive:.4f}")
print(f"Precision (Negative): {precision_negative:.4f}")
print(f"Recall (Negative): {recall_negative:.4f}")
print(f"F1 Score (Negative): {f1_negative:.4f}")
print(f"AUC: {auc:.4f}")
print(f"AUC (Male): {auc_male:.4f}")
print(f"AUC (Female): {auc_female:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrices are built with labels=[0, 1] so they are always 2x2, even
# when a subgroup contains only one class; without it the [1, 1] / [0, 1]
# indexing below raises an IndexError on a smaller matrix.
# Layout: rows are true labels, columns are predicted labels.

# Calculate the confusion matrix for the entire dataset
cm = confusion_matrix(first_half['label'], first_half['final_prediction'], labels=[0, 1])

# Calculate TPR and FPR for male (Geslacht = 1)
male_indices = first_half['Geslacht'] == 1
cm_male = confusion_matrix(
    first_half.loc[male_indices, 'label'],
    first_half.loc[male_indices, 'final_prediction'],
    labels=[0, 1],
)

# TPR = TP / (FN + TP), FPR = FP / (TN + FP)
tpr_male = cm_male[1, 1] / (cm_male[1, 0] + cm_male[1, 1])
fpr_male = cm_male[0, 1] / (cm_male[0, 0] + cm_male[0, 1])

# Calculate TPR and FPR for female (Geslacht = 0)
female_indices = first_half['Geslacht'] == 0
cm_female = confusion_matrix(
    first_half.loc[female_indices, 'label'],
    first_half.loc[female_indices, 'final_prediction'],
    labels=[0, 1],
)

tpr_female = cm_female[1, 1] / (cm_female[1, 0] + cm_female[1, 1])
fpr_female = cm_female[0, 1] / (cm_female[0, 0] + cm_female[0, 1])

# Print the calculated metrics separately for both classes
print(f"TPR (Male): {tpr_male:.4f}")
print(f"TPR (Female): {tpr_female:.4f}")
print(f"FPR (Male): {fpr_male:.4f}")
print(f"FPR (Female): {fpr_female:.4f}")

In [None]:
# Build a "<Geslacht>_<label>_<prediction>" key per row, e.g. "1_0_1"
first_half['combined'] = first_half['Geslacht'].astype(str) + '_' + first_half['label'].astype(str) + '_' + first_half['final_prediction'].astype(str)

# Get the count of combinations
combination_counts = first_half['combined'].value_counts()

# All combinations that should be reported, in display order
combinations = ['0_0_0', '1_0_0', '0_0_1', '1_0_1', '0_1_0', '1_1_0', '0_1_1', '1_1_1']

# Create a DataFrame with the desired combinations
desired_df = pd.DataFrame({'combined': combinations})

# Turn the counts into a two-column frame with explicit names. The names that
# value_counts() assigns differ between pandas versions, so they are set here
# to keep the key column ('combined') and the count column ('count') apart.
counts_df = combination_counts.rename_axis('combined').reset_index(name='count')

# Merge with the actual counts (use left join to keep all combinations)
merged_counts = desired_df.merge(counts_df, how='left', on='combined')

# Fill NaN counts with 0 (for combinations not present in the data). The fill is
# applied to the count column; the key column holds strings such as '0_0_0'
# and cannot be cast to int.
merged_counts['count'] = merged_counts['count'].fillna(0).astype(int)

# Print the combined counts
print("Combined Counts:")
print(merged_counts)