In [1]:
import pandas as pd

In [48]:
# Read the prediction table from the results CSV into a DataFrame.
predictions_csv = "hpc_space/Results/MBERT/MBERT_augmented_predictions_5.csv"
predictions = pd.read_csv(predictions_csv)

In [49]:
# Keep only the first half of the prediction rows.
# NOTE(review): the reason for discarding the second half is not visible here — confirm.
# .copy() makes final_df an independent DataFrame instead of a slice of `predictions`,
# so adding columns to it later does not raise pandas' SettingWithCopyWarning.
final_df = predictions.iloc[:len(predictions) // 2].copy()

In [50]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Ground-truth labels and model predictions shared by every metric below.
y_true = final_df['label']
y_pred = final_df['final_prediction']

# Overall accuracy. The variable keeps its historical "_positive" suffix, but
# accuracy is not class-specific.
accuracy_positive = accuracy_score(y_true, y_pred)

# Precision / recall / F1 with label 1 treated as the positive class.
precision_positive = precision_score(y_true, y_pred, pos_label=1)
recall_positive = recall_score(y_true, y_pred, pos_label=1)
f1_positive = f1_score(y_true, y_pred, pos_label=1)

# The same three metrics with label 0 treated as the positive class.
precision_negative = precision_score(y_true, y_pred, pos_label=0)
recall_negative = recall_score(y_true, y_pred, pos_label=0)
f1_negative = f1_score(y_true, y_pred, pos_label=0)

# AUC over all rows.
# NOTE(review): the score passed here is `final_prediction`. If that column holds
# hard 0/1 labels rather than probabilities, this value reduces to balanced
# accuracy — confirm whether a probability column should be used instead.
auc = roc_auc_score(y_true, y_pred)

# Row masks per gender: Geslacht == 1 is male, Geslacht == 0 is female.
is_male = final_df['Geslacht'] == 1
is_female = final_df['Geslacht'] == 0

# AUC restricted to each gender subgroup.
auc_male = roc_auc_score(y_true[is_male], y_pred[is_male])
auc_female = roc_auc_score(y_true[is_female], y_pred[is_female])


# Report every metric with four decimals.
print(f"Accuracy (Overall): {accuracy_positive:.4f}")
print(f"Precision (Positive): {precision_positive:.4f}")
print(f"Recall (Positive): {recall_positive:.4f}")
print(f"F1 Score (Positive): {f1_positive:.4f}")
print(f"Precision (Negative): {precision_negative:.4f}")
print(f"Recall (Negative): {recall_negative:.4f}")
print(f"F1 Score (Negative): {f1_negative:.4f}")
print(f"AUC: {auc:.4f}")
print(f"AUC (Male): {auc_male:.4f}")
print(f"AUC (Female): {auc_female:.4f}")

Accuracy (Overall): 0.9054
Precision (Positive): 0.5135
Recall (Positive): 0.2676
F1 Score (Positive): 0.3519
Precision (Negative): 0.9260
Recall (Negative): 0.9731
F1 Score (Negative): 0.9490
AUC: 0.6203
AUC (Male): 0.6423
AUC (Female): 0.5239


In [51]:
from sklearn.metrics import confusion_matrix

def _tpr_fpr(conf_mat):
    """Return (TPR, FPR) from a 2x2 confusion matrix laid out as [[TN, FP], [FN, TP]].

    A rate whose denominator is zero (no actual positives / no actual negatives)
    is returned as NaN instead of relying on a 0/0 division.
    """
    tn, fp, fn, tp = conf_mat.ravel()
    tpr = tp / (tp + fn) if (tp + fn) else float('nan')
    fpr = fp / (fp + tn) if (fp + tn) else float('nan')
    return tpr, fpr


# Confusion matrix for the entire dataset.
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent from the data;
# without it a single-class subgroup yields a smaller matrix and the [1, 1]
# indexing used for TPR/FPR would raise IndexError.
cm = confusion_matrix(final_df['label'], final_df['final_prediction'], labels=[0, 1])

# TPR and FPR for male rows (Geslacht = 1)
male_indices = final_df['Geslacht'] == 1
cm_male = confusion_matrix(
    final_df.loc[male_indices, 'label'],
    final_df.loc[male_indices, 'final_prediction'],
    labels=[0, 1],
)
tpr_male, fpr_male = _tpr_fpr(cm_male)

# TPR and FPR for female rows (Geslacht = 0)
female_indices = final_df['Geslacht'] == 0
cm_female = confusion_matrix(
    final_df.loc[female_indices, 'label'],
    final_df.loc[female_indices, 'final_prediction'],
    labels=[0, 1],
)
tpr_female, fpr_female = _tpr_fpr(cm_female)

# Print the per-gender rates
print(f"TPR (Male): {tpr_male:.4f}")
print(f"TPR (Female): {tpr_female:.4f}")
print(f"FPR (Male): {fpr_male:.4f}")
print(f"FPR (Female): {fpr_female:.4f}")

TPR (Male): 0.3158
TPR (Female): 0.0714
FPR (Male): 0.0312
FPR (Female): 0.0236


In [52]:
# Tag every row with "<Geslacht>_<label>_<final_prediction>" so that each
# gender / truth / prediction combination can be counted.
# DataFrame.assign returns a new frame that is rebound to final_df, so no column
# is written into a slice of another DataFrame (the cause of pandas'
# SettingWithCopyWarning) and the cell can be re-run safely.
final_df = final_df.assign(
    combined=final_df['Geslacht'].astype(str)
    + '_' + final_df['label'].astype(str)
    + '_' + final_df['final_prediction'].astype(str)
)

# Get the count of combinations
combination_counts = final_df['combined'].value_counts()

# Print the counts
print("Combined Counts:")
print(combination_counts)

Combined Counts:
combined
0_0_0    372
1_0_0    279
1_1_0     39
1_1_1     18
0_1_0     13
0_0_1      9
1_0_1      9
0_1_1      1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['combined'] = final_df['Geslacht'].astype(str) + '_' + final_df['label'].astype(str) + '_' + final_df['final_prediction'].astype(str)
