In [None]:
import pandas as pd
df = pd.read_csv('df_with_VIAF_embeddings_aug2.csv')

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, classification_report

df['status'] = df['status'].str.replace('zombie', '1')
df['status'] = df['status'].fillna('0')
df['status'] = df['status'].str.replace('not-born', '2')
df['status'] = df['status'].str.replace('not_born', '2')
df['status'] = df['status'].str.replace('not born', '2')



df = df.drop(columns=['S2_embeddings'])  # Features
df = df.drop(columns=['VIAF_embeddings'])  # Features
df = df.drop(columns=['S2_titlelist'])  # Features
df = df.drop(columns=['VIAF_titlelist'])  # Features
df = df.drop(columns=['selected_birthyear'])  # Features




X = df.drop(columns=['match?'])  # Features
y = df['match?']  # Target


y = (y > y.mean()).astype(int)  # Binarize the target

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Define the range of thresholds
thresholds = np.linspace(0, 1, 100)

# Initialize lists to store metrics across all folds
precision_scores = np.zeros((len(thresholds), cv.get_n_splits()))

# # Perform cross-validation
# for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]

# Perform cross-validation
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train the model
    rf.fit(X_train, y_train)

    # Get predicted probabilities
    y_proba = rf.predict_proba(X_test)[:, 1]

    # Calculate precision for each threshold
    for i, threshold in enumerate(thresholds):
        y_pred_binary = (y_proba >= threshold).astype(int)
        precision_scores[i, fold_idx] = precision_score(y_test, y_pred_binary, pos_label=1)

# Average precision across all folds
avg_precision_scores = precision_scores.mean(axis=1)

# Select the optimal threshold
optimal_idx = np.argmax(avg_precision_scores)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal Threshold for Precision: {optimal_threshold}")
print(f"Highest Average Precision: {avg_precision_scores[optimal_idx]}")

# Optional: Train the final model on the entire dataset and evaluate
rf.fit(X, y)
y_proba_final = rf.predict_proba(X)[:, 1]
y_pred_final = (y_proba_final >= optimal_threshold).astype(int)
print(classification_report(y, y_pred_final))

In [None]:
print(classification_report(y, y_pred_final))

In [5]:
# Output the results for each fold
for result in results:
    print(f"Fold {result['fold']} - Best Threshold: {result['best_threshold']}, Best Precision: {result['best_precision']}")


Fold 1 - Best Threshold: 0.020202020202020204, Best Precision: 0.24166666666666667
Fold 2 - Best Threshold: 0.9292929292929294, Best Precision: 0.5
Fold 3 - Best Threshold: 0.25252525252525254, Best Precision: 0.25
Fold 4 - Best Threshold: 0.7676767676767677, Best Precision: 1.0
Fold 5 - Best Threshold: 0.6161616161616162, Best Precision: 0.35294117647058826


In [6]:
optimal_threshold

0.7676767676767677

In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score

# Assuming df is your DataFrame
X = df.drop(columns=['match?'])  # Features
y = df['match?']  # Target

# Binarize the target if needed
y = (y > y.mean()).astype(int)

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent', etc.
X_imputed = imputer.fit_transform(X)

# Initialize the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Define the range of thresholds
thresholds = np.linspace(0, 1, 100)

# Initialize lists to store metrics for training and test sets
train_precision_scores = []
test_precision_scores = []

# Perform cross-validation
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_imputed, y)):
    X_train, X_test = X_imputed[train_idx], X_imputed[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Apply SMOTE to the training data only
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Train the model on the resampled training set
    rf.fit(X_train_resampled, y_train_resampled)

    # Get predicted probabilities for the training set
    y_proba_train = rf.predict_proba(X_train_resampled)[:, 1]

    # Determine the best threshold based on training set
    best_threshold = None
    best_precision_train = 0

    for threshold in thresholds:
        y_pred_train = (y_proba_train >= threshold).astype(int)
        precision_train = precision_score(y_train_resampled, y_pred_train, pos_label=1)

        if precision_train > best_precision_train:
            best_precision_train = precision_train
            best_threshold = threshold

    # Store the best precision for the training set
    train_precision_scores.append(best_precision_train)

    # Now use the best threshold to make predictions on the test set
    y_proba_test = rf.predict_proba(X_test)[:, 1]
    y_pred_test = (y_proba_test >= best_threshold).astype(int)

    # Calculate and store precision for the test set
    precision_test = precision_score(y_test, y_pred_test, pos_label=1)
    test_precision_scores.append(precision_test)

    print(f"Fold {fold_idx + 1} - Best Threshold: {best_threshold}")
    print(f"Precision on Training Set: {best_precision_train}")
    print(f"Precision on Test Set: {precision_test}\n")

# Calculate the average precision across all folds
avg_train_precision = np.mean(train_precision_scores)
avg_test_precision = np.mean(test_precision_scores)

print(f"Average Precision on Training Set: {avg_train_precision}")
print(f"Average Precision on Test Set: {avg_test_precision}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 - Best Threshold: 0.42424242424242425
Precision on Training Set: 1.0
Precision on Test Set: 0.0

Fold 2 - Best Threshold: 0.38383838383838387
Precision on Training Set: 1.0
Precision on Test Set: 0.27692307692307694

Fold 3 - Best Threshold: 0.393939393939394
Precision on Training Set: 1.0
Precision on Test Set: 0.24561403508771928

Fold 4 - Best Threshold: 0.36363636363636365
Precision on Training Set: 1.0
Precision on Test Set: 0.35294117647058826

Fold 5 - Best Threshold: 0.38383838383838387
Precision on Training Set: 1.0
Precision on Test Set: 0.27722772277227725

Average Precision on Training Set: 1.0
Average Precision on Test Set: 0.23054120225073235


In [13]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, classification_report

# Assuming df is your DataFrame
X = df.drop(columns=['match?'])  # Features
y = df['match?']  # Target

# Binarize the target if needed
y = (y > y.mean()).astype(int)

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent', etc.
X_imputed = imputer.fit_transform(X)

# Initialize the Random Forest model with class weights
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Define the cross-validation strategy
cv = StratifiedKFold(n_splits=5)

# Define the range of thresholds
thresholds = np.linspace(0, 1, 100)

# Initialize lists to store metrics for training and test sets
train_precision_scores = []
test_precision_scores = []
test_recall_scores = []

# Initialize lists to store counts of true labels
test_class_counts = []

# Perform cross-validation
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_imputed, y)):
    X_train, X_test = X_imputed[train_idx], X_imputed[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Count the number of instances for each class in the test set
    class_counts = y_test.value_counts()
    test_class_counts.append(class_counts)
    
    print(f"Fold {fold_idx + 1} - Test Set Class Distribution:")
    print(class_counts)
    print()

    # Apply SMOTE to the training data only
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Train the model on the resampled training set
    rf.fit(X_train_resampled, y_train_resampled)

    # Get predicted probabilities for the training set
    y_proba_train = rf.predict_proba(X_train_resampled)[:, 1]

    # Determine the best threshold based on training set
    best_threshold = None
    best_precision_train = 0

    for threshold in thresholds:
        y_pred_train = (y_proba_train >= threshold).astype(int)
        precision_train = precision_score(y_train_resampled, y_pred_train, pos_label=1, zero_division=0)

        if precision_train > best_precision_train:
            best_precision_train = precision_train
            best_threshold = threshold

    # Store the best precision for the training set
    train_precision_scores.append(best_precision_train)

    # Now use the best threshold to make predictions on the test set
    y_proba_test = rf.predict_proba(X_test)[:, 1]
    y_pred_test = (y_proba_test >= best_threshold).astype(int)

    # Calculate and store precision and recall for the test set
    precision_test = precision_score(y_test, y_pred_test, pos_label=1, zero_division=0)
    recall_test = recall_score(y_test, y_pred_test, pos_label=1, zero_division=0)
    test_precision_scores.append(precision_test)
    test_recall_scores.append(recall_test)

    print(f"Fold {fold_idx + 1} - Best Threshold: {best_threshold}")
    print(f"Precision on Training Set: {best_precision_train}")
    print(f"Precision on Test Set: {precision_test}")
    print(f"Recall on Test Set: {recall_test}\n")

# Calculate the average precision and recall across all folds
avg_train_precision = np.mean(train_precision_scores)
avg_test_precision = np.mean(test_precision_scores)
avg_test_recall = np.mean(test_recall_scores)

print(f"Average Precision on Training Set: {avg_train_precision}")
print(f"Average Precision on Test Set: {avg_test_precision}")
print(f"Average Recall on Test Set: {avg_test_recall}")

# Display aggregated class distribution across all folds
all_class_counts = pd.concat(test_class_counts, axis=1)
all_class_counts.columns = [f'Fold {i + 1}' for i in range(len(test_class_counts))]
print("Aggregated Test Set Class Distribution Across Folds:")
print(all_class_counts)


Fold 1 - Test Set Class Distribution:
match?
0    132
1     31
Name: count, dtype: int64

Fold 1 - Best Threshold: 0.42424242424242425
Precision on Training Set: 1.0
Precision on Test Set: 0.0
Recall on Test Set: 0.0

Fold 2 - Test Set Class Distribution:
match?
0    132
1     30
Name: count, dtype: int64

Fold 2 - Best Threshold: 0.38383838383838387
Precision on Training Set: 1.0
Precision on Test Set: 0.27692307692307694
Recall on Test Set: 0.6

Fold 3 - Test Set Class Distribution:
match?
0    132
1     30
Name: count, dtype: int64

Fold 3 - Best Threshold: 0.393939393939394
Precision on Training Set: 1.0
Precision on Test Set: 0.24561403508771928
Recall on Test Set: 0.4666666666666667

Fold 4 - Test Set Class Distribution:
match?
0    131
1     31
Name: count, dtype: int64

Fold 4 - Best Threshold: 0.36363636363636365
Precision on Training Set: 1.0
Precision on Test Set: 0.35294117647058826
Recall on Test Set: 0.7741935483870968

Fold 5 - Test Set Class Distribution:
match?
0    13