In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pymc3 as pm



In [2]:
# Load the CSV file
df = pd.read_csv(r'C:\Programmieren\Jupyter Lab\Datasets\adult.csv')

In [3]:
# Create a LabelEncoder object
le = LabelEncoder()

# List of attributes to encode
attributes_to_encode = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Apply the encoder to each attribute
for attribute in attributes_to_encode:
    df[attribute] = le.fit_transform(df[attribute])

# Display the DataFrame
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0


In [4]:
# Defining the uncertain features
uncertain_features = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation',
                      'relationship', 'capital-gain', 'capital-loss', 'hours-per-week']

# List of thresholds
thresholds = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# A dictionary to store the modified dataframes
modified_dfs = {}

# Inserting missing values
for thresh in thresholds:
    # Create a copy of the original dataframe
    df_copy = df.copy()
    
    # For each uncertain feature, set a proportion of the data to NaN at random
    for feature in uncertain_features:
        # Find the number of entries to set to NaN
        num_nan = int(thresh * len(df_copy[feature]))
        
        # Generate random indices for NaN insertion
        nan_indices = np.random.choice(df_copy.index, num_nan, replace=False)
        
        # Insert NaNs
        df_copy.loc[nan_indices, feature] = np.nan
    
    # Store the modified dataframe in the dictionary
    modified_dfs[f"df_{int(thresh*100)}"] = df_copy

In [5]:
df_90 = modified_dfs["df_90"]

# Categorization

In [6]:
# Perform imputation using IterativeImputer
imputer = IterativeImputer(random_state=42)
df_imputed = pd.DataFrame(imputer.fit_transform(df_90), columns=df_90.columns)

# Remove missing values from all columns in the imputed DataFrame
df_imputed_cleaned = df_imputed.dropna()

# Check if the DataFrame has any remaining rows
if df_imputed_cleaned.shape[0] < 1:
    print("Insufficient data points after imputation and dropping missing values.")
else:
    # Splitting the data into features (X) and target (y)
    X = df_imputed_cleaned.drop(columns=['income'])  # Remove 'income' if it exists
    y = df_imputed_cleaned['income']

    best_accuracy = 0.0
    best_bins = None

    # Iterate over different numbers of bins
    for num_bins in range(4, 41):
        # Perform quantile binning on the 'age' variable
        df_imputed_cleaned['age_category'] = pd.qcut(df_imputed_cleaned['age'], q=num_bins, labels=False, duplicates='drop')

        # Splitting the binned data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Build a decision tree classifier
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)

        # Evaluate the decision tree's performance
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Update the best bins if the accuracy is higher
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_bins = num_bins

    # The best number of bins and its corresponding accuracy
    print("Best Number of Bins:", best_bins)
    print("Best Accuracy:", best_accuracy)

Best Number of Bins: 4
Best Accuracy: 0.9926297471593817


In [7]:
# Categorize age into 4 quantiles
df_imputed_cleaned['age'] = pd.qcut(df_imputed_cleaned['age'], q=4, labels=False, duplicates='drop')

In [8]:
# For fnlwgt

# Perform imputation using IterativeImputer
imputer = IterativeImputer(random_state=42)
df_imputed = pd.DataFrame(imputer.fit_transform(df_90), columns=df_90.columns)

# Remove missing values from all columns in the imputed DataFrame
df_imputed_cleaned = df_imputed.dropna()

# Check if the DataFrame has any remaining rows
if df_imputed_cleaned.shape[0] < 1:
    print("Insufficient data points after imputation and dropping missing values.")
else:
    # Splitting the data into features (X) and target (y)
    X = df_imputed_cleaned.drop(columns=['income'])  # Remove 'income' if it exists
    y = df_imputed_cleaned['income']

    best_accuracy = 0.0
    best_bins = None

    # Iterate over different numbers of bins
    for num_bins in range(4, 41):
        # Perform quantile binning on the 'fnlwgt' variable
        df_imputed_cleaned['fnlwgt_category'] = pd.qcut(df_imputed_cleaned['fnlwgt'], q=num_bins, labels=False, duplicates='drop')

        # Splitting the binned data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Build a decision tree classifier
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)

        # Evaluate the decision tree's performance
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Update the best bins if the accuracy is higher
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_bins = num_bins

    # The best number of bins and its corresponding accuracy
    print("Best Number of Bins for fnlwgt:", best_bins)
    print("Best Accuracy for fnlwgt:", best_accuracy)

Best Number of Bins for fnlwgt: 4
Best Accuracy for fnlwgt: 0.9926297471593817


In [9]:
# Categorize fnlwgt into 4 quantiles
df_90['fnlwgt'] = pd.qcut(df_90['fnlwgt'], q=4, labels=False, duplicates='drop')

In [10]:
# Categorize capital-gain and capital-loss with domain knowledge
df_90['capital-gain'] = df_90['capital-gain'].apply(lambda x: 0 if x == 0 else 1)
df_90['capital-loss'] = df_90['capital-loss'].apply(lambda x: 0 if x == 0 else 1)

In [11]:
df_90['age'].describe()

count    4885.00000
mean       38.59652
std        13.76627
min        17.00000
25%        27.00000
50%        37.00000
75%        48.00000
max        90.00000
Name: age, dtype: float64

In [12]:
# Categorize age with domain knowledge

# Define the age categories
age_bins = [17, 30, 40, 50, 60, 70, 90]
age_labels = [1, 2, 3, 4, 5, 6]  # Assign labels for the categories

# Categorize age
df_90['age'] = pd.cut(df_90['age'], bins=age_bins, labels=age_labels, include_lowest=True)

In [13]:
df_90['age'].describe()

count     4885
unique       6
top          1
freq      1590
Name: age, dtype: int64

In [14]:
df_90['hours-per-week'].describe()

count    4885.000000
mean       40.657728
std        12.641693
min         1.000000
25%        40.000000
50%        40.000000
75%        45.000000
max        99.000000
Name: hours-per-week, dtype: float64

In [15]:
df_90['hours-per-week'] = pd.cut(df_90['hours-per-week'], bins=[0, 20, 30, 40, 50, 100], labels=[1, 2, 3, 4, 5], right=False)

In [16]:
# Select the columns to be one-hot encoded (excluding 'income')
columns_to_encode = df_90.columns[df_90.columns != 'income']

# Perform one-hot encoding for each selected column
encoded_columns = []
for column in columns_to_encode:
    encoded = pd.get_dummies(df_90[column], dummy_na=True, prefix=column)
    encoded_columns.append(encoded)

# Concatenate the encoded columns with the original DataFrame
df_encoded = pd.concat([df_90.drop(columns=columns_to_encode), *encoded_columns], axis=1)

In [17]:
# Convert boolean values to 0/1 in the encoded DataFrame
df_encoded = df_encoded.astype(int)

In [18]:
df_encoded.head(10)

Unnamed: 0,income,age_1,age_2,age_3,age_4,age_5,age_6,age_nan,workclass_0.0,workclass_1.0,...,native-country_33.0,native-country_34.0,native-country_35.0,native-country_36.0,native-country_37.0,native-country_38.0,native-country_39.0,native-country_40.0,native-country_41.0,native-country_nan
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [19]:
# Get the column names as a list
column_list = df_encoded.columns.tolist()

# Print the list of column names
print(column_list)

['income', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_nan', 'workclass_0.0', 'workclass_1.0', 'workclass_2.0', 'workclass_3.0', 'workclass_4.0', 'workclass_5.0', 'workclass_6.0', 'workclass_7.0', 'workclass_nan', 'fnlwgt_0.0', 'fnlwgt_1.0', 'fnlwgt_2.0', 'fnlwgt_3.0', 'fnlwgt_nan', 'education_0.0', 'education_1.0', 'education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0', 'education_nan', 'educational-num_1.0', 'educational-num_2.0', 'educational-num_3.0', 'educational-num_4.0', 'educational-num_5.0', 'educational-num_6.0', 'educational-num_7.0', 'educational-num_8.0', 'educational-num_9.0', 'educational-num_10.0', 'educational-num_11.0', 'educational-num_12.0', 'educational-num_13.0', 'educational-num_14.0', 'educational-num_15.0', 'educational-num_16.0', 'educational-num_nan', 'marital

# Bayesian Statistics and MHA

In [26]:
# List of all one-hot encoded variables
all_variables = [
    ['age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6'], # age
    ['workclass_1.0', 'workclass_2.0', 'workclass_4.0', 'workclass_5.0', 'workclass_6.0', 'workclass_7.0'], # workclass
    ['fnlwgt_0.0', 'fnlwgt_1.0', 'fnlwgt_2.0', 'fnlwgt_3.0'], # fnlwgt
    ['education_0.0', 'education_1.0', 'education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0'], # education 
    ['educational-num_2.0', 'educational-num_3.0', 'educational-num_4.0', 'educational-num_5.0', 'educational-num_6.0', 'educational-num_7.0', 'educational-num_8.0', 'educational-num_9.0', 'educational-num_10.0', 'educational-num_11.0', 'educational-num_12.0', 'educational-num_13.0','educational-num_14.0', 'educational-num_15.0', 'educational-num_16.0'], # educational-num
    ['marital-status_1.0', 'marital-status_2.0', 'marital-status_3.0', 'marital-status_4.0', 'marital-status_5.0', 'marital-status_6.0'],
    ['occupation_1.0', 'occupation_2.0', 'occupation_3.0', 'occupation_4.0', 'occupation_5.0', 'occupation_6.0', 'occupation_7.0', 'occupation_8.0', 'occupation_9.0', 'occupation_10.0', 'occupation_11.0', 'occupation_12.0', 'occupation_13.0', 'occupation_14.0'],
    ['relationship_0.0', 'relationship_1.0', 'relationship_2.0', 'relationship_3.0', 'relationship_4.0', 'relationship_5.0'],
    ['capital-gain_0.0', 'capital-gain_1.0'],
    ['capital-loss_0.0', 'capital-loss_1.0'],
    ['hours-per-week_1', 'hours-per-week_2', 'hours-per-week_3', 'hours-per-week_4', 'hours-per-week_5']
]

In [27]:
# Assuming df_encoded is your DataFrame
df_imputed1 = df_encoded.copy()


# Create a dictionary to store the calculated probabilities for each variable
variable_probs = {}

# Loop over all variables
for variable_cols in all_variables:
    # Create a mask for rows with missing values in any of the current variable's columns
    mask = df_imputed1[variable_cols].isnull().any(axis=1)

    # Calculate the proportions of each category of the current variable based on the observed data
    probs = df_imputed1.loc[~mask, variable_cols].sum() / df_imputed1.loc[~mask, variable_cols].sum().sum()

    # Store the calculated proportions in the dictionary
    variable_probs[variable_cols[0]] = probs.values

        # Generate samples from a Dirichlet distribution using the calculated probabilities
    with pm.Model() as model:
        theta = pm.Dirichlet('theta', a=probs.values, shape=len(probs))
        variable_imputed = pm.Multinomial('variable', n=1, p=theta, shape=len(probs))

        step = pm.NUTS(target_accept=0.95)  # Explicitly specify NUTS sampler
        trace = pm.sample(5000, tune=1000, step=step, cores=1)

    # Calculate the mean of the samples to get the probabilities
    mean_probs = trace['variable'].mean(axis=0)

    # Create a mask for rows where all 'variable_cols' columns are zero
    zero_mask = (df_encoded[variable_cols] == 0).all(axis=1)

    # Replace the zeros with the calculated probabilities
    df_encoded.loc[zero_mask, variable_cols] = mean_probs

  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 27 seconds.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 31 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 12 seconds.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 360 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
There were 760 divergences after tuning. Increase `target_accept` or reparameterize.
There were 1508 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 10% for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 87 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
There were 23 divergences after tuning. Increase `target_accept` or reparameterize.
There were 409 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 10% for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 55 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
There were 570 divergences after tuning. Increase `target_accept` or reparameterize.
There were 1154 divergences after tuning. Increase `target_accept` or reparameterize.
The rhat statistic is larger than 1.2 for some parameters.
The estimated number of effective samples is smaller than 200 for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 198 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
There were 960 divergences after tuning. Increase `target_accept` or reparameterize.
There were 1963 divergences after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 10% for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 22 seconds.
  (between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 7 seconds.
The number of effective samples is smaller than 10% for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 6 seconds.
The number of effective samples is smaller than 10% for some parameters.
  return wrapped_(*args_, **kwargs_)
Sequential sampling (2 chains in 1 job)
CompoundStep
>NUTS: [theta]
>Metropolis: [variable]


Sampling 2 chains for 1_000 tune and 5_000 draw iterations (2_000 + 10_000 draws total) took 24 seconds.
The acceptance probability does not match the target. It is 0.9063808970436261, but should be close to 0.95. Try to increase the number of tuning steps.
The rhat statistic is larger than 1.05 for some parameters. This indicates slight problems during sampling.
The estimated number of effective samples is smaller than 200 for some parameters.


In [29]:
# Get the column names as a list
column_list = df_encoded.columns.tolist()

# Print the list of column names
print(column_list)

['income', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_nan', 'workclass_0.0', 'workclass_1.0', 'workclass_2.0', 'workclass_3.0', 'workclass_4.0', 'workclass_5.0', 'workclass_6.0', 'workclass_7.0', 'workclass_nan', 'fnlwgt_0.0', 'fnlwgt_1.0', 'fnlwgt_2.0', 'fnlwgt_3.0', 'fnlwgt_nan', 'education_0.0', 'education_1.0', 'education_2.0', 'education_3.0', 'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0', 'education_8.0', 'education_9.0', 'education_10.0', 'education_11.0', 'education_12.0', 'education_13.0', 'education_14.0', 'education_15.0', 'education_nan', 'educational-num_1.0', 'educational-num_2.0', 'educational-num_3.0', 'educational-num_4.0', 'educational-num_5.0', 'educational-num_6.0', 'educational-num_7.0', 'educational-num_8.0', 'educational-num_9.0', 'educational-num_10.0', 'educational-num_11.0', 'educational-num_12.0', 'educational-num_13.0', 'educational-num_14.0', 'educational-num_15.0', 'educational-num_16.0', 'educational-num_nan', 'marital

In [33]:
# List of columns to delete
columns_to_delete = ['age_nan', 'workclass_nan', 'fnlwgt_nan', 'education_nan', 'educational-num_nan', 
                     'marital-status_nan', 'occupation_nan', 'relationship_nan', 'race_nan', 'gender_nan', 
                     'capital-gain_nan', 'capital-loss_nan', 'hours-per-week_nan', 'native-country_nan']

# Delete the columns from the DataFrame
df_encoded = df_encoded.drop(columns_to_delete, axis=1)

# Normal decision Tree

In [36]:
# Split the data into features (X) and target variable (y)
X = df_encoded.drop('income', axis=1)  # Assuming 'income' is the target variable
y = df_encoded['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Predict the income for the test set
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1 score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Calculate ROC AUC score
y_pred_prob = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC AUC Score:", roc_auc)

Accuracy: 0.7601596888115467
Precision: 0.4669987546699875
Recall: 0.16375545851528384
F1 Score: 0.2424830261881668
ROC AUC Score: 0.6906119375882749


In [38]:
from fuzzytree import FuzzyDecisionTreeClassifier

# Initialize the fuzzy decision tree classifier
clf_fuzz = FuzzyDecisionTreeClassifier().fit(X_train, y_train)

# Fit the classifier
clf_fuzz = FuzzyDecisionTreeClassifier().fit(X_train, y_train)

# Predict on the test set
y_pred = clf_fuzz.predict(X_test)



In [39]:
# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy: 0.7601596888115467
Precision: 0.4669987546699875
Recall: 0.16375545851528384
F1 Score: 0.2424830261881668
ROC AUC: 0.6906119375882749
