In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Load the CSV file
df = pd.read_csv(r'C:\Programmieren\Jupyter Lab\Datasets\adult.csv')

In [3]:
# Create a LabelEncoder object
le = LabelEncoder()

# List of attributes to encode
attributes_to_encode = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']

# Apply the encoder to each attribute
for attribute in attributes_to_encode:
    df[attribute] = le.fit_transform(df[attribute])

# Display the DataFrame
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0


In [4]:
# Defining the uncertain features
uncertain_features = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation',
                      'relationship', 'capital-gain', 'capital-loss', 'hours-per-week']

# List of thresholds
thresholds = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# A dictionary to store the modified dataframes
modified_dfs = {}

# Inserting missing values
for thresh in thresholds:
    # Create a copy of the original dataframe
    df_copy = df.copy()
    
    # For each uncertain feature, set a proportion of the data to NaN at random
    for feature in uncertain_features:
        # Find the number of entries to set to NaN
        num_nan = int(thresh * len(df_copy[feature]))
        
        # Generate random indices for NaN insertion
        nan_indices = np.random.choice(df_copy.index, num_nan, replace=False)
        
        # Insert NaNs
        df_copy.loc[nan_indices, feature] = np.nan
    
    # Store the modified dataframe in the dictionary
    modified_dfs[f"df_{int(thresh*100)}"] = df_copy

In [5]:
# A dictionary to store the imputed dataframes
imputed_dfs = {}

# Imputing missing values
for key, mod_df in modified_dfs.items():
    # Create copies of the modified dataframe for each imputation method
    df_mean = mod_df.copy()
    df_median = mod_df.copy()
    df_mode = mod_df.copy()
    
    for feature in uncertain_features:
        # Impute missing values with mean
        df_mean[feature].fillna(df_mean[feature].mean(), inplace=True)
        
        # Impute missing values with median
        df_median[feature].fillna(df_median[feature].median(), inplace=True)
        
        # Impute missing values with mode
        df_mode[feature].fillna(df_mode[feature].mode()[0], inplace=True)
    
    # Store the imputed dataframes in the dictionary
    imputed_dfs[f"{key}_mean"] = df_mean
    imputed_dfs[f"{key}_median"] = df_median
    imputed_dfs[f"{key}_mode"] = df_mode

In [8]:
# Initialize an empty DataFrame to store the results
results = pd.DataFrame(columns=['DataFrame', 'Missing', 'Imputation', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC'])

# Create an index counter
idx = 0

# For each DataFrame in the dictionary
for key, imp_df in imputed_dfs.items():
    # Split the DataFrame into features and target
    X = imp_df.drop(target, axis=1)
    y = imp_df[target]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the decision tree
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = dt.predict(X_test)

    # Compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, dt.predict_proba(X_test)[:, 1])

    # Parse the key to get the percentage of missing values and the imputation method
    missing, imputation = key.split('_')[1], key.split('_')[2]

    # Add the results to the results DataFrame
    results.loc[idx] = [key, missing, imputation, accuracy, precision, recall, f1, roc_auc]
    
    # Increment the index counter
    idx += 1

# To export the results to an Excel file
results.to_excel('decision_tree_results.xlsx', index=False)

In [11]:
from sklearn.model_selection import cross_val_score

# Initialize an empty DataFrame to store the results
results = pd.DataFrame(columns=['DataFrame', 'Missing', 'Imputation', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC'])

# Create an index counter
idx = 0

# For each DataFrame in the dictionary
for key, imp_df in imputed_dfs.items():
    # Split the DataFrame into features and target
    X = imp_df.drop(target, axis=1)
    y = imp_df[target]
    
    # Create and train the decision tree
    dt = DecisionTreeClassifier()
    
    # Compute the cross-validated metrics
    accuracy = cross_val_score(dt, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(dt, X, y, cv=5, scoring='precision_weighted').mean()
    recall = cross_val_score(dt, X, y, cv=5, scoring='recall_weighted').mean()
    f1 = cross_val_score(dt, X, y, cv=5, scoring='f1_weighted').mean()
    roc_auc = cross_val_score(dt, X, y, cv=5, scoring='roc_auc').mean()

    # Parse the key to get the percentage of missing values and the imputation method
    missing, imputation = key.split('_')[1], key.split('_')[2]

    # Add the results to the results DataFrame
    results.loc[idx] = [key, missing, imputation, accuracy, precision, recall, f1, roc_auc]
    
    # Increment the index counter
    idx += 1

In [12]:
results

Unnamed: 0,DataFrame,Missing,Imputation,Accuracy,Precision,Recall,F1-Score,ROC AUC
0,df_5_mean,5,mean,0.806171,0.808141,0.805577,0.807294,0.738176
1,df_5_median,5,median,0.803571,0.807907,0.804512,0.8065,0.738133
2,df_5_mode,5,mode,0.807543,0.810394,0.808136,0.808945,0.74224
3,df_10_mean,10,mean,0.801175,0.803067,0.800704,0.802074,0.732059
4,df_10_median,10,median,0.800274,0.802511,0.798125,0.800007,0.730354
5,df_10_mode,10,mode,0.802998,0.804513,0.801912,0.802982,0.732276
6,df_20_mean,20,mean,0.793334,0.797105,0.792351,0.794656,0.727412
7,df_20_median,20,median,0.791184,0.793935,0.791163,0.792131,0.71967
8,df_20_mode,20,mode,0.79749,0.797972,0.797326,0.798349,0.727076
9,df_30_mean,30,mean,0.781172,0.783078,0.781336,0.781324,0.706969


In [14]:
results.to_excel('cross_validation_results.xlsx', index=False)