In [1]:
import pandas as pd

In [2]:
# Load the dataset
# Reads processed_bank_data.csv (path is relative to the working directory) into `data`.
data = pd.read_csv("processed_bank_data.csv")

In [3]:
# Tabulate how often each encoded 'housing' value occurs
# (the imputation cell later in this notebook treats -1 as "unknown").
data['housing'].value_counts()

housing
 1    21376
 0    18427
-1      984
Name: count, dtype: int64

In [4]:
# Tabulate how often each encoded 'loan' value occurs
# (the imputation cell later in this notebook treats -1 as "unknown").
data['loan'].value_counts()

loan
 0    33620
 1     6183
-1      984
Name: count, dtype: int64

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
df = pd.read_csv("processed_bank_data.csv")

# Check initial counts of missing or unknown values (encoded as -1).
# Series.sum() counts the True entries in one vectorized pass; the builtin
# sum() would iterate over the Series element by element in Python.
initial_housing_unknowns = int((df['housing'] == -1).sum())
initial_loan_unknowns = int((df['loan'] == -1).sum())

print(f"Initial unknown values in 'housing': {initial_housing_unknowns}")
print(f"Initial unknown values in 'loan': {initial_loan_unknowns}")

# Function for conditional imputation using a Decision Tree Classifier
def conditional_imputation(df, target_column, feature_columns, condition_value):
    """
    Impute placeholder values in one column with a decision-tree classifier.

    Rows where ``df[target_column] == condition_value`` are treated as unknown
    (e.g., -1 for 'housing', 0 for 'poutcome'). A ``DecisionTreeClassifier``
    (``max_depth=5``, ``random_state=42``) is fitted on the remaining rows using
    ``feature_columns`` as predictors, and its predictions overwrite the
    placeholder entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to impute. NOTE: the frame is modified in place; the same object
        is also returned so the call can be written as ``df = f(df, ...)``.
    target_column : str
        Column whose placeholder values are replaced.
    feature_columns : list of str
        Columns used as predictors for the classifier.
    condition_value : scalar
        Value in ``target_column`` that marks an entry as unknown.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with placeholders in ``target_column`` replaced. When no
        row matches ``condition_value`` a message is printed and ``df`` is
        returned untouched.

    Raises
    ------
    ValueError
        If every row matches ``condition_value``, leaving no known rows to
        train the classifier on.
    """
    # Compute the "unknown" mask once and reuse it for the split and the write-back.
    unknown_mask = df[target_column] == condition_value

    if not unknown_mask.any():
        print(f"No values matching {condition_value} in {target_column} for imputation.")
        return df

    # Fail early with a clear message instead of letting the classifier choke
    # on an empty training set.
    if unknown_mask.all():
        raise ValueError(
            f"Cannot impute '{target_column}': every row equals {condition_value!r}, "
            "so there are no known values to train on."
        )

    # Train on the rows whose target value is known
    known_rows = df.loc[~unknown_mask]
    X_train = known_rows[feature_columns]
    y_train = known_rows[target_column]

    # Fixed depth and seed keep the imputation reproducible across runs
    clf = DecisionTreeClassifier(max_depth=5, random_state=42)
    clf.fit(X_train, y_train)

    # Predict the target for the unknown rows and write the predictions back in place
    X_condition = df.loc[unknown_mask, feature_columns]
    df.loc[unknown_mask, target_column] = clf.predict(X_condition)

    return df

# Features used for imputation (neither target column, 'housing' nor 'loan',
# appears here, so one imputation does not feed the other)
feature_columns = ['age', 'marital_single', 'marital_married', 'marital_divorced',
                   'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
                   'job_retired', 'job_self-employed', 'job_services', 'job_student',
                   'job_technician', 'job_unemployed', 'education', 'default', 'contact',
                   'month', 'day_of_week', 'campaign', 'pdays', 'previous', 'poutcome',
                   'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Imputation for 'housing' (unknown values represented by -1)
df = conditional_imputation(df, 'housing', feature_columns, -1)

# Imputation for 'loan' (unknown values represented by -1)
df = conditional_imputation(df, 'loan', feature_columns, -1)

# Verify the results. Series.sum() counts the remaining placeholders in one
# vectorized pass instead of iterating with the builtin sum().
remaining_housing_unknowns = int((df['housing'] == -1).sum())
remaining_loan_unknowns = int((df['loan'] == -1).sum())

print(f"Unknown values in 'housing' after imputation: {remaining_housing_unknowns}")
print(f"Unknown values in 'loan' after imputation: {remaining_loan_unknowns}")

# Fail loudly if any placeholder survived, rather than relying on someone
# reading the printed counts.
assert remaining_housing_unknowns == 0, "'housing' still contains -1 placeholders"
assert remaining_loan_unknowns == 0, "'loan' still contains -1 placeholders"

Initial unknown values in 'housing': 984
Initial unknown values in 'loan': 984
Unknown values in 'housing' after imputation: 0
Unknown values in 'loan' after imputation: 0


In [25]:
# Save the updated data
# index=False keeps the DataFrame's row index out of the written CSV.
# NOTE(review): this cell shows execution count 25 while its neighbours show 5 and 6;
# confirm the notebook still reproduces on Restart Kernel -> Run All.
df.to_csv("processed_bank_data_final.csv", index=False)

# Final confirmation
# NOTE(review): the message contains a stray comma and a double space
# ("'housing',  and 'loan'"); left untouched here because it is runtime output.
print("Imputation for 'housing',  and 'loan' complete. The dataset has been saved to 'processed_bank_data_final.csv'.")

Imputation for 'housing',  and 'loan' complete. The dataset has been saved to 'processed_bank_data_final.csv'.


In [6]:
# Load the dataset
# Re-reads processed_bank_data_final.csv from disk and rebinds `data` to it,
# so the counts below reflect the saved file rather than an in-memory frame.
data = pd.read_csv("processed_bank_data_final.csv")

# Tabulate the encoded 'housing' values in the reloaded file
data['housing'].value_counts()

housing
1    22050
0    18737
Name: count, dtype: int64

In [7]:
# Tabulate the encoded 'loan' values in the current `data` frame
data['loan'].value_counts()

loan
0    34604
1     6183
Name: count, dtype: int64

In [11]:
# Reload from disk so this cell is idempotent: re-running it always starts
# from the saved file rather than from an already-converted frame.
data = pd.read_csv("processed_bank_data_final.csv")

# Specify all columns to be converted to 'category', including one-hot encoded columns
categorical_columns = ['default', 'housing', 'loan', 'contact', 'month',
                       'day_of_week', 'poutcome', 'education',
                       'marital_single', 'marital_married', 'marital_divorced']

# Add the one-hot encoded job columns to the list
job_columns = [col for col in data.columns if col.startswith('job_')]
categorical_columns.extend(job_columns)

# Convert all specified columns to categorical dtype
data[categorical_columns] = data[categorical_columns].astype('category')

# Verify the changes. DataFrame.info() prints the summary itself and returns
# None, so wrapping it in print() would append a spurious "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40787 entries, 0 to 40786
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                40787 non-null  int64   
 1   job_blue-collar    40787 non-null  category
 2   job_entrepreneur   40787 non-null  category
 3   job_housemaid      40787 non-null  category
 4   job_management     40787 non-null  category
 5   job_retired        40787 non-null  category
 6   job_self-employed  40787 non-null  category
 7   job_services       40787 non-null  category
 8   job_student        40787 non-null  category
 9   job_technician     40787 non-null  category
 10  job_unemployed     40787 non-null  category
 11  marital_single     40787 non-null  category
 12  marital_married    40787 non-null  category
 13  marital_divorced   40787 non-null  category
 14  education          40787 non-null  category
 15  default            40787 non-null  category
 16  hous