<a href="https://colab.research.google.com/github/LahiWeer/2330902_20230537_CM2604-ML_CW/blob/master/Preprocessing/S3%20-%20Unknown_values_imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the processed bank data straight from the course repository on GitHub
data_url = (
    'https://raw.githubusercontent.com/LahiWeer/2330902_20230537_CM2604-ML_CW/'
    'refs/heads/master/Preprocessing/Processed%20data%20in%20each%20step/'
    'processed_bank_data.csv'
)
df = pd.read_csv(data_url)

In [3]:
# Distribution of the 'housing' codes before imputation
df.loc[:, 'housing'].value_counts()

Unnamed: 0_level_0,count
housing,Unnamed: 1_level_1
1,21376
0,18427
-1,984


In [None]:
# Distribution of the 'loan' codes before imputation
df.loc[:, 'loan'].value_counts()

Unnamed: 0_level_0,count
loan,Unnamed: 1_level_1
0,33620
1,6183
-1,984


In [None]:
# Check initial counts of unknown values (encoded as -1).
# Series.eq(...).sum() counts the matches in one vectorised pass; the Python
# built-in sum() would iterate over the column element by element.
initial_housing_unknowns = df['housing'].eq(-1).sum()
initial_loan_unknowns = df['loan'].eq(-1).sum()

print(f"Initial unknown values in 'housing': {initial_housing_unknowns}")
print(f"Initial unknown values in 'loan': {initial_loan_unknowns}")

Initial unknown values in 'housing': 984
Initial unknown values in 'loan': 984


### **Conditional imputation using Decision Tree Classifier**
- Impute unknown values (encoded as `-1`) in 'housing' and 'loan' by training a Decision Tree Classifier on the rows with known values and predicting the unknown ones from the related feature columns.


In [None]:
# Function for conditional imputation using a Decision Tree Classifier
def conditional_imputation(df, target_column, feature_columns, condition_value):
    """
    Impute placeholder values in a target column with Decision Tree predictions.

    Rows whose ``target_column`` equals ``condition_value`` (e.g., -1 for 'housing',
    0 for 'poutcome') are treated as unknown. A ``DecisionTreeClassifier``
    (``max_depth=5``, ``random_state=42``) is fitted on all other rows using
    ``feature_columns`` and its predictions replace the placeholder values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; the imputation is applied to a copy,
        so re-running the calling cell always starts from the same input.
    target_column : str
        Name of the column to impute.
    feature_columns : list of str
        Names of the columns used as predictors.
    condition_value : scalar
        Placeholder value in ``target_column`` that marks a row as unknown.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every ``condition_value`` in ``target_column``
        has been replaced by the model's prediction. If no row matches, an
        unchanged copy is returned and a message is printed.

    Raises
    ------
    ValueError
        If every row matches ``condition_value``, leaving nothing to train on.
    """
    # Compute the "unknown" mask once and reuse it for splitting and assignment
    unknown_mask = df[target_column] == condition_value

    # Work on a copy so the caller's frame is left untouched
    result = df.copy()

    if not unknown_mask.any():
        print(f"No values matching {condition_value} in {target_column} for imputation.")
        return result

    known_mask = ~unknown_mask
    if not known_mask.any():
        raise ValueError(
            f"Cannot impute '{target_column}': every row equals {condition_value}, "
            "so there are no known values to train on."
        )

    # Prepare the features and target for training the model (known rows only)
    X_train = df.loc[known_mask, feature_columns]
    y_train = df.loc[known_mask, target_column]

    # Initialize the DecisionTreeClassifier and fit it on the training data;
    # the fixed random_state keeps the fitted tree reproducible across runs
    clf = DecisionTreeClassifier(max_depth=5, random_state=42)
    clf.fit(X_train, y_train)

    # Use the model to predict values for rows matching the condition
    X_condition = df.loc[unknown_mask, feature_columns]
    predicted_values = clf.predict(X_condition)

    # Replace condition values with the predicted values (row order is preserved
    # because the same mask selects both the features and the assignment target)
    result.loc[unknown_mask, target_column] = predicted_values

    return result

# Predictor columns handed to the imputation model
feature_columns = [
    'age',
    # marital-status indicator columns
    'marital_single', 'marital_married', 'marital_divorced',
    # job indicator columns
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed',
    # remaining client / contact / campaign columns
    'education', 'default', 'contact', 'month', 'day_of_week',
    'campaign', 'pdays', 'previous', 'poutcome',
    # economic-context columns
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]

# Impute 'housing' first, then 'loan'; unknown values in both are represented by -1
for target_column in ('housing', 'loan'):
    df = conditional_imputation(df, target_column, feature_columns, -1)

No values matching -1 in housing for imputation.
No values matching -1 in loan for imputation.


In [None]:
# Verify the imputation results: no -1 placeholders should remain.
# Series.eq(...).sum() counts the matches in one vectorised pass; the Python
# built-in sum() would iterate over the column element by element.
remaining_housing_unknowns = df['housing'].eq(-1).sum()
remaining_loan_unknowns = df['loan'].eq(-1).sum()

print(f"Unknown values in 'housing' after imputation: {remaining_housing_unknowns}")
print(f"Unknown values in 'loan' after imputation: {remaining_loan_unknowns}")

Unknown values in 'housing' after imputation: 0
Unknown values in 'loan' after imputation: 0


In [None]:
# Write the imputed frame to CSV in the working directory, without the index column
output_path = "processed_bank_data_final.csv"
df.to_csv(output_path, index=False)

# Report where the file was written
print(f"Imputation for 'housing',  and 'loan' complete. The dataset has been saved to '{output_path}'.")

Imputation for 'housing',  and 'loan' complete. The dataset has been saved to 'processed_bank_data_final.csv'.


In [4]:
# Read the final imputed dataset from the course repository on GitHub.
# NOTE(review): this loads the hosted copy, not the CSV written by the save cell
# in this notebook - confirm the two are in sync.
data_url = (
    'https://raw.githubusercontent.com/LahiWeer/2330902_20230537_CM2604-ML_CW/'
    'refs/heads/master/Preprocessing/Processed%20data%20in%20each%20step/'
    'processed_bank_data_final.csv'
)
data = pd.read_csv(data_url)

In [5]:
# Distribution of 'housing' in the reloaded dataset
data.loc[:, 'housing'].value_counts()

Unnamed: 0_level_0,count
housing,Unnamed: 1_level_1
1,22050
0,18737


In [6]:
# Distribution of 'loan' in the reloaded dataset
data.loc[:, 'loan'].value_counts()

Unnamed: 0_level_0,count
loan,Unnamed: 1_level_1
0,34604
1,6183
