## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import math
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline as skPipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn. metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline

### Read data into DFs

In [None]:
# 1. Column names: 14 features followed by the income label
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'pred',  # label (<=50K / >50K)
]

# 2. Read raw data. header=None treats the first row as data and the names
#    above are supplied instead; index_col=False keeps the first column as a
#    regular column rather than the index.
original_train_df, original_test_df = (
    pd.read_csv(csv_path, header=None, names=headers, index_col=False)
    for csv_path in ("census-income.data.csv", "census-income.test.csv")
)

# Work on copies so the raw frames stay available unchanged
train_df, test_df = original_train_df.copy(), original_test_df.copy()

In [None]:
#strip whitespace

def data_to_str(df):
    """
    Trim leading/trailing whitespace in every object-dtype column of ``df``.

    The frame is modified in place and also returned; columns of any other
    dtype are left untouched.
    """
    # Iterating a DataFrame yields its column labels.
    for text_col in df.select_dtypes(include=['object']):
        df[text_col] = df[text_col].str.strip()
    return df

In [None]:
def strip_whitespace(df):
    """
    Remove trailing periods from the 'pred' column (e.g. '>50K.' -> '>50K').

    Despite its name, this helper does not strip whitespace; it only rewrites
    'pred', in place, and returns the same frame.
    """
    labels = df['pred']
    df['pred'] = labels.str.rstrip('.')
    return df

In [None]:
def replace_qmarks(df):
    """
    Convert every cell equal to "?" into NaN.

    The replacement happens in place; the same frame is returned.
    """
    df.replace(to_replace="?", value=np.nan, inplace=True)
    return df

In [None]:
#convert numeric columns to integer types

def data_to_int32(df):
    """
    Cast the known numeric columns of ``df`` to int32.

    Columns are converted one at a time, in place, and the same frame is
    returned. ``astype('int32')`` cannot represent NaN, so a nullable dtype
    such as Int64 would be needed if missing values had to be kept.
    """
    numeric_columns = (
        'age',
        'fnlwgt',
        'education-num',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
    )
    for name in numeric_columns:
        df[name] = df[name].astype('int32')

    return df

In [None]:
def preprocessing(df):
    """
    Run the cleaning helpers in order and return the de-duplicated frame.

    Steps: strip whitespace from object columns, remove the trailing period
    from 'pred', turn "?" into NaN, cast the numeric columns to int32, then
    drop duplicate rows. The helpers modify ``df`` in place; the final
    ``drop_duplicates`` returns a new frame.
    """
    cleaned = (
        df.pipe(data_to_str)
          .pipe(strip_whitespace)
          .pipe(replace_qmarks)
          .pipe(data_to_int32)
    )
    return cleaned.drop_duplicates()

In [None]:
# Clean both splits with the same pipeline; each name is rebound to the
# de-duplicated frame returned by preprocessing().
train_df = preprocessing(train_df)
test_df = preprocessing(test_df)

In [None]:
# Plotting copy of the training data with the label recoded as 1 ('>50K') / 0
chart_df = train_df.assign(pred=np.where(train_df['pred'] == '>50K', 1, 0))

chart_df

In [None]:
def plot_prevalence_subplots(df, categories, pred_col='pred'):
    """
    Draw one bar chart per column in ``categories`` showing the mean of
    ``pred_col`` within each group of that column.

    Parameters
    ----------
    df : DataFrame
        Must contain every column named in ``categories`` plus ``pred_col``.
    categories : iterable of str
        Columns to group by; each gets its own subplot, three per row.
    pred_col : str, default 'pred'
        Column averaged per group. The axis label calls this a proportion,
        which holds when the column is coded 0/1.

    Returns
    -------
    None. The figure is shown with ``plt.show()``. When ``categories`` is
    empty nothing is drawn and the function returns immediately.
    """
    categories = list(categories)
    n = len(categories)
    if n == 0:
        # plt.subplots() rejects a 0x0 grid, so there is nothing to draw.
        return

    rows = (n + 2) // 3          # auto-fit 3 plots per row
    cols = min(n, 3)

    # squeeze=False always returns a 2-D array of Axes, so the single-plot
    # case needs no special handling.
    _, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, cat in zip(axes, categories):
        prevalence = df.groupby(cat)[pred_col].mean()

        prevalence.plot(kind='bar', ax=ax)

        ax.set_title(f"Target prevalence by '{cat}' group")
        ax.set_ylabel("Proportion with target = 1")
        ax.set_xlabel(cat)

    # Hide the grid cells left over in the last row (if any)
    for spare_ax in axes[n:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Raw columns whose groups are compared against the income label
cats = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'hours-per-week',
]

plot_prevalence_subplots(chart_df, categories=cats)

## Recategorize the Categorical features into more meaningful groups

In [None]:
def category_remapping(df):
    """
    Return a copy of ``df`` in which the detailed categorical columns are
    replaced by coarser derived ones.

    Added columns: 'workclass-cat', 'education-cat', 'marital-cat',
    'occupation-cat', 'native_imm_cat' ('Native' for 'United-States',
    'Immigrant' for any other non-missing country, NaN when the country is
    missing) and 'hours_bin' (binned 'hours-per-week').
    Removed columns: 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'native-country', 'hours-per-week'.

    Values absent from a lookup table map to NaN. The input frame is not
    modified.
    """

    def expand(groups):
        # {'Group': [raw values]} -> {raw value: 'Group'}
        return {raw: label for label, raws in groups.items() for raw in raws}

    workclass_lookup = expand({
        'Government': ['State-gov', 'Local-gov', 'Federal-gov'],
        'Incorporated-Entrepreneur': ['Self-emp-inc'],
        'Unincorporated-Entrepreneur': ['Self-emp-not-inc'],
        'Unemployed': ['Without-pay', 'Never-worked'],
        'Private': ['Private'],
    })

    education_lookup = expand({
        'HS-dropout': ['Preschool', '1st-4th', '5th-6th', '7th-8th',
                       '9th', '10th', '11th', '12th'],
        'HS-grad': ['HS-grad'],
        'Some-college': ['Some-college', 'Assoc-acdm', 'Assoc-voc'],
        'Bachelors': ['Bachelors'],
        'Advanced-degree': ['Masters', 'Prof-school', 'Doctorate'],
    })

    marital_lookup = expand({
        'Single/Unmarried': ['Never-married', 'Divorced', 'Separated',
                             'Widowed', 'Married-spouse-absent'],
        'Married': ['Married-civ-spouse', 'Married-AF-spouse'],
    })

    occupation_lookup = expand({
        'White-collar': ['Exec-managerial', 'Prof-specialty', 'Tech-support'],
        'Service': ['Other-service', 'Sales', 'Adm-clerical', 'Protective-serv'],
        'Blue-collar': ['Craft-repair', 'Transport-moving', 'Machine-op-inspct'],
        'Military': ['Armed-Forces'],
        'Manual': ['Priv-house-serv', 'Farming-fishing', 'Handlers-cleaners'],
    })

    out = df.copy()

    # (insert position, new column, source column, lookup). Positions are
    # absolute and applied in this order, so each one already accounts for
    # the columns inserted before it.
    insert_plan = (
        (2, 'workclass-cat', 'workclass', workclass_lookup),
        (4, "education-cat", 'education', education_lookup),
        (7, 'marital-cat', 'marital-status', marital_lookup),
        (10, 'occupation-cat', 'occupation', occupation_lookup),
    )
    for position, new_col, source_col, lookup in insert_plan:
        out.insert(position, new_col, out[source_col].map(lookup))

    # US -> Native, any other known country -> Immigrant, missing stays missing
    country = out['native-country']
    origin = country.map({'United-States': 'Native'}).fillna('Immigrant')
    origin = origin.where(country.notna(), pd.NA)
    origin.replace({pd.NA: np.nan}, inplace=True)
    out.insert(18, 'native_imm_cat', origin)

    # Bin weekly hours; pd.cut intervals are right-closed by default
    out['hours_bin'] = pd.cut(
        out['hours-per-week'],
        bins=[0, 30, 40, 60, 100],
        labels=['Part-Time', 'Underworked', 'Full-Time+', 'Overworked'],
    )

    # Drop the source columns now represented by derived categories
    # (plus 'fnlwgt' and 'education-num')
    superseded = [
        'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'native-country', 'hours-per-week',
    ]
    return out.drop(columns=superseded)

In [None]:
# Swap the raw categorical columns for their grouped versions in both splits
train_df, test_df = category_remapping(train_df), category_remapping(test_df)

# Rebuild the plotting copy with the label recoded as 1 ('>50K') / 0
chart_df = train_df.assign(pred=np.where(train_df['pred'] == '>50K', 1, 0))

In [None]:
# Derived columns whose groups are compared against the income label
cats = [
    'workclass-cat',
    'education-cat',
    'marital-cat',
    'occupation-cat',
    'native_imm_cat',
    'hours_bin',
]

plot_prevalence_subplots(chart_df, categories=cats)

## Missing Values

In [None]:
# Number of missing values per column in the training split
train_df.isna().sum()

In [None]:
# Number of missing values per column in the test split
test_df.isna().sum()

In [None]:
# Columns that have missing values (the same set is used for both splits;
# columns_test is an independent copy)
columns = {'workclass-cat', 'occupation-cat', 'native_imm_cat'}
columns_test = set(columns)

In [None]:
# Show the distinct values of each listed column that contains NaN
for col in columns:
    values = train_df[col]
    if not values.isna().any():
        continue
    print(f"\n{col}")
    print(values.unique())

In [None]:
##### TRAIN
# Imputation: fill each listed column with its most frequent value (mode)
train_df_mode = train_df.copy()

for col in columns:
    most_frequent = train_df_mode[col].mode()[0]
    train_df_mode[col] = train_df_mode[col].fillna(most_frequent)

# Remaining missing values per column after imputation
train_df_mode.isna().sum()

In [None]:
#### TEST
# Imputation: fill each listed column with the mode learned from the TRAINING
# split. Estimating the fill value from the test split itself would leak
# test-set statistics into preprocessing and could make the two splits
# disagree on the imputed category.
test_df_mode = test_df.copy()

for col in columns_test:
    # mode() ignores NaN, so the un-imputed training frame gives the same value
    train_mode = train_df[col].mode()[0]
    test_df_mode[col] = test_df_mode[col].fillna(train_mode)

# Remaining missing values per column after imputation
test_df_mode.isna().sum()

## Correlation Matrix

In [None]:
# Correlation among the numeric features only (no one-hot encoding here)

# Select numeric columns only, from the mode-imputed training frame
numeric_df = train_df_mode.select_dtypes(include=['int32', 'int64', 'float64'])

corr_matrix = numeric_df.corr()

# Annotated heatmap of the numeric-feature correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix of Numeric Features")
plt.show()

In [None]:
# Correlation across all features after one-hot encoding

# One-hot encode the categorical features (first level of each dropped) and
# append the label as a 0/1 'target' column for the correlation study
encoded_df = pd.get_dummies(
    train_df_mode.drop(columns='pred'), drop_first=True
).assign(target=train_df_mode['pred'].eq('>50K').astype('int64'))

corr = encoded_df.corr()

plt.figure(figsize=(16, 12))
sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Full Correlation Matrix (After One-Hot Encoding)")
plt.show()

In [None]:
# Mode-imputed training data plus a binary 'target' column (1 for '>50K')
binary_corr_df = train_df_mode.assign(
    target=train_df_mode['pred'].eq('>50K').astype('int64')
)

# One-hot encode every feature; the numeric 'target' column is kept as is
encoded = pd.get_dummies(binary_corr_df.drop(columns='pred'), drop_first=True)

# Correlation of each encoded column with the target, strongest positive first
corr_target = encoded.corr()['target'].sort_values(ascending=False)

corr_target

In [None]:
# Single-column annotated heatmap of each encoded feature's correlation with the target
plt.figure(figsize=(6,10))
sns.heatmap(corr_target.to_frame(), annot=True, cmap='coolwarm')
plt.title("Correlation of Each Feature with Target")
plt.show()

In [None]:
# Drop any one-hot columns derived from the raw 'native-country' feature.
# NOTE(review): category_remapping() replaces 'native-country' with
# 'native_imm_cat', so this prefix may no longer match any column - confirm.
encoded_filtered = encoded.loc[:, ~encoded.columns.str.startswith("native-country_")]

# Drop the numeric columns and the target; names that are not present are skipped
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                'capital-loss', 'target']
encoded_filtered = encoded_filtered.drop(columns=numeric_cols, errors='ignore')

# Correlation of the remaining columns with the target (sorted descending);
# the target is joined back in only for this computation
corr_target_only = (
    encoded_filtered
    .join(binary_corr_df['target'])
    .corr()['target']
    .sort_values(ascending=False)
)


In [None]:
# Annotated heatmap of the categorical (one-hot) columns' correlation with the
# target; the colour scale is fixed to the range [-0.4, 0.45]
plt.figure(figsize=(6, 20))
sns.heatmap(corr_target_only.to_frame(),
            annot=True,
            cmap='coolwarm',
            vmin=-0.4,
            vmax=0.45)
plt.title("Categorical Feature Correlations With Target")
plt.show()

In [None]:
# Full pairwise correlation matrix among the remaining one-hot categorical
# columns (encoded_filtered no longer contains the numeric columns or the target)
cat_corr_matrix = encoded_filtered.corr()

In [None]:
# Heatmap of the categorical correlation matrix, colour scale centred on zero
plt.figure(figsize=(22, 20))
sns.heatmap(cat_corr_matrix, cmap='coolwarm', center=0)
plt.title("Correlation Matrix of Categorical Features")
plt.show()

## Normalization --- Jieun's Part

The `capital-gain` and `capital-loss` variables are extremely right-skewed: most values are zero and a few are very large outliers. Standard normalization (such as Min–Max or Z-score scaling) does not work well on such distributions, so a log transform is applied instead.

In [None]:
# Display the mode-imputed training frame
train_df_mode

In [None]:
# Log transformation (best for heavy right-skew), applied to the mode-imputed
# training frame. The original referenced an undefined name `df_mode`, which
# raises NameError; the training frame built earlier is `train_df_mode`.
log_train_df = train_df_mode.copy()

log_train_df['capital_gain_log'] = np.log1p(log_train_df['capital-gain'])
log_train_df['capital_loss_log'] = np.log1p(log_train_df['capital-loss'])

# log1p(x) = log(1 + x), so zeros are handled safely.
# Compresses extreme values.
# Spreads out dense low-value regions.

In [None]:
## Same log1p transform for the testing data

log_test_df = test_df_mode.assign(
    capital_gain_log=np.log1p(test_df_mode['capital-gain']),
    capital_loss_log=np.log1p(test_df_mode['capital-loss']),
)

# Range of the transformed gain column: test min/max, then train min/max
print(
    log_test_df['capital_gain_log'].min(),
    log_test_df['capital_gain_log'].max(),
    log_train_df['capital_gain_log'].min(),
    log_train_df['capital_gain_log'].max(),
    sep="\n",
)

In [None]:
# Side-by-side histograms (5 bins each) of the log-transformed capital columns
# in the training frame

# Left panel: capital gain
plt.subplot(1, 2, 1)
plt.hist(log_train_df['capital_gain_log'], bins=5)
plt.title("Capital_gain_log")
plt.xlabel("capital_gain_log")
plt.ylabel("Count")

# Right panel: capital loss
plt.subplot(1, 2, 2)
plt.hist(log_train_df['capital_loss_log'], bins=5)
plt.title("Capital_loss_log")
plt.xlabel("capital_loss_log")
plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Display the training frame with the log-transformed capital columns
log_train_df

In [None]:
# Features: every column except the label and the raw (untransformed) capital
# columns, whose log versions are kept instead
X_train = log_train_df.drop(columns=['pred', 'capital-gain', 'capital-loss'])
X_test = log_test_df.drop(columns=['pred', 'capital-gain', 'capital-loss'])

# Labels
Y_train = log_train_df['pred']
Y_test = log_test_df['pred']

In [None]:

# Names of the categorical (object / category dtype) feature columns
cat_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)

# Sanity checks: both splits should list the same feature columns
print(list(X_train.columns))
print(list(X_test.columns))
print(train_df['hours_bin'].value_counts())

In [None]:
from imblearn.pipeline import Pipeline

# Resampling strategies tried in front of each classifier
imbalances = {
    "none": "passthrough",
    "under": RandomUnderSampler(random_state=42),
    "over": RandomOverSampler(random_state=42),
}

# One-hot encode the categorical columns; all other columns pass through
preprocess = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)],
    remainder="passthrough",
)

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=5000, solver="lbfgs"),
    "Bagged Decision Tree": BalancedBaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        n_estimators=50,
        sampling_strategy='auto',
        replacement=False,
        random_state=42,
        n_jobs=1,
    ),
    "Bagged Random Forest": BalancedBaggingClassifier(
        estimator=RandomForestClassifier(random_state=42),
        n_estimators=50,
        sampling_strategy='auto',
        replacement=False,
        random_state=42,
        n_jobs=1,
    ),
}

# Fit and evaluate every (resampling strategy, model) combination
for imbalance_methods, method in imbalances.items():
    for model_name, clf in models.items():
        is_bagged = model_name in ["Bagged Decision Tree", "Bagged Random Forest"]

        # The bagged models are evaluated only once, under "none", and never
        # get a separate resampling step (presumably because
        # BalancedBaggingClassifier balances its own bootstrap samples).
        if is_bagged and imbalance_methods != "none":
            continue
        print(f"\nImbalance Method: {imbalance_methods} \nModel: {model_name} ")

        steps = [("encode", preprocess)]
        if not is_bagged:
            steps.append(("imbalances", method))
        steps.append(("model", clf))
        pipe = Pipeline(steps=steps)

        pipe.fit(X_train, Y_train)
        y_pred = pipe.predict(X_test)

        print("test label distribution:", np.unique(Y_test, return_counts=True))
        print("predicted label distribution:", np.unique(y_pred, return_counts=True))
        print(classification_report(Y_test, y_pred))
        
        
        
            