## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import math
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn. metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline

### Read data into DFs

In [None]:
# Column names, in file order: 14 features followed by the income label.
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'pred',  # label (<=50K / >50K)
]

# Both files are read without a header row, so the names are supplied
# explicitly; index_col=False keeps pandas from using the first column as index.
read_options = {'header': None, 'names': headers, 'index_col': False}

original_train_df = pd.read_csv("census-income.data.csv", **read_options)
original_test_df = pd.read_csv("census-income.test.csv", **read_options)

# Leave the raw frames untouched; all later cleaning is done on these copies.
train_df, test_df = original_train_df.copy(), original_test_df.copy()

train_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
#strip whitespace

def data_to_str(df):
    """
    Strip leading/trailing whitespace from the strings in all object columns.

    Only ``str`` values are modified. Anything else stored in an object column
    (numbers, ``None``/NaN, ...) is kept as it is; the plain ``.str.strip()``
    accessor would silently replace such entries with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    str_cols = df.select_dtypes(include=['object']).columns
    for col in str_cols:
        df[col] = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
    return df

# Clean the string columns of both splits (train first, then test).
train_df, test_df = (data_to_str(frame) for frame in (train_df, test_df))

In [None]:
# Remove trailing period if present, so the test labels use the same spelling
# as the values compared against later in the notebook (e.g. '>50K').
# NOTE(review): presumably the raw test file writes labels as '>50K.' -- confirm.
test_df['pred'] = test_df['pred'].str.rstrip('.')

In [None]:
# Turn every "?" cell into a real missing value (NaN) in both splits.
for frame in (train_df, test_df):
    frame.replace("?", np.nan, inplace=True)

# Sanity check: number of missing values in each training column.
train_df.isna().sum()

In [None]:
# Number of missing values in each column of the test split.
test_df.isna().sum()

In [None]:
# Report the (rows, columns) shape of each split before any rows are dropped.
for message, frame in (
    ("Train shape before dropping NaN:", train_df),
    ("Test shape before dropping NaN:", test_df),
):
    print(message, frame.shape)

In [None]:
# Drop rows with any NaN
#train_df = train_df.dropna()
#test_df = test_df.dropna()

# After dropping
#print("Train shape after dropping NaN:", train_df.shape)
#print("Test shape after dropping NaN:", test_df.shape)

In [None]:
#convert numeric columns to integer types

def data_to_int32(df):
    """
    Cast the six numeric census columns to ``int32``.

    The cast uses a plain (non-nullable) integer dtype, so it raises if one of
    the columns holds a value that cannot be represented as an integer (NaN
    included).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed below. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for name in (
        'age',
        'fnlwgt',
        'education-num',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
    ):
        df[name] = df[name].astype('int32')
    return df

# Cast the numeric columns of both splits, then show the resulting dtypes.
train_df, test_df = (data_to_int32(frame) for frame in (train_df, test_df))

train_df.dtypes

In [None]:
# Count fully duplicated rows in each split.
for message, frame in (
    ("Train duplicates:", train_df),
    ("Test duplicates:", test_df),
):
    print(message, frame.duplicated().sum())

# Only the training split is de-duplicated. drop_duplicates() keeps the
# original row labels, so train_df's index is no longer contiguous afterwards.
train_df = train_df.drop_duplicates()
#test_df = test_df.drop_duplicates()

print("Train shape after dedup:", train_df.shape)
#print("Test shape after dedup:", test_df.shape)

## Exploratory Data Analysis

In [None]:
# Columns to plot: one panel per column, laid out on a 3-column grid.
bar_charts = [
    'sex', 'race', 'workclass', 'marital-status', 'occupation',
    'relationship', 'education', 'education-num', 'native-country',
]

n = len(bar_charts)
cols = 3
rows = math.ceil(n / cols)  # enough rows to hold every panel

fig, axes = plt.subplots(rows, cols, figsize=(12, 4 * rows))
axes = axes.flatten()

for ax, column in zip(axes, bar_charts):
    values = train_df[column]

    if not pd.api.types.is_numeric_dtype(values):
        # Categorical column: one bar per category, missing values included.
        counts = values.value_counts(dropna=False)
        positions = range(len(counts))
        ax.bar(positions, counts.values)
        ax.set_xticks(positions)
        ax.set_xticklabels(counts.index.astype(str), rotation=90)
    else:
        # Numeric column: a histogram of the non-missing values.
        ax.hist(values.dropna(), bins=20)

    ax.set(title=f"Distribution of {column}", xlabel=column, ylabel="Count")

# Remove grid cells that did not receive a chart.
for spare_ax in axes[n:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
hist_charts = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Fixed 3 x 2 grid: one panel per numeric column.
rows, cols = 3, 2

fig, axes = plt.subplots(rows, cols, figsize=(10, 4 * rows))
axes = axes.flatten()

for ax, column in zip(axes, hist_charts):
    # Missing values are dropped before plotting.
    ax.hist(train_df[column].dropna(), bins=20, edgecolor='black')
    ax.set(title=f"Histogram of {column}", xlabel=column, ylabel="Count")

# Remove grid cells that did not receive a histogram.
for spare_ax in axes[len(hist_charts):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

## Recategorize the Categorical features into more meaningful groups

In [None]:
# Collapse the detailed categories into broader groups. In every mapping a
# missing value (NaN) is sent to its own 'Unknown' group.
# NOTE(review): only train_df is recategorized in this cell; the test split
# needs the same transformation before it can be scored -- confirm it is done
# elsewhere.
workclass_mapping = {
    'State-gov':'Government',
    'Local-gov':'Government',
    'Federal-gov':'Government',
    'Self-emp-inc':'Entrepreneur',
    'Self-emp-not-inc':'Entrepreneur',
    'Without-pay':'Unemployed',
    'Never-worked':'Unemployed',
    'Private':'Private',
    np.nan:'Unknown'
}

# Each derived column is inserted next to the column it is derived from.
train_df.insert(2,'workclass-cat',train_df['workclass'].map(workclass_mapping))

edu_mapping = {
    'Preschool':'HS-dropout',
    '1st-4th':'HS-dropout',
    '5th-6th':'HS-dropout',
    '7th-8th':'HS-dropout',
    '9th':'HS-dropout',
    '10th':'HS-dropout',
    '11th':'HS-dropout',
    '12th':'HS-dropout',
    'HS-grad':'HS-grad',
    'Some-college':'Some-college',
    'Assoc-acdm':'Some-college',
    'Assoc-voc':'Some-college',
    'Bachelors':'Bachelors',
    'Masters':'Advanced-degree',
    'Prof-school':'Advanced-degree',
    'Doctorate':'Advanced-degree',
    np.nan:'Unknown'
}

train_df.insert(4,"education-cat",train_df['education'].map(edu_mapping))

marital_mapping = {
    'Never-married':'Single/Unmarried',
    'Divorced':'Single/Unmarried',
    'Separated':'Single/Unmarried',
    'Widowed':'Single/Unmarried',
    'Married-spouse-absent':'Single/Unmarried',
    'Married-civ-spouse':'Married',
    'Married-AF-spouse':'Married',
    np.nan:'Unknown'
}

train_df.insert(7,'marital-cat',train_df['marital-status'].map(marital_mapping))

occupation_mapping = {
    'Exec-managerial':'White-collar',
    'Prof-specialty':'White-collar',
    'Tech-support':'White-collar',

    'Other-service':'Service',
    'Sales':'Service',
    'Adm-clerical':'Service',
    'Protective-serv':'Service',

    'Craft-repair':'Blue-collar',
    'Transport-moving':'Blue-collar',
    'Machine-op-inspct':'Blue-collar',

    'Armed-Forces':'Military',

    'Priv-house-serv':'Manual',
    'Farming-fishing':'Manual',
    'Handlers-cleaners':'Manual',

    np.nan:'Unknown'
}

train_df.insert(10,'occupation-cat',train_df['occupation'].map(occupation_mapping))

# Native vs. immigrant flag. The np.where result is inserted as a plain
# positional array: wrapping it in a fresh pd.Series would give it a 0..n-1
# index, and insert() aligns a Series on index labels. Those labels no longer
# match train_df once duplicates have been dropped, so rows would receive
# shifted values or NaN.
# NOTE(review): a missing native-country compares unequal to 'United-States'
# and is therefore labelled 'Immigrant', unlike the 'Unknown' group used by the
# mappings above -- confirm this is intended.
train_df.insert(
    18,
    'native_imm_cat',
    np.where(train_df['native-country'] == 'United-States', 'Native', 'Immigrant'),
)

# Drop the original categorical columns in favor of their derived groups;
# the numeric fnlwgt and education-num columns are removed here as well.
train_df.drop(['workclass','fnlwgt','education','education-num','marital-status','occupation','native-country'], axis = 1, inplace = True)

In [None]:
# Display the training frame after recategorization.
train_df

## One Hot Encoding and Correlation Matrix

In [None]:
# Correlation matrix of the numeric columns only (no encoding involved here).
numeric_df = train_df.select_dtypes(include=['int32', 'int64', 'float64'])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix of Numeric Features")
plt.show()

In [None]:
# One-hot encode every feature column; the first level of each categorical
# column is dropped (drop_first=True).
feature_frame = train_df.drop(columns=['pred'])
encoded_df = pd.get_dummies(feature_frame, drop_first=True)

# Binary label for the correlation study: 1 for '>50K', 0 for anything else.
encoded_df['target'] = (train_df['pred'] == '>50K').astype('int64')

corr = encoded_df.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Full Correlation Matrix (After One-Hot Encoding)")
plt.show()

In [None]:
# Flatten the matrix into a Series indexed by (column, column) pairs. Every
# pair shows up twice, once per ordering, because the matrix is symmetric.
corr_pairs = corr.unstack()

# Rank by correlation strength (absolute value), strongest first, and leave
# out entries at or above 0.999 -- this removes each column's correlation
# with itself.
by_strength = corr_pairs.abs().sort_values(ascending=False)
corr_pairs_sorted = by_strength[by_strength.lt(0.999)]

corr_pairs_sorted

In [None]:
# Binary label stored on train_df itself: 1 for '>50K', 0 for anything else.
# NOTE: from here on train_df carries the label twice ('pred' and 'target');
# any feature matrix built from it later has to exclude both columns.
train_df['target'] = train_df['pred'].eq('>50K').astype('int64')

# One-hot encode everything except the raw label; 'target' stays in the frame.
features_and_target = train_df.drop(columns=['pred'])
encoded = pd.get_dummies(features_and_target, drop_first=True)

# Correlation of every encoded column with the label, highest first.
corr_target = encoded.corr()['target'].sort_values(ascending=False)

corr_target

In [None]:
# Single-column heatmap of each encoded feature's correlation with the label.
fig, ax = plt.subplots(figsize=(6, 10))
sns.heatmap(corr_target.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation of Each Feature with Target")
plt.show()

In [None]:
# Columns left out of the categorical-only view: the numeric features and the
# label. Names in this list that are absent from `encoded` are simply ignored.
numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                'capital-loss', 'hours-per-week', 'target']

# Keep every encoded column that is neither a native-country dummy nor one of
# the names listed above.
kept_columns = [
    col for col in encoded.columns
    if not col.startswith("native-country_") and col not in numeric_cols
]
encoded_filtered = encoded[kept_columns]

# Correlation of each remaining column with the label, sorted descending.
with_target = encoded_filtered.join(train_df['target'])
corr_target_only = with_target.corr()['target'].sort_values(ascending=False)


In [None]:
# Heatmap of the categorical dummies' correlation with the label; the color
# scale is pinned to [-0.4, 0.45].
fig, ax = plt.subplots(figsize=(6, 20))
sns.heatmap(
    corr_target_only.to_frame(),
    annot=True,
    cmap='coolwarm',
    vmin=-0.4,
    vmax=0.45,
    ax=ax,
)
ax.set_title("Categorical Feature Correlations With Target")
plt.show()

In [None]:
# Full correlation matrix among remaining categorical columns
# (pairwise correlations between the columns kept in encoded_filtered).
cat_corr_matrix = encoded_filtered.corr()

In [None]:
# Heatmap of the categorical correlation matrix, color scale centered on 0.
fig, ax = plt.subplots(figsize=(22, 20))
sns.heatmap(cat_corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Correlation Matrix of Categorical Features")
plt.show()

## Normalization --- Jieun's Part

The `capital-gain` and `capital-loss` variables are extremely right-skewed: most values are zero and a few are very large outliers. Standard normalization (such as Min–Max or z-score scaling) will therefore not work well.

In [None]:
# Option 1: log transformation (suited to heavy right skew).
# log1p(x) = log(1 + x) is defined at zero, compresses the extreme values and
# spreads out the dense low-value region. The result is a new frame: train_df
# plus the two log columns appended at the end.
log_train_df = train_df.assign(
    capital_gain_log=np.log1p(train_df['capital-gain']),
    capital_loss_log=np.log1p(train_df['capital-loss']),
)

In [None]:
# Side-by-side histograms of the two log-transformed columns:
# (column name, panel title) for the left and right panel.
panels = (
    ('capital_gain_log', "Capital_gain_log"),
    ('capital_loss_log', "Capital_loss_log"),
)

for position, (column, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(log_train_df[column], bins=50)
    plt.title(heading)
    plt.xlabel(column)
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Display the frame with the log-transformed columns added.
log_train_df

In [None]:
# Option 2: binning. Bin edges are right-inclusive, so the first interval
# (-1, 0] holds exactly the zero values.
gain_bins = [-1, 0, 5000, 15000, 30000, np.inf]
gain_labels = ['no_gain', 'low_gain', 'medium_gain', 'high_gain', 'very_high_gain']

loss_bins = [-1, 0, 1000, 2000, np.inf]
loss_labels = ['no_loss', 'low_loss', 'medium_loss', 'high_loss']

# New frame: train_df without the raw capital columns, with the two binned
# columns appended at the end.
bin_train_df = train_df.drop(columns=['capital-gain', 'capital-loss']).assign(
    capital_gain_bin=pd.cut(train_df['capital-gain'], bins=gain_bins, labels=gain_labels),
    capital_loss_bin=pd.cut(train_df['capital-loss'], bins=loss_bins, labels=loss_labels),
)

for position, bin_column in enumerate(('capital_gain_bin', 'capital_loss_bin')):
    bin_counts = bin_train_df[bin_column].value_counts()
    if position == 0:
        print(bin_counts, "\n\n")
    else:
        print(bin_counts)

In [None]:
# Bar chart of rows per capital-gain bin, in bin order (explicit 8x4 figure).
gain_bin_counts = bin_train_df['capital_gain_bin'].value_counts().sort_index()
plt.figure(figsize=(8, 4))
gain_bin_counts.plot.bar()
plt.title("Capital Gain Bin Counts")
plt.xlabel("Capital Gain Category")
plt.ylabel("Count")
plt.show()

# Same chart for the capital-loss bins (drawn on a default-sized figure).
loss_bin_counts = bin_train_df['capital_loss_bin'].value_counts().sort_index()
loss_bin_counts.plot.bar()
plt.title("Capital Loss Bin Counts")
plt.xlabel("Capital Loss Category")
plt.ylabel("Count")
plt.show()

In [None]:
# Display the frame with the binned capital-gain / capital-loss columns.
bin_train_df

In [None]:
# Option 3: binary indicators -- 1 when the amount is above zero, else 0.
# New frame: train_df without the raw capital columns, with the two
# indicator columns appended at the end.
binary_train_df = train_df.drop(columns=['capital-gain', 'capital-loss']).assign(
    has_capital_gain=train_df['capital-gain'].gt(0).astype(int),
    has_capital_loss=train_df['capital-loss'].gt(0).astype(int),
)

In [None]:
# One bar chart per indicator column: number of rows with value 0 and 1.
for indicator, caption in (
    ('has_capital_gain', 'Capital_gain'),
    ('has_capital_loss', 'Capital_loss'),
):
    indicator_counts = binary_train_df[indicator].value_counts().sort_index()
    indicator_counts.plot(kind='bar', figsize=(6,4), rot=0)
    plt.title(caption)
    plt.xlabel(caption)
    plt.ylabel('Count')
    plt.show()

In [None]:
# Display the frame with the binary capital-gain / capital-loss indicators.
binary_train_df

## Shelsy's Part

In [None]:
# Display the training frame used as input to the resampling cells below.
train_df

In [None]:
#undersampling
# Features are every column except the label. train_df carries the label
# twice -- the raw 'pred' string and the 0/1 'target' column added for the
# correlation study -- so both are excluded; dropping only 'pred' would leak
# the label into X_train. errors='ignore' keeps the cell working when 'target'
# has not been created.
X_train = train_df.drop(columns=['pred', 'target'], errors='ignore')
Y_train = train_df['pred']

# Randomly discard majority-class rows until both classes have equal counts.
under = RandomUnderSampler(random_state=42)
X_under, Y_under = under.fit_resample(X_train, Y_train)

# Class counts before and after resampling.
print ('before:' , Y_train.value_counts())
print ('after:' , Y_under.value_counts())

In [None]:
#oversampling
# Randomly duplicate minority-class rows of the X_train / Y_train split.
over = RandomOverSampler(random_state=42)
X_over, Y_over = over.fit_resample(X_train, Y_train)

# Class counts before and after resampling.
for tag, labels in (('before:', Y_train), ('after:', Y_over)):
    print(tag, labels.value_counts())

In [None]:
# Summary statistics of the raw capital-gain column.
train_df['capital-gain'].describe()