In [45]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [46]:
# Load the Dataset
data = pd.read_csv("../data/income.csv")
print("Initial shape:", data.shape)

Initial shape: (48842, 15)


In [47]:
# Remove Duplicate Entries
data = data.drop_duplicates()
print("Shape after removing duplicates:", data.shape)

Shape after removing duplicates: (48790, 15)


In [48]:
# Handle Missing and Categorical Data
# Replace placeholder values (?) with proper NaN values 
data = data.replace("?", np.nan)

# Define categorical columns for processing
categorical_cols = ["workclass", "education", "marital-status", 
                    "occupation", "relationship", "race", 
                    "gender", "native-country"]

# Fill missing categorical values with "Unknown" instead of dropping rows
for col in categorical_cols:
    data[col] = data[col].fillna("Unknown")

# Convert income target variable to binary format (1 for >50K, 0 for <=50K)
data["income"] = data["income"].astype(str).str.strip()
data["income"] = data["income"].apply(lambda x: 1 if x == ">50K" else 0)

# Verify no missing values remain in the dataset
print("Missing values after handling:\n", data.isnull().sum())

Missing values after handling:
 age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [49]:
# Handle Outliers in Numeric Features
# Define numeric columns that need outlier treatment
numeric_cols = ["age", "fnlwgt", "educational-num", 
                "capital-gain", "capital-loss", "hours-per-week"]

# Define function to remove outliers using IQR method
def remove_outliers_iqr(df, col):
    # Calculate first and third quartiles
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    # Calculate interquartile range
    IQR = Q3 - Q1
    # Define bounds for outlier detection
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Return only rows within the acceptable range
    return df[(df[col] >= lower) & (df[col] <= upper)]

# Apply outlier removal to each numeric column
for col in numeric_cols:
    before = data.shape[0]
    data = remove_outliers_iqr(data, col)
    after = data.shape[0]
    print(f"{col}: removed {before - after} outliers")

age: removed 215 outliers
fnlwgt: removed 1452 outliers
educational-num: removed 1663 outliers
capital-gain: removed 3795 outliers
capital-loss: removed 2173 outliers
hours-per-week: removed 10969 outliers


In [50]:
# Encode Categorical Variables
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
print("Shape after One-Hot Encoding:", data.shape)

Shape after One-Hot Encoding: (28523, 96)


In [51]:
# Standardize Numeric Features
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

In [52]:
# Verify Final Dataset
print("\nFinal dataset shape:", data.shape)
print(data.head())


Final dataset shape: (28523, 96)
        age    fnlwgt  educational-num  capital-gain  capital-loss  \
0 -1.081254  0.532649        -1.510762           0.0           0.0   
1 -0.004024 -1.041884        -0.579970           0.0           0.0   
2 -0.832663  1.798696         0.816217           0.0           0.0   
6 -0.749799  0.535223        -0.579970           0.0           0.0   
8 -1.164118  2.174732        -0.114575           0.0           0.0   

   hours-per-week  income  workclass_Local-gov  workclass_Never-worked  \
0       -0.371355       0                False                   False   
1        2.153655       0                False                   False   
2       -0.371355       1                 True                   False   
6       -0.371355       0                False                   False   
8       -0.371355       0                False                   False   

   workclass_Private  ...  native-country_Puerto-Rico  \
0               True  ...                  

In [53]:
# Save Preprocessed Dataset
data.to_csv("../data/income_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as '../data/income_cleaned.csv'")


✅ Cleaned dataset saved as '../data/income_cleaned.csv'
