In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np

In [None]:
# Load the Dataset
# Read the raw income CSV into `data` and report its (rows, columns) dimensions
raw_csv_path = "../data/income.csv"
data = pd.read_csv(raw_csv_path)
print("Initial shape:", data.shape)

Initial shape: (48842, 15)


In [None]:
# Select Only Relevant Features for the Income Prediction Model
# Ten predictor columns plus the "income" target; every other column is dropped
selected_features = [
    "age",
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "gender",
    "hours-per-week",
    "capital-gain",
    "capital-loss",
    "income",
]
data = data[selected_features]

In [None]:
# Remove Duplicate Entries
# Flag every row that repeats an earlier row across all remaining columns,
# then keep only the first occurrence of each
is_repeat = data.duplicated()
data = data[~is_repeat]
print("Shape after removing duplicates:", data.shape)

Shape after removing duplicates: (40437, 11)


In [None]:
# Handle Missing and Categorical Data
# Replace placeholder values (?) with proper NaN values
data = data.replace("?", np.nan)

# Define categorical columns for processing
categorical_cols = ["workclass", "education", "marital-status",
                    "occupation", "relationship",
                    "gender"]

# Fill missing categorical values with "Unknown" instead of dropping rows
# (one vectorized assignment covers all categorical columns)
data[categorical_cols] = data[categorical_cols].fillna("Unknown")

# Convert income target variable to binary format (1 for >50K, 0 for <=50K).
# An explicit mapping is used so that an unrecognized or missing label raises
# instead of being silently counted as 0. The already-encoded values "0"/"1"
# map to themselves, which keeps this cell safe to re-run.
income_codes = {"<=50K": 0, ">50K": 1, "0": 0, "1": 1}
# Strip surrounding whitespace and tolerate a trailing period (e.g. ">50K.")
income_labels = data["income"].astype(str).str.strip().str.rstrip(".")
income_encoded = income_labels.map(income_codes)
unmapped = income_encoded.isna()
if unmapped.any():
    unexpected = sorted(income_labels[unmapped].unique())
    raise ValueError(f"Unexpected income labels: {unexpected}")
data["income"] = income_encoded.astype("int64")

# Verify no missing values remain in the dataset
print("Missing values after handling:\n", data.isnull().sum())

Missing values after handling:
 age               0
workclass         0
education         0
marital-status    0
occupation        0
relationship      0
gender            0
hours-per-week    0
capital-gain      0
capital-loss      0
income            0
dtype: int64
 age               0
workclass         0
education         0
marital-status    0
occupation        0
relationship      0
gender            0
hours-per-week    0
capital-gain      0
capital-loss      0
income            0
dtype: int64


In [None]:
# Handle Outliers in Numeric Features
# Numeric columns that receive outlier treatment, in the order they are filtered
numeric_cols = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Define function to remove outliers using IQR method
def remove_outliers_iqr(df, col, k=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``df[col]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column the fences are computed on.
    k : float, default 1.5
        Fence multiplier. Larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` within the fences, with their original index labels.

    Raises
    ------
    ValueError
        If ``k`` is negative.

    Notes
    -----
    * Rows where ``col`` is NaN are dropped, because NaN fails the range test.
    * When Q1 equals Q3 the IQR is 0 and both fences collapse onto that single
      value, so every row holding any other value is removed.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    # Calculate first and third quartiles in one call
    q1, q3 = df[col].quantile([0.25, 0.75])
    # Width of the fence margin on each side
    margin = k * (q3 - q1)
    # between() is inclusive on both ends by default
    in_range = df[col].between(q1 - margin, q3 + margin)
    # Return only rows within the acceptable range
    return df[in_range]

# Apply outlier removal to each numeric column.
# The filter is sequential: `data` is reassigned on every pass, so each
# column's quartiles are computed on the rows that survived earlier columns.
for col in numeric_cols:
    rows_before = len(data)
    data = remove_outliers_iqr(data, col)
    removed = rows_before - len(data)
    print(f"{col}: removed {removed} outliers")

age: removed 183 outliers
capital-gain: removed 3965 outliers
capital-loss: removed 2248 outliers
hours-per-week: removed 6769 outliers


In [None]:
# Verify Final Dataset
# Report the final dimensions, then preview the first rows as a sanity check
final_shape = data.shape
print("\nFinal dataset shape:", final_shape)
preview = data.head()
print(preview)


Final dataset shape: (27272, 11)
   age  workclass     education      marital-status         occupation  \
0   25    Private          11th       Never-married  Machine-op-inspct   
1   38    Private       HS-grad  Married-civ-spouse    Farming-fishing   
2   28  Local-gov    Assoc-acdm  Married-civ-spouse    Protective-serv   
4   18    Unknown  Some-college       Never-married            Unknown   
5   34    Private          10th       Never-married      Other-service   

    relationship  gender  hours-per-week  capital-gain  capital-loss  income  
0      Own-child    Male              40             0             0       0  
1        Husband    Male              50             0             0       0  
2        Husband    Male              40             0             0       1  
4      Own-child  Female              30             0             0       0  
5  Not-in-family    Male              30             0             0       0  


In [None]:
# Save Preprocessed Dataset
# index=False keeps the DataFrame index out of the written CSV
cleaned_csv_path = "../data/income_cleaned.csv"
data.to_csv(cleaned_csv_path, index=False)
print("\n✅ Cleaned dataset saved as '../data/income_cleaned.csv'")


✅ Cleaned dataset saved as '../data/income_cleaned.csv'
