In [101]:
# Import Required Libraries
import pandas as pd
import numpy as np

In [102]:
# Load the Dataset
data = pd.read_csv("../data/income.csv")
print("Initial shape:", data.shape)

Initial shape: (48842, 15)


In [103]:
selected_features = ["age","workclass","education","marital-status","occupation","relationship","gender","hours-per-week","capital-gain","capital-loss","income"]
data = data[selected_features]

In [104]:
# Remove Duplicate Entries
data = data.drop_duplicates()
print("Shape after removing duplicates:", data.shape)

Shape after removing duplicates: (40437, 11)


In [None]:
# Handle Outliers in Numeric Features
numeric_cols_iqr = ["age", "hours-per-week"]   # Apply IQR here
skewed_cols = ["capital-gain", "capital-loss"] # Transform instead of remove

# Function: remove outliers with IQR
def remove_outliers_iqr(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[col] >= lower) & (df[col] <= upper)]

# Apply IQR only on selected numeric cols
for col in numeric_cols_iqr:
    before = data.shape[0]
    data = remove_outliers_iqr(data, col)
    after = data.shape[0]
    print(f"{col}: removed {before - after} outliers")

# Apply log transformation to skewed features
import numpy as np
for col in skewed_cols:
    data[col] = np.log1p(data[col])   # log(1 + x) keeps 0 as 0
    print(f"{col}: applied log transformation")

age: removed 183 outliers
hours-per-week: removed 8970 outliers
capital-gain: applied log transformation
capital-loss: applied log transformation


In [106]:
# Handle Outliers in Numeric Features
# Define numeric columns that need outlier treatment
numeric_cols = ["age","capital-gain", "capital-loss", "hours-per-week"]

# Define function to remove outliers using IQR method
def remove_outliers_iqr(df, col):
    # Calculate first and third quartiles
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    # Calculate interquartile range
    IQR = Q3 - Q1
    # Define bounds for outlier detection
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Return only rows within the acceptable range
    return df[(df[col] >= lower) & (df[col] <= upper)]

# Apply outlier removal to each numeric column
for col in numeric_cols:
    before = data.shape[0]
    data = remove_outliers_iqr(data, col)
    after = data.shape[0]
    print(f"{col}: removed {before - after} outliers")

age: removed 75 outliers
capital-gain: removed 3168 outliers
capital-loss: removed 1806 outliers
hours-per-week: removed 2736 outliers


In [107]:
import pandas as pd

def preprocess_income_data(df):
    # --------------------
    # Workclass
    # --------------------
    workclass_map = {
        'Private': 'Private',
        'Self-emp-not-inc': 'Private',
        'Self-emp-inc': 'Private',
        'Federal-gov': 'Government',
        'State-gov': 'Government',
        'Local-gov': 'Government',
        'Without-pay': 'Other/Unemployed',
        'Never-worked': 'Other/Unemployed',
        '?': 'Other/Unemployed',
        'Unknown': 'Unknown'
    }
    df['workclass'] = df['workclass'].map(workclass_map)

    # --------------------
    # Education
    # --------------------
    education_map = {
    'Preschool': 'School',
    '1st-4th': 'School',
    '5th-6th': 'School',
    '7th-8th': 'School',
    '9th': 'School',
    '10th': 'School',
    '11th': 'School',
    '12th': 'School',

    'HS-grad': 'High School',

    'Some-college': 'Undergraduate',
    'Assoc-acdm': 'Undergraduate',
    'Assoc-voc': 'Undergraduate',
    'Bachelors': 'Undergraduate',

    'Masters': 'Postgraduate',
    'Doctorate': 'Postgraduate',
    'Prof-school': 'Postgraduate'
}

    df['education'] = df['education'].map(education_map)

    # --------------------
    # Marital Status
    # --------------------
    marital_map = {
        'Never-married': 'Single',
        'Married-civ-spouse': 'Married',
        'Married-AF-spouse': 'Married',
        'Married-spouse-absent': 'Married',
        'Divorced': 'Previously married',
        'Separated': 'Previously married',
        'Widowed': 'Previously married'
    }
    df['marital-status'] = df['marital-status'].map(marital_map)

    # --------------------
    # Occupation
    # --------------------
    occupation_map = {
        'Craft-repair': 'Blue collar',
        'Transport-moving': 'Blue collar',
        'Handlers-cleaners': 'Blue collar',
        'Farming-fishing': 'Blue collar',
        'Machine-op-inspct': 'Blue collar',
        'Other-service': 'Service',
        'Priv-house-serv': 'Service',
        'Protective-serv': 'Service',
        'Exec-managerial': 'White collar',
        'Prof-specialty': 'White collar',
        'Sales': 'White collar',
        'Adm-clerical': 'White collar',
        'Tech-support': 'White collar',
        'Armed-Forces': 'Military',
        '?': 'Unknown',
        'Unknown': 'Unknown'
    }
    df['occupation'] = df['occupation'].map(occupation_map)

    # --------------------
    # Relationship
    # --------------------
    relationship_map = {
        'Husband': 'Partnered',
        'Wife': 'Partnered',
        'Own-child': 'Child',
        'Other-relative': 'Other family',
        'Not-in-family': 'Independent',
        'Unmarried': 'Independent'
    }
    df['relationship'] = df['relationship'].map(relationship_map)

    return df


In [108]:
# Keep only selected features
data = data[selected_features]

# Preprocess categorical features
data = preprocess_income_data(data)

# Check cleaned categories
print(data['workclass'].value_counts())
print(data['education'].value_counts())
print(data['occupation'].value_counts())


workclass
Private             18400
Government           4149
Other/Unemployed      950
Name: count, dtype: int64
education
Undergraduate    10921
High School       7297
School            3481
Postgraduate      1800
Name: count, dtype: int64
occupation
White collar    12106
Blue collar      7508
Service          2930
Unknown           944
Military           11
Name: count, dtype: int64


In [109]:
# Verify Final Dataset
print("\nFinal dataset shape:", data.shape)
print(data.head())


Final dataset shape: (23499, 11)
   age         workclass      education marital-status   occupation  \
0   25           Private         School         Single  Blue collar   
1   38           Private    High School        Married  Blue collar   
2   28        Government  Undergraduate        Married      Service   
6   29  Other/Unemployed    High School         Single      Unknown   
8   24           Private  Undergraduate         Single      Service   

  relationship  gender  hours-per-week  capital-gain  capital-loss income  
0        Child    Male              40           0.0           0.0  <=50K  
1    Partnered    Male              50           0.0           0.0  <=50K  
2    Partnered    Male              40           0.0           0.0   >50K  
6  Independent    Male              40           0.0           0.0  <=50K  
8  Independent  Female              40           0.0           0.0  <=50K  


In [110]:
# Save Preprocessed Dataset
data.to_csv("../data/income_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as '../data/income_cleaned.csv'")


✅ Cleaned dataset saved as '../data/income_cleaned.csv'
