In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd

In [4]:
def preprocess_data_with_target(df, target):
    """
    Preprocess a dataset that contains a target column.

    All work happens on a copy, so the DataFrame passed in by the caller
    is left exactly as it was. The steps are:
      - Fill missing values: float columns with the column mean, every
        other column with its most frequent value (mode). The target
        column is imputed the same way as any other column.
      - Separate features (X) and target (y).
      - Label-encode object/category feature columns.
      - Label-encode the target when it is object/category typed.
      - Standardize every numeric (non-boolean) feature column. This is
        done after encoding, so the integer codes of encoded columns are
        standardized together with the originally numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to preprocess. Not modified.
    target : str
        The name of the target column.

    Returns
    -------
    X : pandas.DataFrame
        Processed feature matrix (all columns of ``df`` except ``target``).
    y : pandas.Series or numpy.ndarray
        The target column. A Series when the target was left as-is, an
        integer ndarray when it was label-encoded.
    encoders : dict
        Fitted LabelEncoders keyed by column name; includes an entry for
        ``target`` when the target was encoded.
    scaler : StandardScaler
        Scaler fitted on the numeric feature columns (left unfitted when
        there are no numeric feature columns).

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """

    # Fail before doing any work (the later drop() would raise KeyError too).
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")

    print(f"First 5 Rows of Data before transformation: \n{df.head()}\n********************* \n")

    # Copy first: every assignment below would otherwise write into the
    # caller's DataFrame and make re-running the cell non-idempotent.
    data = df.copy()

    # Handle Missing Values
    for col in data.columns:
        column = data[col]
        if not column.isna().any():
            continue  # nothing to fill
        if pd.api.types.is_float_dtype(column.dtype):
            data[col] = column.fillna(column.mean())
        else:
            modes = column.mode()
            # mode() is empty when the column has no non-null value at all;
            # in that case there is nothing sensible to fill with.
            if not modes.empty:
                data[col] = column.fillna(modes.iloc[0])

    # Separate Features and Target
    X = data.drop(columns=[target])
    y = data[target]

    # Encode Categorical Columns
    encoders = {}
    for col in X.select_dtypes(include=['object', 'category']).columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        encoders[col] = le

    # Encode target if categorical
    if y.dtype == 'object' or isinstance(y.dtype, pd.CategoricalDtype):
        le_target = LabelEncoder()
        y = le_target.fit_transform(y)
        encoders[target] = le_target

    # Standardize Numerical Features
    # Any numeric width is accepted (not only int64/float64); booleans are
    # deliberately left out so flag columns are not turned into floats.
    scaler = StandardScaler()
    numeric_cols = [
        col for col in X.columns
        if pd.api.types.is_numeric_dtype(X[col].dtype)
        and not pd.api.types.is_bool_dtype(X[col].dtype)
    ]
    if numeric_cols:
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    print(f"\n********************* \nFirst 5 Rows of X after transformation: \n{X.head()}\n*********************")
    print(f"First 5 Rows of y after transformation: \n{y[:5]}\n********************* \n")

    return X, y, encoders, scaler


In [5]:
# Read project_adult.csv into the DataFrame `pa`
pa = pd.read_csv("project_adult.csv")

# Run the target-aware preprocessing with "income" as the label column and
# keep the returned encoders and scaler next to the features/target.
# NOTE(review): the printed head shows an "Unnamed: 0" column - presumably a
# saved row index that ends up being used as a feature; confirm it is wanted.
pa_X, pa_y, pa_encoders, pa_scaler = preprocess_data_with_target(pa, "income")

First 5 Rows of Data before transformation: 
   Unnamed: 0  age         workclass  fnlwgt     education  education-num  \
0        5514   33         Local-gov  198183     Bachelors             13   
1       19777   36           Private   86459     Assoc-voc             11   
2       10781   58  Self-emp-not-inc  203039           9th              5   
3       32240   21           Private  180190     Assoc-voc             11   
4        9876   27           Private  279872  Some-college             10   

       marital-status       occupation   relationship   race     sex  \
0       Never-married   Prof-specialty  Not-in-family  White  Female   
1  Married-civ-spouse  Exec-managerial        Husband  White    Male   
2           Separated     Craft-repair  Not-in-family  White    Male   
3  Married-civ-spouse  Farming-fishing        Husband  White    Male   
4            Divorced    Other-service  Not-in-family  White    Male   

   capital-gain  capital-loss  hours-per-week native-countr

In [6]:
def preprocess_data_no_target(df, encoders=None, scaler=None):
    """
    Preprocess a dataset that has no target column.

    All work happens on a copy, so the DataFrame passed in by the caller
    is left exactly as it was. The steps are:
      - Fill missing values: float columns with the column mean, every
        other column with its most frequent value (mode). These
        statistics always come from ``df`` itself.
      - Label-encode object/category columns.
      - Standardize every numeric (non-boolean) column. This is done
        after encoding, so the integer codes of encoded columns are
        standardized together with the originally numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to preprocess. Not modified.
    encoders : dict, optional
        Already-fitted LabelEncoders keyed by column name (for example
        the dict returned for the training data). A categorical column
        with an entry here is transformed with that encoder instead of
        fitting a new one; keys that match no categorical column are
        ignored. The dict itself is not modified. Default ``None`` fits a
        new encoder for every categorical column.
    scaler : StandardScaler, optional
        Already-fitted scaler to apply with ``transform`` instead of
        fitting a new one. Default ``None`` fits a new scaler on ``df``.

    Returns
    -------
    X : pandas.DataFrame
        Processed feature matrix (a new DataFrame).
    encoders : dict
        The LabelEncoder used for each categorical column of ``df``.
    scaler : StandardScaler
        The scaler that was applied (the supplied one, or a new one
        fitted on the numeric columns; left unfitted when there are no
        numeric columns).

    Notes
    -----
    Errors raised by a supplied encoder or scaler (for instance on labels
    or columns it was not fitted on) are propagated unchanged.
    """

    print(f"First 5 Rows of Data before transformation: \n{df.head()}\n********************* \n")

    # Copy first: every assignment below would otherwise overwrite the
    # caller's DataFrame and make re-running the cell non-idempotent.
    data = df.copy()

    # ---- Handle Missing Values ----
    for col in data.columns:
        column = data[col]
        if not column.isna().any():
            continue  # nothing to fill
        if pd.api.types.is_float_dtype(column.dtype):
            data[col] = column.fillna(column.mean())
        else:
            modes = column.mode()
            # mode() is empty when the column has no non-null value at all;
            # in that case there is nothing sensible to fill with.
            if not modes.empty:
                data[col] = column.fillna(modes.iloc[0])

    # ---- Encode Categorical Columns ----
    supplied_encoders = encoders if encoders is not None else {}
    used_encoders = {}
    for col in data.select_dtypes(include=['object', 'category']).columns:
        le = supplied_encoders.get(col)
        if le is None:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
        else:
            # Reuse the caller's mapping so codes match the data it was fitted on.
            data[col] = le.transform(data[col])
        used_encoders[col] = le

    # ---- Standardize Numerical Features ----
    # Any numeric width is accepted (not only int64/float64); booleans are
    # deliberately left out so flag columns are not turned into floats.
    numeric_cols = [
        col for col in data.columns
        if pd.api.types.is_numeric_dtype(data[col].dtype)
        and not pd.api.types.is_bool_dtype(data[col].dtype)
    ]
    if scaler is None:
        scaler = StandardScaler()
        if numeric_cols:
            data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    elif numeric_cols:
        data[numeric_cols] = scaler.transform(data[numeric_cols])

    print(f"\n********************* \nFirst 5 Rows of Data after transformation: \n{data.head()}\n*********************")

    return data, used_encoders, scaler


In [7]:
# Read project_validation_inputs.csv into the DataFrame `pv`
pv = pd.read_csv("project_validation_inputs.csv")

# Run the target-free preprocessing on pv.
# NOTE(review): this call fits new encoders and a new scaler on pv rather than
# reusing pa_encoders / pa_scaler from the labelled data - confirm that the
# two feature matrices are meant to be encoded and scaled independently.
pv_X, pv_encoders, pv_scaler = preprocess_data_no_target(pv)

First 5 Rows of Data before transformation: 
   Unnamed: 0  age         workclass  fnlwgt     education  education-num  \
0       14160   27           Private  160178  Some-college             10   
1       27048   45         State-gov   50567       HS-grad              9   
2       28868   29           Private  185908     Bachelors             13   
3        5667   30           Private  190040     Bachelors             13   
4        7827   29  Self-emp-not-inc  189346  Some-college             10   

       marital-status         occupation   relationship   race     sex  \
0            Divorced       Adm-clerical  Not-in-family  White  Female   
1  Married-civ-spouse    Exec-managerial           Wife  White  Female   
2  Married-civ-spouse    Exec-managerial        Husband  Black    Male   
3       Never-married  Machine-op-inspct  Not-in-family  White  Female   
4            Divorced       Craft-repair  Not-in-family  White    Male   

   capital-gain  capital-loss  hours-per-week n