In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
class Preprocessor:
    """Prepare a customer DataFrame for modelling.

    The instance works on a private copy of the frame passed in, so the
    caller's DataFrame is never modified.  Each step mutates ``self.df``;
    ``transform`` runs the steps in the intended order and returns the result.
    """

    def __init__(self, df):
        """Store a copy of ``df`` and set default options.

        Args:
            df: Source DataFrame.  ``income_transformer`` and ``drop_columns``
                expect the columns ``'Annual Income (k$)'`` and
                ``'Spending Score (1-100)'`` (plus ``'Gender'`` when
                categoricals are kept).
        """
        self.df = df.copy()
        # Initialised here so drop_columns() is usable on its own; transform()
        # overwrites it with the value the caller passes.
        self.keep_categorical = False

    def handle_missing_values(self):
        """Replace every missing value in the frame with 0."""
        # Rebind instead of inplace=True so we never write into a frame that
        # may still be linked to an earlier selection.
        self.df = self.df.fillna(0)

    def drop_outlier(self):
        """Drop rows lying more than 1.5 * IQR outside the quartiles.

        Numeric columns are processed one after another; the quartiles of each
        column are computed on the rows that survived the previous columns.
        """
        numerical_columns = self.df.select_dtypes(include=[np.number]).columns
        for col in numerical_columns:
            values = self.df[col]
            q25, q75 = np.percentile(values, [25, 75])
            whisker = (q75 - q25) * 1.5
            is_outlier = (values > q75 + whisker) | (values < q25 - whisker)
            self.df = self.df[~is_outlier]

    def income_transformer(self):
        """Add a ``'Log Income'`` column holding log of ``'Annual Income (k$)'``.

        Annual Income is skewed right so log transformation helps improve
        model performance.
        """
        self.df['Log Income'] = np.log(self.df['Annual Income (k$)'])

    def standardization(self):
        """Standardize every numeric column with ``StandardScaler``.

        The scaler is fit on the current contents of ``self.df``; non-numeric
        columns are left untouched.
        """
        numerical_columns = self.df.select_dtypes(include=[np.number]).columns
        scaler = StandardScaler()
        self.df[numerical_columns] = scaler.fit_transform(self.df[numerical_columns])

    def drop_columns(self):
        """Keep only the modelling columns.

        ``'Log Income'`` and ``'Spending Score (1-100)'`` are always kept;
        ``'Gender'`` is kept as well when ``self.keep_categorical`` is true.
        """
        kept = ['Log Income', 'Spending Score (1-100)']
        if self.keep_categorical:
            kept.append('Gender')
        # Explicit copy: later steps assign into self.df, which must not be a
        # selection tied to the previous frame.
        self.df = self.df[kept].copy()

    def transform(self, keep_categorical=False):
        """Run the full preprocessing pipeline and return the resulting frame.

        Args:
            keep_categorical: When true, the ``'Gender'`` column is retained
                alongside the numeric features.

        Returns:
            The processed DataFrame (the same object as ``self.df``).
        """
        self.keep_categorical = keep_categorical
        self.income_transformer()
        self.drop_columns()
        self.handle_missing_values()
        # Outlier removal is currently disabled.
        # self.drop_outlier()
        self.standardization()
        return self.df
