In [10]:
# Output locations of the pickled, preprocessed train/test frames (written by the
# final cell of this notebook). Both paths are relative to the working directory.
# NOTE(review): "trina" in the train filename looks like a typo for "train"; it is
# left unchanged because other code may already read this exact path — confirm
# before renaming.
PROC_TRAIN_DATA_PATH = "../data/processed/1__preprocessed_df_trina.pkl"
PROC_TEST_DATA_PATH = "../data/processed/1__preprocessed_df_test.pkl"

# Import packages

In [11]:
import pandas as pd

# Data reading

In [12]:
# Load the raw train and test CSV files with pandas' default parsing options.
df_train = pd.read_csv("../data/raw/train.csv")
df_test = pd.read_csv("../data/raw/test.csv")

# Remove duplicates

In [13]:

def remove_duplicates(data:pd.DataFrame) -> pd.DataFrame:
    """
    Drop fully duplicated rows, keeping the first occurrence of each

    Params:
    -------
    data:pd.DataFrame passing data to be processed

    Returns:
    --------
    pd.DataFrame a new frame without the repeated rows; `data` is not modified
    """
    # keep="first" is the pandas default, spelled out here for the reader
    return data.drop_duplicates(keep="first")

# Keep the de-duplicated frame under df_train: every later cell reads df_train,
# so storing the result under any other name would leave the duplicates in place.
df_train = remove_duplicates(df_train)

# Fill missing values

In [14]:
# Change missing values in some features to corresponding values given in data description file
def fill_cat_missing_values(data:pd.DataFrame) -> pd.DataFrame:
    """
    Filling the missing values in categorical variables with explicit labels

    Each column listed in the mapping below has its missing entries replaced by
    the label mapped to it. Listed columns that are absent from `data` are
    skipped, and columns that are not listed are left untouched.

    Params:
    -------
    data:pd.DataFrame passing data to be processed

    Returns:
    --------
    pd.DataFrame a new frame with the fills applied; `data` is not modified
    """
    cat_fill_missing_vals = {
        'PoolQC': 'No Pool',
        'Fence': 'No Fence',
        'MiscFeature': 'None',
        'Alley':'No alley access',
        'FireplaceQu' : 'No Fireplace',
        'GarageFinish' : 'No Garage',
        'GarageCond' : 'No Garage',
        'GarageQual' : 'No Garage',
        'GarageType' : 'No Garage',
        'BsmtExposure' : 'No Basement',
        'BsmtFinType2' : 'No Basement',
        'BsmtQual' : 'No Basement',
        'BsmtCond' : 'No Basement',
        'BsmtFinType1' : 'No Basement',
        'MasVnrType' : 'None',
    }

    # DataFrame.fillna with a dict fills column by column and ignores keys that are
    # not columns of the frame, so a frame lacking some of these features does not
    # raise. It returns a new object, hence no explicit copy of the input is needed.
    return data.fillna(value=cat_fill_missing_vals)


def fill_num_missing_values(data:pd.DataFrame) -> pd.DataFrame:
    """
    Filling the missing values in numerical variables

    Missing MasVnrArea entries are replaced by the mean MasVnrArea of the rows
    sharing the same MasVnrType. Values that are already present are never
    altered: a row whose MasVnrType is missing keeps its MasVnrArea as is, and a
    missing area stays missing when its group has no known area to average.

    Params:
    -------
    data:pd.DataFrame passing data to be processed

    Returns:
    --------
    pd.DataFrame a copy of `data` with MasVnrArea imputed; `data` is not modified

    Raises:
    -------
    KeyError if `data` has no MasVnrType or MasVnrArea column
    """

    data = data.copy()

    # Per-row mean of the row's MasVnrType group, aligned to the frame's index.
    # Rows with a missing MasVnrType belong to no group and get NaN here.
    group_means = data.groupby(by='MasVnrType')['MasVnrArea'].transform('mean')

    # Fill only the gaps. Assigning the transformed column back directly would
    # overwrite known areas with NaN on the rows that belong to no group.
    data['MasVnrArea'] = data['MasVnrArea'].fillna(group_means)

    return data
    
# Training frame: categorical labels first, so that rows whose MasVnrType was missing
# belong to a labelled group before MasVnrArea is imputed per MasVnrType.
df_train = fill_cat_missing_values(df_train)
df_train = fill_num_missing_values(df_train)

# The categorical fill is a fixed label mapping, so it is applied to the test frame as
# well; this keeps the category labels of the two saved frames consistent.
# NOTE(review): MasVnrArea is not imputed for the test frame here — decide whether it
# should be filled from training-set statistics.
df_test = fill_cat_missing_values(df_test)

# Remove variables and rows with missing values

In [15]:
def remove_vars(data:pd.DataFrame) -> pd.DataFrame:
    """
    Remove the LotFrontage and GarageYrBlt variables from data

    Whichever of the two columns is present gets dropped; a frame that has
    neither is returned unchanged (as a new object), never as None.

    Params:
    -------
    data:pd.DataFrame passing data to be processed

    Returns:
    --------
    pd.DataFrame a new frame without the two columns; `data` is not modified
    """

    # errors='ignore' skips labels that are not in the frame, so each column is
    # handled independently of the other.
    return data.drop(columns=['LotFrontage','GarageYrBlt'], errors='ignore')

def remove_values(data:pd.DataFrame) -> pd.DataFrame:
    """
    Remove the rows whose Electrical value is missing

    A frame without an Electrical column has nothing to remove and is returned
    unchanged (as a copy), never as None.

    Params:
    -------
    data:pd.DataFrame passing data to be processed

    Returns:
    --------
    pd.DataFrame a new frame without the affected rows; `data` is not modified
    """

    if 'Electrical' not in data.columns:
        return data.copy()

    # dropna selects rows by position, so only the rows that actually lack a
    # value are removed even if index labels repeat.
    return data.dropna(subset=['Electrical'])

# Training frame: remove the LotFrontage/GarageYrBlt columns, then the rows that
# have no Electrical value.
df_train = remove_vars(df_train)
df_train = remove_values(df_train)

# Test frame: only the columns are removed; remove_values is not applied to it,
# so none of its rows are dropped here.
df_test = remove_vars(df_test)

# Remove outliers

In [16]:
def detect_outliers_manual(data:pd.DataFrame, thresholds:"dict[str, float] | None"=None) -> pd.Index:
    """
    Find the rows in which at least one monitored feature exceeds its upper limit

    Params:
    -------
    data:pd.DataFrame passing data to be processed
    thresholds:dict optional mapping of column name -> upper limit; a row is flagged
        when its value is strictly greater than the limit. When omitted, the manually
        chosen limits listed below are used.

    Returns:
    --------
    pd.Index labels of the flagged rows, in the order they appear in `data`

    Raises:
    -------
    KeyError if a column named in the thresholds is missing from `data`
    """
    default_thresholds = {
        'LotArea': 100_000,
        'BsmtFinSF1': 5_000,
        'BsmtFinSF2': 1_400,
        'TotalBsmtSF': 6_000,
        'GrLivArea': 4_000,
        'GarageArea': 1_300,
        'WoodDeckSF': 800,
        'EnclosedPorch': 500,
        'SalePrice': 700_000,
    }
    limits = default_thresholds if thresholds is None else thresholds

    # Start with no row flagged and OR in one comparison per monitored column.
    is_outlier = pd.Series(False, index=data.index)
    for col, upper_limit in limits.items():
        # A comparison against NaN is False, so a missing value never flags a row.
        is_outlier = is_outlier | (data[col] > upper_limit)

    return data.index[is_outlier.to_numpy()]

def remove_outliers(data:pd.DataFrame, indexes:pd.Index) -> pd.DataFrame:
    """
    Remove outliers for the given indexes

    Params:
    -------
    data:pd.DataFrame passing data to be processed
    indexes:pd.Index row labels to drop (DataFrame.drop also accepts any other
        list-like of labels)

    Returns:
    --------
    pd.DataFrame a new frame without the given rows; `data` is not modified

    Raises:
    -------
    KeyError if a label is not found in the index of `data`
    """
    return data.drop(index=indexes)


# Locate the manually thresholded outlier rows first, then drop them from the
# training frame.
outlier_indexes = detect_outliers_manual(df_train)
df_train = remove_outliers(df_train, indexes=outlier_indexes)

# Save data

In [17]:
# Save the processed data
# Both frames are pickled to the PROC_* paths defined at the top of the notebook.
# NOTE(review): the target directory is assumed to exist already — to_pickle is not
# expected to create missing folders; confirm on a fresh checkout.
df_train.to_pickle(PROC_TRAIN_DATA_PATH)
df_test.to_pickle(PROC_TEST_DATA_PATH)