In [47]:
# Raw training CSV; read into raw_df_train with pd.read_csv.
TRAIN_DATA_PATH = "../data/raw/train.csv"
# Destination pickle for the processed training frame (written with df_train.to_pickle).
PROC_TRAIN_DATA_PATH = "../data/interim/1__analytics_preprocessed_df.pkl"

# Import packages

In [48]:
import pandas as pd
import numpy as np

In [49]:
# Raise the pandas display limits (rows, columns, column width, total width)
# and set the Styler float precision, all from a single mapping.
_display_options = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.max_colwidth': 0,
    'display.width': 1000,
    "styler.format.precision": 10,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [50]:
# Load the raw training CSV from TRAIN_DATA_PATH into a DataFrame.
raw_df_train = pd.read_csv(TRAIN_DATA_PATH)

# Functions

In [51]:
def split_features_by_type(data:pd.DataFrame):
    """
    Split the columns of a frame into categorical and numerical features

    Columns with ``object`` or ``category`` dtype are reported as categorical,
    columns with a numeric dtype as numerical. Columns matching neither
    selection are omitted from both results.

    Params:
    -------
    * data: DataFrame

    Return:
    -------
    categorical_feats: pd.Index with the names of the categorical columns
    numerical_feats: pd.Index with the names of the numerical columns
    """
    # 'category' has to be listed explicitly: columns cast with
    # astype('category') are neither 'object' nor numeric and would otherwise
    # be missing from both results.
    categorical_feats = data.select_dtypes(include=['object', 'category']).columns
    numerical_feats = data.select_dtypes(include=np.number).columns

    return categorical_feats, numerical_feats

# Changing data type
- After investigating the data, some features turn out to hold numeric values even though they are really **categorical features**.
- **OverallQual** & **OverallCond** will be edited to corresponding string values.
- The following numeric-coded features are converted to the pandas `category` dtype:
    `MSSubClass`,
    `BsmtFullBath`,
    `BsmtHalfBath`,
    `FullBath`,
    `HalfBath`,
    `BedroomAbvGr`,
    `KitchenAbvGr`,
    `TotRmsAbvGrd`,
    `Fireplaces`,
    `GarageCars`,
    `3SsnPorch`,
    `MiscVal`,
    `MoSold`,
    `YrSold`,
    `OverallQual`,
    `OverallCond`

In [52]:
def num2str(data:pd.DataFrame) -> pd.DataFrame:
    """
    Change numeric to string values in OverallQual and OverallCond features

    The ratings 1-10 are replaced by their text labels; values that are not
    keys of the mapping are left unchanged. The frame passed in is not
    modified: the replacement is done on a copy, which is returned.

    Params:
    --------
    * data: pd.DataFrame the data frame that should be processed

    Return:
    --------
    proc_data: pd.DataFrame the processed data frame (a new object)

    """
    num2str_values = {
        10: "Very Excellent",
        9: "Excellent",
        8: "Very Good",
        7: "Good",
        6: "Above Average",
        5: "Average",
        4: "Below Average",
        3: "Fair",
        2: "Poor",
        1: "Very Poor",
    }

    # Copy first so the caller's (raw) frame keeps its numeric ratings.
    proc_data = data.copy()
    for rating_col in ('OverallQual', 'OverallCond'):
        proc_data[rating_col] = proc_data[rating_col].replace(num2str_values)

    return proc_data

def to_object(data:pd.DataFrame) -> pd.DataFrame:
    """
    Cast the numeric-coded categorical features to the pandas 'category' dtype

    Despite the function name, the listed columns end up with 'category'
    dtype, not 'object'. All other columns are passed through untouched.
    The frame passed in is not modified; a new frame is returned.

    Params:
    --------
    * data: pd.DataFrame the data frame that should be processed

    Return:
    --------
    proc_data: pd.DataFrame the processed data frame (a new object)

    Raises:
    --------
    KeyError if any of the listed columns is missing from data

    """
    # Features that hold numbers but are treated as categorical
    numbered_cat_feats = [
        'MSSubClass',
        'BsmtFullBath',
        'BsmtHalfBath',
        'FullBath',
        'HalfBath',
        'BedroomAbvGr',
        'KitchenAbvGr',
        'TotRmsAbvGrd',
        'Fireplaces',
        'GarageCars',
        '3SsnPorch',
        'MiscVal',
        'MoSold',
        'YrSold',
        'OverallQual',
        'OverallCond'
    ]

    # astype with a per-column mapping returns a new frame, so the caller's
    # frame keeps its original dtypes.
    proc_data = data.astype({feat: 'category' for feat in numbered_cat_feats})

    return proc_data

def edit_df(data:pd.DataFrame) -> pd.DataFrame:
    """
    Apply the complete type clean-up to a data frame

    Runs num2str, then to_object, and finally renames the Fence column
    to FenceQual.

    Params:
    --------
    * data: pd.DataFrame the data frame that should be processed

    Return:
    --------
    proc_data: pd.DataFrame the processed data frame

    """
    return (
        data
        .pipe(num2str)
        .pipe(to_object)
        # Change Fence name -> FenceQual
        .rename(columns={"Fence": "FenceQual"})
    )

# Hand edit_df a copy so raw_df_train keeps the values exactly as read from
# the CSV, whether or not the processing helpers modify the frame they receive.
df_train = edit_df(raw_df_train.copy())

categorical_feats, numerical_feats = split_features_by_type(df_train)

# Visually verify result

In [53]:
# sample(1).index holds an index *label*, so the row is selected with .loc.
# .iloc would treat the label as a row position, which only matches for a
# default RangeIndex.
arbitrary_sample_index = df_train.sample(1).index
print(df_train.loc[arbitrary_sample_index, ['OverallQual','OverallCond']])

     OverallQual OverallCond
1236  Good        Average   


In [54]:
# Persist the processed training frame to PROC_TRAIN_DATA_PATH.
# NOTE(review): pickle files must only be loaded from trusted locations.
df_train.to_pickle(PROC_TRAIN_DATA_PATH)