In [36]:
import os
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


def preprocessing(file_name: str):
    """Load a house-prices CSV, split it, encode/scale it and persist the splits.

    Pipeline:
      1. Read ``file_name`` from
         ``../data/house-prices-advanced-regression-techniques``.
      2. Drop every column that has exactly one distinct non-null value.
      3. Separate the ``SalePrice`` target and split the rows 80/10/10 into
         train / validation / test (``random_state=42`` for both splits).
      4. One-hot encode object columns with exactly two distinct training
         values, count-encode (normalized) the remaining object columns, and
         standardize the int64/float64 columns.  Every encoder and scaler is
         fitted on the training split only and then applied to the others.
      5. Standardize the target with its own scaler, fitted on the training
         target only.
      6. Write the five resulting frames as parquet files into the same
         directory the CSV was read from.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the data directory (e.g. ``'train.csv'``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(Xtrain_encoded, ytrain_encoded, Xval_encoded, yval_encoded,
        Xtest_encoded)``.

    Raises
    ------
    KeyError
        If the CSV has no ``SalePrice`` column after constant columns are
        dropped.

    Notes
    -----
    * The target frames are rebuilt from NumPy arrays, so they carry a fresh
      ``RangeIndex`` and are saved with ``index=False``.  The feature frames
      are saved with ``index=True`` (presumably the encoders keep the original
      row labels -- confirm).  Match targets to feature rows by position, not
      by index label.
    * The test-split target is produced by the split but is neither scaled,
      saved nor returned; this keeps the 5-tuple interface unchanged.
    """
    data_dir = os.path.join('..', 'data', 'house-prices-advanced-regression-techniques')

    # Load the data set and discard columns that carry no information.
    df = pd.read_csv(os.path.join(data_dir, file_name))
    constant_cols = df.columns[df.nunique() == 1]
    df = df.drop(columns=constant_cols)

    if 'SalePrice' not in df.columns:
        raise KeyError(f"'SalePrice' column not found in {file_name!r}")

    # Separate target and features.
    target = df.loc[:, ['SalePrice']]
    features = df.drop(columns='SalePrice')

    # 80% train, then the remaining 20% is halved into validation and test.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    X_val, X_test, y_val, _y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Column groups are derived from the training split only.
    object_cols = X_train.select_dtypes(include=['object'])
    # binary categorical columns (exactly two distinct non-null values)
    bin_cols = object_cols.columns[object_cols.nunique() == 2].tolist()
    # the remaining categorical columns
    ord_cols = [col for col in object_cols.columns if col not in bin_cols]
    # numerical columns; selected by dtype, so they are already numeric and
    # need no further pd.to_numeric coercion
    num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

    # Encoders / scalers (all fitted on the training split below).
    feature_scaler = StandardScaler()
    count_encoder = ce.CountEncoder(cols=ord_cols, normalize=True, handle_unknown='value')
    onehot_encoder = ce.OneHotEncoder(cols=bin_cols, use_cat_names=False, handle_unknown='value')
    target_scaler = StandardScaler()

    # Standardize the price; wrapping the arrays resets the index to 0..n-1.
    ytrain_encoded = pd.DataFrame(target_scaler.fit_transform(y_train), columns=y_train.columns)
    yval_encoded = pd.DataFrame(target_scaler.transform(y_val), columns=y_val.columns)

    # One-hot encode the binary features, then count-encode the other
    # categorical features.
    Xtrain_encoded = count_encoder.fit_transform(onehot_encoder.fit_transform(X_train))
    Xval_encoded = count_encoder.transform(onehot_encoder.transform(X_val))
    Xtest_encoded = count_encoder.transform(onehot_encoder.transform(X_test))

    # Standardize the numerical features using training statistics.
    Xtrain_encoded[num_cols] = feature_scaler.fit_transform(Xtrain_encoded[num_cols])
    Xval_encoded[num_cols] = feature_scaler.transform(Xval_encoded[num_cols])
    Xtest_encoded[num_cols] = feature_scaler.transform(Xtest_encoded[num_cols])

    # Save the processed frames in parquet format next to the input CSV.
    # Feature frames keep their index; target frames do not.
    outputs = (
        ('Xtrain_encoded.parquet', Xtrain_encoded, True),
        ('ytrain_encoded.parquet', ytrain_encoded, False),
        ('Xval_encoded.parquet', Xval_encoded, True),
        ('yval_encoded.parquet', yval_encoded, False),
        ('Xtest_encoded.parquet', Xtest_encoded, True),
    )
    for out_name, frame, keep_index in outputs:
        frame.to_parquet(os.path.join(data_dir, out_name), index=keep_index)

    return Xtrain_encoded, ytrain_encoded, Xval_encoded, yval_encoded, Xtest_encoded


In [37]:
# Run the full pipeline on the training CSV; this also writes the parquet files.
(
    Xtrain_encoded,
    ytrain_encoded,
    Xval_encoded,
    yval_encoded,
    Xtest_encoded,
) = preprocessing('train.csv')

In [38]:
# Read the three feature splits back from disk for the round-trip checks.
Xtrain_processed_df = pd.read_parquet(
    path='../data/house-prices-advanced-regression-techniques/Xtrain_encoded.parquet'
)
Xval_processed_df = pd.read_parquet(
    path='../data/house-prices-advanced-regression-techniques/Xval_encoded.parquet'
)
Xtest_processed_df = pd.read_parquet(
    path='../data/house-prices-advanced-regression-techniques/Xtest_encoded.parquet'
)

In [39]:
# Read the scaled train/validation targets back from disk.
ytrain_processed_df = pd.read_parquet(
    path='../data/house-prices-advanced-regression-techniques/ytrain_encoded.parquet'
)
yval_processed_df = pd.read_parquet(
    path='../data/house-prices-advanced-regression-techniques/yval_encoded.parquet'
)

In [40]:
# Round-trip check: the in-memory training features must equal the parquet copy.
pd.testing.assert_frame_equal(left=Xtrain_encoded, right=Xtrain_processed_df)

In [41]:
# Round-trip check: the in-memory validation features must equal the parquet copy.
pd.testing.assert_frame_equal(left=Xval_encoded, right=Xval_processed_df)

In [42]:
# Round-trip check: the in-memory test features must equal the parquet copy.
pd.testing.assert_frame_equal(left=Xtest_encoded, right=Xtest_processed_df)

In [43]:
# Round-trip check: the in-memory scaled training target must equal the parquet copy.
pd.testing.assert_frame_equal(left=ytrain_encoded, right=ytrain_processed_df)

In [44]:
# Round-trip check: the in-memory scaled validation target must equal the parquet copy.
pd.testing.assert_frame_equal(left=yval_encoded, right=yval_processed_df)