In [1]:
# Colab-only setup: attach Google Drive to the runtime's filesystem so the
# dataset path used later in the notebook can be read.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [7]:
# Training CSV, stored on the mounted Google Drive.
train_path = (
    "/content/drive/MyDrive/Colab Notebooks/ds_projects/train.csv"
)

In [8]:
# Read the full training set into memory.
train_data = pd.read_csv(filepath_or_buffer=train_path)

In [9]:
# Peek at the first two rows to check that the file loaded as expected.
train_data.head(n=2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [10]:
# Split the column names by dtype: object columns are treated as categorical,
# float64/int64 columns as numerical.
categorical_columns = train_data.select_dtypes(include="object").columns.tolist()
numerical_columns = train_data.select_dtypes(include=["float64", "int64"]).columns.tolist()

In [11]:
# Show the columns that were detected as categorical (object dtype).
print(categorical_columns)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [12]:
# Show the columns that were detected as numerical (float64/int64 dtype).
print(numerical_columns)

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


In [13]:
# Keep only real predictors: "SalePrice" is the target and "Id" is a row
# identifier, so neither belongs in the feature list.
# Filtering (instead of list.remove) keeps this cell idempotent: running it a
# second time is a no-op rather than a ValueError.
non_feature_columns = {"SalePrice", "Id"}
numerical_columns = [
    column for column in numerical_columns if column not in non_feature_columns
]

In [14]:
# Preview the first five numerical feature names.
numerical_columns[:5]

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']

In [15]:
# Feature matrix: numerical columns first, then categorical columns.
X = train_data[[*numerical_columns, *categorical_columns]]
# Target vector.
y = train_data["SalePrice"]

In [16]:
# Hold out 10% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [17]:
# Numerical features: replace missing values with the column mean.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean", missing_values=np.nan)),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the legacy one so the cell runs on either side of that change.
try:
    dense_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    dense_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical features: replace missing values with the most frequent category,
# then one-hot encode. With handle_unknown="ignore", a category that was not
# seen during fit is encoded as all zeros at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
        ("encoder", dense_encoder),
    ]
)

# Send each group of columns through its own sub-pipeline.
imputer_pipeline = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_columns),
        ("cat", categorical_pipeline, categorical_columns),
    ]
)

final_pipeline = Pipeline(
    steps=[("preprocessor", imputer_pipeline)]
)

# Fit the preprocessing on the training split only.
# NOTE: this rebinds X_train to the transformed output, so re-run the
# train/validation split cell before executing this cell again.
X_train = final_pipeline.fit_transform(X_train)



In [18]:
# Wrap the transformed training data in a DataFrame labelled with the
# feature names generated by the fitted pipeline.
X_train = pd.DataFrame(
    data=X_train,
    columns=final_pipeline.get_feature_names_out(),
)

In [19]:
# Apply the already-fitted preprocessing to the validation split (transform
# only, no refit) and label the result with the pipeline's feature names.
X_val = pd.DataFrame(
    data=final_pipeline.transform(X_val),
    columns=final_pipeline.get_feature_names_out(),
)

In [20]:
# Both splits should have the same number of columns after preprocessing.
print(f"{X_train.shape} {X_val.shape}")

(1314, 287) (146, 287)


### Do the original categorical columns still exist after encoding?

In [22]:
# Column names present both in the original categorical list and in the
# encoded training frame.
set(categorical_columns) & set(X_train.columns)

set()

In [24]:
# Inspect the first two rows of the preprocessed training frame.
X_train.head(n=2)

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,50.0,86.0,11500.0,7.0,7.0,1936.0,1987.0,0.0,223.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20.0,67.0,16285.0,7.0,5.0,2001.0,2002.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [25]:
# Count the missing values left in each training column after preprocessing.
X_train.isna().sum()

num__MSSubClass               0
num__LotFrontage              0
num__LotArea                  0
num__OverallQual              0
num__OverallCond              0
                             ..
cat__SaleCondition_AdjLand    0
cat__SaleCondition_Alloca     0
cat__SaleCondition_Family     0
cat__SaleCondition_Normal     0
cat__SaleCondition_Partial    0
Length: 287, dtype: int64

In [26]:
# Count the missing values left in each validation column after preprocessing.
X_val.isna().sum()

num__MSSubClass               0
num__LotFrontage              0
num__LotArea                  0
num__OverallQual              0
num__OverallCond              0
                             ..
cat__SaleCondition_AdjLand    0
cat__SaleCondition_Alloca     0
cat__SaleCondition_Family     0
cat__SaleCondition_Normal     0
cat__SaleCondition_Partial    0
Length: 287, dtype: int64