# Data Cleaning
## Missing value imputation using Scikit-Learn
### Different strategy for different variables(Numerical & Categorical) with Scikit-Learn

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv(r"C:\Users\akash\jupyter notebook\ML_Indian_AI_Course\Data Cleaning\datasets\house-prices-advanced-regression-techniques\train.csv")
test = pd.read_csv(r"C:\Users\akash\jupyter notebook\ML_Indian_AI_Course\Data Cleaning\datasets\house-prices-advanced-regression-techniques\test.csv")
print("Shape of train df = ", train.shape)
print("Shape of test df = ", test.shape)

Shape of train df =  (1460, 81)
Shape of test df =  (1459, 80)


In [3]:
X_train = train.drop(columns='SalePrice', axis = 1)
y_train = train['SalePrice']
X_test = test.copy()
print("Shape of X_train df = ", X_train.shape)
print("Shape of y_train df = ", y_train.shape)
print("Shape of X_test df = ", X_test.shape)

Shape of X_train df =  (1460, 80)
Shape of y_train df =  (1460,)
Shape of X_test df =  (1459, 80)


# Missing value imputation

In [4]:
isnull_sum = X_train.isnull().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [5]:
# finding the numerical variable which have mising value
num_vars = X_train.select_dtypes(include=["int64", "float64"]).columns
num_vars_miss = [var for var in num_vars if isnull_sum[var]>0]

In [6]:
num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [7]:
# finding the categorical variable which have mising value
cat_vars = X_train.select_dtypes(include=["O"]).columns
cat_vars_miss = [var for var in cat_vars if isnull_sum[var]>0]
cat_vars_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [8]:
num_var_mean = ["LotFrontage"]
num_var_median = ['MasVnrArea', 'GarageYrBlt']
cat_vars_mode = ['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',]
cat_vars_missing = ['GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [9]:
num_var_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
num_var_median_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
cat_vars_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])
cat_vars_missing_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing"))])

In [10]:
preprocessor = ColumnTransformer(transformers=[("mean_imputer", num_var_mean_imputer, num_var_mean),
                                ("median_imputer", num_var_median_imputer, num_var_median),
                               ("mode_imputer", cat_vars_mode_imputer, cat_vars_mode),
                               ("missing_imputer", cat_vars_missing_imputer, cat_vars_missing)])

In [11]:
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_imputer',
                                 Pipe

In [12]:
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_im

In [13]:
preprocessor.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([70.04995837])

In [14]:
train['LotFrontage'].mean()

70.04995836802665

In [15]:
preprocessor.named_transformers_['median_imputer'].named_steps['imputer'].statistics_

array([   0., 1980.])

In [16]:
preprocessor.named_transformers_['mode_imputer'].named_steps['imputer'].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [17]:
preprocessor.named_transformers_['missing_imputer'].named_steps['imputer'].statistics_

array(['missing', 'missing', 'missing', 'missing', 'missing', 'missing',
       'missing'], dtype=object)

In [18]:
X_train_clean = preprocessor.transform(X_train)
X_test_clean = preprocessor.transform(X_test)

In [19]:
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'missing', 'missing', 'missing'],
       [80.0, 0.0, 1976.0, ..., 'missing', 'missing', 'missing'],
       [68.0, 162.0, 2001.0, ..., 'missing', 'missing', 'missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'missing', 'missing', 'missing'],
       [75.0, 0.0, 1965.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [20]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   

### In Above : remainder, drop index showing because, those are list of columns which are dropped due to contain no missing value. If we don't want to drop, we assign value 'passthrough' to remainder.

In [21]:
X_train_clean_miss_var = pd.DataFrame(X_train_clean, columns=num_var_mean + num_var_median + cat_vars_mode + cat_vars_missing)

In [22]:
X_train_clean_miss_var.head(10)

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,196.0,2003.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing
1,80.0,0.0,1976.0,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
2,68.0,162.0,2001.0,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
3,60.0,0.0,1998.0,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,missing,missing,missing
4,84.0,350.0,2000.0,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
5,85.0,0.0,1993.0,Grvl,,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,Unf,TA,TA,missing,MnPrv,Shed
6,75.0,186.0,2004.0,Grvl,Stone,Ex,TA,Av,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing
7,70.049958,240.0,1973.0,Grvl,Stone,Gd,TA,Mn,ALQ,BLQ,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,Shed
8,51.0,0.0,1931.0,Grvl,,TA,TA,No,Unf,Unf,FuseF,TA,Detchd,Unf,Fa,TA,missing,missing,missing
9,50.0,0.0,1939.0,Grvl,,TA,TA,No,GLQ,Unf,SBrkr,TA,Attchd,RFn,Gd,TA,missing,missing,missing


In [23]:
X_train_clean_miss_var.isnull().sum().sum()

0

In [24]:
# before implement pipeline
train["Alley"].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [25]:
# after implement pipeline
X_train_clean_miss_var["Alley"].value_counts()

Grvl    1419
Pave      41
Name: Alley, dtype: int64

In [26]:
# after implement pipeline
X_train_clean_miss_var["MiscFeature"].value_counts()

missing    1406
Shed         49
Othr          2
Gar2          2
TenC          1
Name: MiscFeature, dtype: int64

# Home Work

## Create Clean X_train DataFrame with call variables 

In [27]:
# no missing values variables index
remainder_vars_index = [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   61,
   62,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   75,
   76,
   77,
   78,
   79]

In [28]:
# get no missing values variables name using there index
remainder_vars = [isnull_sum.keys()[var_index] for var_index in remainder_vars_index]
remainder_vars

['Id',
 'MSSubClass',
 'MSZoning',
 'LotArea',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [29]:
len(remainder_vars)

61

In [30]:
# concatinate X_train_clean_miss_var df and remainder_vars
X_train =  pd.concat([X_train_clean_miss_var,train[remainder_vars]], axis=1)

In [31]:
X_train.shape

(1460, 80)

In [32]:
X_train.isnull().sum().sum()

0

## Test Dataframe

In [33]:
# Create test DataFrame with missing value imputed variables
X_test_clean_miss_var = pd.DataFrame(X_test_clean, columns=num_var_mean + num_var_median + cat_vars_mode + cat_vars_missing)
X_test_clean_miss_var.shape

(1459, 19)

In [34]:
X_test_clean_miss_var.isnull().sum().sum()

0

In [35]:
# concatinate X_test_clean_miss_var df and remainder_vars
X_test =  pd.concat([X_test_clean_miss_var, test[remainder_vars]], axis=1)
X_test.shape

(1459, 80)

In [36]:
X_test.isnull().sum().sum()

22

In [37]:
isnull_sum_test = X_test.isnull().sum()
isnull_sum_test

LotFrontage      0
MasVnrArea       0
GarageYrBlt      0
Alley            0
MasVnrType       0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 80, dtype: int64

In [38]:
# finding the numerical variable which have mising value
num_vars_test = X_test.select_dtypes(include=["int64", "float64"]).columns
num_vars_miss_test = [var for var in num_vars_test if isnull_sum_test[var]>0]
num_vars_miss_test

['BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageCars',
 'GarageArea']

In [39]:
# finding the categorical variable which have mising value
cat_vars_test = X_test.select_dtypes(include=["O"]).columns
cat_vars_miss_test = [var for var in cat_vars_test if isnull_sum_test[var]>0]
cat_vars_miss_test

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'KitchenQual',
 'Functional',
 'SaleType']

In [40]:
num_var_mean_test = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
num_var_median_test = ['GarageCars', 'GarageArea']
cat_vars_mode_test = ['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd']
cat_vars_missing_test= ['KitchenQual',
 'Functional',
 'SaleType']

In [41]:
num_var_mean_test_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])
num_var_median_test_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
cat_vars_mode_test_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])
cat_vars_missing_test_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing"))])

In [42]:
preprocessor_test = ColumnTransformer(transformers=[("mean_imputer", num_var_mean_test_imputer, num_var_mean_test),
                                ("median_imputer", num_var_median_test_imputer, num_var_median_test),
                               ("mode_imputer", cat_vars_mode_test_imputer, cat_vars_mode_test),
                               ("missing_imputer", cat_vars_missing_test_imputer, cat_vars_missing_test)])

In [43]:
preprocessor_test.fit(X_test)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                                  'TotalBsmtSF', 'BsmtFullBath',
                                  'BsmtHalfBath']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['GarageCars', 'GarageArea']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['MSZoning', 'Utilities', 'Exterior1st',
                                  'Exterior2nd']),
                                ('missing_imputer',
                                 P

In [44]:
preprocessor_test.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                                  'TotalBsmtSF', 'BsmtFullBath',
                                  'BsmtHalfBath']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['GarageCars', 'GarageArea']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['MSZoning', 'Utilities', 'Exterior1st',
                                  'Exterior2nd']),
                                ('missing

In [45]:
preprocessor_test.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([4.39203704e+02, 5.26193416e+01, 5.54294925e+02, 1.04611797e+03,
       4.34454358e-01, 6.52024708e-02])

In [46]:
test["BsmtFinSF1"].mean()

439.2037037037037

In [47]:
preprocessor_test.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array(['RL', 'AllPub', 'VinylSd', 'VinylSd'], dtype=object)

In [48]:
X_test_clean = preprocessor_test.transform(X_test)

In [49]:
X_test_clean

array([[468.0, 144.0, 270.0, ..., 'TA', 'Typ', 'WD'],
       [923.0, 0.0, 406.0, ..., 'Gd', 'Typ', 'WD'],
       [791.0, 0.0, 137.0, ..., 'TA', 'Typ', 'WD'],
       ...,
       [1224.0, 0.0, 0.0, ..., 'TA', 'Typ', 'WD'],
       [337.0, 0.0, 575.0, ..., 'TA', 'Typ', 'WD'],
       [758.0, 0.0, 238.0, ..., 'TA', 'Typ', 'WD']], dtype=object)

In [50]:
preprocessor_test.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['BsmtFinSF1',
   'BsmtFinSF2',
   'BsmtUnfSF',
   'TotalBsmtSF',
   'BsmtFullBath',
   'BsmtHalfBath']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['GarageCars', 'GarageArea']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['KitchenQual', 'Functional', 'SaleType']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   22,
   23,
   24,
   25,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   42,
   43,
   44,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   58,
   59,
   60,
   

In [51]:
X_test_clean_miss_var = pd.DataFrame(X_test_clean, columns = num_var_mean_test + num_var_median_test + cat_vars_mode_test + cat_vars_missing_test)

In [52]:
X_test_clean_miss_var.head()

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MSZoning,Utilities,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType
0,468.0,144.0,270.0,882.0,0.0,0.0,1.0,730.0,RH,AllPub,VinylSd,VinylSd,TA,Typ,WD
1,923.0,0.0,406.0,1329.0,0.0,0.0,1.0,312.0,RL,AllPub,Wd Sdng,Wd Sdng,Gd,Typ,WD
2,791.0,0.0,137.0,928.0,0.0,0.0,2.0,482.0,RL,AllPub,VinylSd,VinylSd,TA,Typ,WD
3,602.0,0.0,324.0,926.0,0.0,0.0,2.0,470.0,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD
4,263.0,0.0,1017.0,1280.0,0.0,0.0,2.0,506.0,RL,AllPub,HdBoard,HdBoard,Gd,Typ,WD


In [53]:
X_test_clean_miss_var.sample(10)

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,MSZoning,Utilities,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType
476,138.0,0.0,778.0,916.0,0.0,0.0,2.0,386.0,RL,AllPub,HdBoard,HdBoard,TA,Typ,WD
1313,276.0,0.0,936.0,1212.0,1.0,0.0,1.0,240.0,RM,AllPub,MetalSd,MetalSd,TA,Typ,WD
960,595.0,354.0,156.0,1105.0,1.0,0.0,1.0,280.0,RL,AllPub,VinylSd,VinylSd,TA,Min1,WD
246,0.0,0.0,1436.0,1436.0,0.0,0.0,2.0,529.0,FV,AllPub,VinylSd,VinylSd,Gd,Typ,New
1120,967.0,0.0,106.0,1073.0,1.0,0.0,2.0,720.0,C (all),AllPub,MetalSd,MetalSd,TA,Typ,WD
1015,485.0,0.0,187.0,672.0,1.0,0.0,1.0,240.0,RL,AllPub,Plywood,Plywood,TA,Typ,WD
510,1082.0,0.0,538.0,1620.0,1.0,0.0,4.0,1150.0,RL,AllPub,CemntBd,CmentBd,Ex,Typ,WD
277,601.0,216.0,158.0,975.0,0.0,1.0,2.0,524.0,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD
1373,0.0,0.0,945.0,945.0,0.0,0.0,2.0,638.0,RL,AllPub,VinylSd,VinylSd,Gd,Typ,New
918,915.0,0.0,93.0,1008.0,1.0,0.0,2.0,570.0,RL,AllPub,VinylSd,VinylSd,TA,Typ,WD


In [54]:
X_test_clean_miss_var.isnull().sum().sum()

0

In [57]:
# before pipelining
test["MSZoning"].value_counts()

RL         1114
RM          242
FV           74
C (all)      15
RH           10
Name: MSZoning, dtype: int64

In [58]:
# after pipelining
X_test_clean_miss_var["MSZoning"].value_counts()

RL         1118
RM          242
FV           74
C (all)      15
RH           10
Name: MSZoning, dtype: int64