In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

**Mean/Mode Imputation**

In [2]:
cols_to_use = [
    "OverallQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "WoodDeckSF",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [4]:
data = pd.read_csv("House_price.csv",usecols=cols_to_use)

In [5]:
data.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,150,856,856,1710,2003.0,0,208500
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298,181500
2,68.0,7,162.0,434,920,920,1786,2001.0,0,223500
3,60.0,7,0.0,540,756,961,1717,1998.0,0,140000
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192,250000


In [6]:
data.isnull().mean()

LotFrontage    0.177397
OverallQual    0.000000
MasVnrArea     0.005479
BsmtUnfSF      0.000000
TotalBsmtSF    0.000000
1stFlrSF       0.000000
GrLivArea      0.000000
GarageYrBlt    0.055479
WoodDeckSF     0.000000
SalePrice      0.000000
dtype: float64

In [7]:
x_train,x_test,y_train,y_test=train_test_split(
    data.drop("SalePrice",axis=1),
    data["SalePrice"],
    test_size=0.3,
    random_state =0
)

In [8]:
x_train.isnull().sum()

LotFrontage    189
OverallQual      0
MasVnrArea       5
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
GrLivArea        0
GarageYrBlt     54
WoodDeckSF       0
dtype: int64

In [9]:
vars_to_impute = [var for var in x_train.columns if x_train[var].isnull().sum()>0]

In [10]:
vars_to_impute

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [13]:
impute_dict = x_train[vars_to_impute].median().to_dict()

In [14]:
impute_dict

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [15]:
x_train.fillna(impute_dict,inplace=True)
x_test.fillna(impute_dict,inplace=True)

In [16]:
x_train.isnull().sum()

LotFrontage    0
OverallQual    0
MasVnrArea     0
BsmtUnfSF      0
TotalBsmtSF    0
1stFlrSF       0
GrLivArea      0
GarageYrBlt    0
WoodDeckSF     0
dtype: int64

**Arbitary Value Imputaion**


In [17]:
data1 = pd.read_csv("House_price.csv",usecols=cols_to_use)

In [18]:
data1.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,150,856,856,1710,2003.0,0,208500
1,80.0,6,0.0,284,1262,1262,1262,1976.0,298,181500
2,68.0,7,162.0,434,920,920,1786,2001.0,0,223500
3,60.0,7,0.0,540,756,961,1717,1998.0,0,140000
4,84.0,8,350.0,490,1145,1145,2198,2000.0,192,250000


In [19]:
data1.isnull().sum()

LotFrontage    259
OverallQual      0
MasVnrArea       8
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
GrLivArea        0
GarageYrBlt     81
WoodDeckSF       0
SalePrice        0
dtype: int64

In [20]:
x_train,x_test,y_train,y_test=train_test_split(
    data1.drop("SalePrice",axis=1),
    data["SalePrice"],
    test_size= 0.3,
    random_state =0
)

In [21]:
x_train.isnull().sum()

LotFrontage    189
OverallQual      0
MasVnrArea       5
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
GrLivArea        0
GarageYrBlt     54
WoodDeckSF       0
dtype: int64

In [22]:
var_to_impute = [var for var in x_train.columns if x_train[var].isnull().sum()>0]

In [23]:
var_to_impute

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [25]:
x_train[var_to_impute].agg(["min","max"])

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
min,21.0,0.0,1900.0
max,313.0,1600.0,2010.0


In [26]:
imputation_dict = {
    "LotFrontage": 999,
    "MasVnrArea": 1999,
    "GarageYrBlt": 2999,
}

imputation_dict

{'LotFrontage': 999, 'MasVnrArea': 1999, 'GarageYrBlt': 2999}

In [28]:
x_train.fillna(imputation_dict, inplace=True)
x_test.fillna(imputation_dict, inplace=True)

In [29]:
x_train.isnull().sum()

LotFrontage    0
OverallQual    0
MasVnrArea     0
BsmtUnfSF      0
TotalBsmtSF    0
1stFlrSF       0
GrLivArea      0
GarageYrBlt    0
WoodDeckSF     0
dtype: int64

**Categorical Imputation**

In [30]:
use_col1 =["BsmtQual", "FireplaceQu", "SalePrice"]

In [31]:
data2 = pd.read_csv("House_price.csv",usecols=use_col1)

In [32]:
data2.head()

Unnamed: 0,BsmtQual,FireplaceQu,SalePrice
0,Gd,,208500
1,Gd,TA,181500
2,Gd,TA,223500
3,TA,Gd,140000
4,Gd,TA,250000


In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    data2.drop("SalePrice", axis=1),
    data2["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [36]:
X_train.isnull().sum()

BsmtQual        24
FireplaceQu    478
dtype: int64

In [37]:
X_train[["BsmtQual", "FireplaceQu"]].mode()

Unnamed: 0,BsmtQual,FireplaceQu
0,TA,Gd


In [38]:
imputation_dict = X_train[["BsmtQual", "FireplaceQu"]].mode().iloc[0].to_dict()

imputation_dict

{'BsmtQual': 'TA', 'FireplaceQu': 'Gd'}

In [39]:
X_train.fillna(imputation_dict,inplace=True)
X_test.fillna(imputation_dict,inplace=True)

In [40]:
X_train.isnull().sum()

BsmtQual       0
FireplaceQu    0
dtype: int64

**Replace Nan With "Missing" imputation**

In [41]:
imputation_dict1 = {
    "BsmtQual": "Missing",
    "FireplaceQu": "Missing",
}

imputation_dict1

{'BsmtQual': 'Missing', 'FireplaceQu': 'Missing'}

In [42]:
X_train.fillna(imputation_dict1, inplace=True)
X_test.fillna(imputation_dict1, inplace=True)

In [43]:
X_train.isnull().sum()

BsmtQual       0
FireplaceQu    0
dtype: int64

**Missing Indicator Column**

In [68]:
cols_to_use2 = [
    "OverallQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "WoodDeckSF",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "BsmtQual",
    "FireplaceQu",
    "SalePrice",
]

In [69]:
data3 = pd.read_csv("House_price.csv",usecols=cols_to_use2)

In [70]:
data3.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtQual,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,FireplaceQu,GarageYrBlt,WoodDeckSF,SalePrice
0,65.0,7,196.0,Gd,150,856,856,1710,,2003.0,0,208500
1,80.0,6,0.0,Gd,284,1262,1262,1262,TA,1976.0,298,181500
2,68.0,7,162.0,Gd,434,920,920,1786,TA,2001.0,0,223500
3,60.0,7,0.0,TA,540,756,961,1717,Gd,1998.0,0,140000
4,84.0,8,350.0,Gd,490,1145,1145,2198,TA,2000.0,192,250000


In [71]:
X_train, X_test, y_train, y_test = train_test_split(
    data3.drop("SalePrice", axis=1),
    data3["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [72]:
vars_num = list(X_train.select_dtypes(include="number").columns)
vars_num

['LotFrontage',
 'OverallQual',
 'MasVnrArea',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'GarageYrBlt',
 'WoodDeckSF']

In [73]:
vars_cat = list(X_train.select_dtypes(exclude="number").columns)
vars_cat

['BsmtQual', 'FireplaceQu']

In [74]:
impute_dict = X_train[vars_num].median().to_dict()

In [75]:
impute_dict.update(X_train[vars_cat].mode().iloc[0].to_dict())

In [76]:
impute_dict

{'LotFrontage': 69.0,
 'OverallQual': 6.0,
 'MasVnrArea': 0.0,
 'BsmtUnfSF': 486.5,
 'TotalBsmtSF': 992.0,
 '1stFlrSF': 1095.0,
 'GrLivArea': 1479.0,
 'GarageYrBlt': 1979.0,
 'WoodDeckSF': 0.0,
 'BsmtQual': 'TA',
 'FireplaceQu': 'Gd'}

In [77]:
X_train.isna().astype(int)

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtQual,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,FireplaceQu,GarageYrBlt,WoodDeckSF
64,1,0,0,0,0,0,0,0,1,0,0
682,1,0,0,0,0,0,0,0,0,0,0
960,0,0,0,0,0,0,0,0,1,1,0
1384,0,0,0,0,0,0,0,0,1,0,0
1100,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
763,0,0,0,0,0,0,0,0,0,0,0
835,0,0,0,0,0,0,0,0,1,0,0
1216,0,0,0,1,0,0,0,0,1,0,0
559,1,0,0,0,0,0,0,0,0,0,0


In [78]:
indicators = [f"{var}_na" for var in X_train.columns]
indicators

['LotFrontage_na',
 'OverallQual_na',
 'MasVnrArea_na',
 'BsmtQual_na',
 'BsmtUnfSF_na',
 'TotalBsmtSF_na',
 '1stFlrSF_na',
 'GrLivArea_na',
 'FireplaceQu_na',
 'GarageYrBlt_na',
 'WoodDeckSF_na']

In [79]:
X_train[indicators] = X_train.isna().astype(int)
X_train.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtQual,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,FireplaceQu,GarageYrBlt,...,OverallQual_na,MasVnrArea_na,BsmtQual_na,BsmtUnfSF_na,TotalBsmtSF_na,1stFlrSF_na,GrLivArea_na,FireplaceQu_na,GarageYrBlt_na,WoodDeckSF_na
64,,7,573.0,Gd,318,1057,1057,2034,,1998.0,...,0,0,0,0,0,0,0,1,0,0
682,,6,0.0,Gd,288,1291,1291,1291,Gd,1996.0,...,0,0,0,0,0,0,0,0,0,0
960,50.0,5,0.0,TA,162,858,858,858,,,...,0,0,0,0,0,0,0,1,1,0
1384,60.0,6,0.0,TA,356,560,698,1258,,1939.0,...,0,0,0,0,0,0,0,1,0,0
1100,60.0,2,0.0,TA,0,290,438,438,,1930.0,...,0,0,0,0,0,0,0,1,0,0


In [80]:
X_test[indicators] = X_test.isna().astype(int)
X_test.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtQual,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,FireplaceQu,GarageYrBlt,...,OverallQual_na,MasVnrArea_na,BsmtQual_na,BsmtUnfSF_na,TotalBsmtSF_na,1stFlrSF_na,GrLivArea_na,FireplaceQu_na,GarageYrBlt_na,WoodDeckSF_na
529,,6,,TA,816,2035,2515,2515,TA,1975.0,...,0,1,0,0,0,0,0,0,0,0
491,79.0,6,0.0,TA,238,806,958,1578,TA,1941.0,...,0,0,0,0,0,0,0,0,0,0
459,,5,161.0,TA,524,709,979,1203,TA,1950.0,...,0,0,0,0,0,0,0,0,0,0
279,83.0,7,299.0,Gd,768,1160,1156,2022,TA,1977.0,...,0,0,0,0,0,0,0,0,0,0
655,21.0,6,381.0,TA,525,525,525,1092,,1971.0,...,0,0,0,0,0,0,0,1,0,0


In [81]:
## After adding indicators, we can replace missing data in the original variables

X_train.fillna(impute_dict, inplace=True)
X_test.fillna(impute_dict, inplace=True)

In [82]:
X_train.isna().sum()

LotFrontage       0
OverallQual       0
MasVnrArea        0
BsmtQual          0
BsmtUnfSF         0
TotalBsmtSF       0
1stFlrSF          0
GrLivArea         0
FireplaceQu       0
GarageYrBlt       0
WoodDeckSF        0
LotFrontage_na    0
OverallQual_na    0
MasVnrArea_na     0
BsmtQual_na       0
BsmtUnfSF_na      0
TotalBsmtSF_na    0
1stFlrSF_na       0
GrLivArea_na      0
FireplaceQu_na    0
GarageYrBlt_na    0
WoodDeckSF_na     0
dtype: int64

**SIMPLE IMPUTER USING SKLEARN**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.model_selection import train_test_split

In [6]:
cols_to_use4 = [
    "OverallQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "WoodDeckSF",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

In [8]:
df = pd.read_csv("House_price.csv",usecols=cols_to_use4)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("SalePrice", axis=1),
    df["SalePrice"],
    test_size=0.3,
    random_state=0,
)


In [10]:
X_train.isnull().mean()

LotFrontage    0.184932
OverallQual    0.000000
MasVnrArea     0.004892
BsmtUnfSF      0.000000
TotalBsmtSF    0.000000
1stFlrSF       0.000000
GrLivArea      0.000000
GarageYrBlt    0.052838
WoodDeckSF     0.000000
dtype: float64

In [11]:
imputer = SimpleImputer(strategy='median')

In [12]:
imputer.fit(X_train)

In [13]:
imputer.statistics_

array([  69. ,    6. ,    0. ,  486.5,  992. , 1095. , 1479. , 1979. ,
          0. ])

In [14]:
X_train.median()

LotFrontage      69.0
OverallQual       6.0
MasVnrArea        0.0
BsmtUnfSF       486.5
TotalBsmtSF     992.0
1stFlrSF       1095.0
GrLivArea      1479.0
GarageYrBlt    1979.0
WoodDeckSF        0.0
dtype: float64

In [15]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [16]:
# Data is returned as Numpy array , So need to encode the train set back to a dataframe
X_train

array([[  69.,    7.,  573., ..., 2034., 1998.,  576.],
       [  69.,    6.,    0., ..., 1291., 1996.,  307.],
       [  50.,    5.,    0., ...,  858., 1979.,  117.],
       ...,
       [  68.,    6.,    0., ..., 1902., 1978.,    0.],
       [  69.,    7.,   18., ..., 1557., 2003.,  143.],
       [  58.,    7.,   30., ..., 1839., 1998.,    0.]])

In [17]:
X_train = pd.DataFrame(X_train,columns = imputer.get_feature_names_out())
X_train.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF
0,69.0,7.0,573.0,318.0,1057.0,1057.0,2034.0,1998.0,576.0
1,69.0,6.0,0.0,288.0,1291.0,1291.0,1291.0,1996.0,307.0
2,50.0,5.0,0.0,162.0,858.0,858.0,858.0,1979.0,117.0
3,60.0,6.0,0.0,356.0,560.0,698.0,1258.0,1939.0,0.0
4,60.0,2.0,0.0,0.0,290.0,438.0,438.0,1930.0,0.0


In [22]:
df1 = pd.read_csv("House_price.csv",usecols=cols_to_use4)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop("SalePrice", axis=1),
    df1["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [27]:
# Output is directly given as Pandas DataFrame instead of Numpy array
imputer1= SimpleImputer(strategy="mean").set_output(transform="pandas")

In [28]:
imputer1.fit(X_train)

In [29]:
X_train= imputer1.transform(X_train)

In [31]:
X_train.head()

Unnamed: 0,LotFrontage,OverallQual,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,GrLivArea,GarageYrBlt,WoodDeckSF
64,69.668667,7.0,573.0,318.0,1057.0,1057.0,2034.0,1998.0,576.0
682,69.668667,6.0,0.0,288.0,1291.0,1291.0,1291.0,1996.0,307.0
960,50.0,5.0,0.0,162.0,858.0,858.0,858.0,1978.012397,117.0
1384,60.0,6.0,0.0,356.0,560.0,698.0,1258.0,1939.0,0.0
1100,60.0,2.0,0.0,0.0,290.0,438.0,438.0,1930.0,0.0


**SimpleImputer - feature subsets**

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [4]:
cols_to_use4 = [
    "OverallQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "WoodDeckSF",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]
df2 = pd.read_csv("House_price.csv",usecols=cols_to_use4)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop("SalePrice", axis=1),
    df2["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [7]:
imputer = ColumnTransformer(transformers=[("mean_impute",SimpleImputer(strategy="mean"),["LotFrontage"]),
                                        ("median_impute",SimpleImputer(strategy="median"),["MasVnrArea", "GarageYrBlt"])],
                                         remainder="passthrough",)

In [8]:
imputer.set_output(transform="pandas")

In [9]:
imputer.fit(X_train)

In [10]:
imputer.transformers

[('mean_impute', SimpleImputer(), ['LotFrontage']),
 ('median_impute',
  SimpleImputer(strategy='median'),
  ['MasVnrArea', 'GarageYrBlt'])]

In [12]:
imputer.named_transformers_["mean_impute"].statistics_

array([69.66866747])

In [13]:
imputer.named_transformers_["median_impute"].statistics_

array([   0., 1979.])

In [14]:
X_train = imputer.transform(X_train)

In [15]:
X_test = imputer.transform(X_test)

In [16]:
X_train.head()
X_test.head()

Unnamed: 0,mean_impute__LotFrontage,median_impute__MasVnrArea,median_impute__GarageYrBlt,remainder__OverallQual,remainder__BsmtUnfSF,remainder__TotalBsmtSF,remainder__1stFlrSF,remainder__GrLivArea,remainder__WoodDeckSF
529,69.668667,0.0,1975.0,6,816,2035,2515,2515,0
491,79.0,0.0,1941.0,6,238,806,958,1578,0
459,69.668667,161.0,1950.0,5,524,709,979,1203,0
279,83.0,299.0,1977.0,7,768,1160,1156,2022,288
655,21.0,381.0,1971.0,6,525,525,525,1092,0


In [17]:
X_train.isnull().sum()

mean_impute__LotFrontage      0
median_impute__MasVnrArea     0
median_impute__GarageYrBlt    0
remainder__OverallQual        0
remainder__BsmtUnfSF          0
remainder__TotalBsmtSF        0
remainder__1stFlrSF           0
remainder__GrLivArea          0
remainder__WoodDeckSF         0
dtype: int64

In [18]:
df3 = pd.read_csv("House_price.csv",usecols=cols_to_use4)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    df3.drop("SalePrice", axis=1),
    df3["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [20]:
imputer = ColumnTransformer(transformers=[
    (
        "imputer_LotFrontAge",
         SimpleImputer(strategy="constant", fill_value=999),
         ["LotFrontage"],
    ),
    (
        "imputer_MasVnrArea",
         SimpleImputer(strategy="constant", fill_value=-10),
         ["MasVnrArea"],
    ),
    (
        "imputer_GarageYrBlt",
         SimpleImputer(strategy="constant", fill_value=1700),
         ["GarageYrBlt"],
    ),
    ],remainder="drop",
    )

In [21]:
imputer.set_output(transform="pandas")

In [22]:
imputer.fit(X_train)

In [23]:
X_train = imputer.transform(X_train)

In [24]:
X_test = imputer.transform(X_test)

In [25]:
X_train.head()

Unnamed: 0,imputer_LotFrontAge__LotFrontage,imputer_MasVnrArea__MasVnrArea,imputer_GarageYrBlt__GarageYrBlt
64,999.0,573.0,1998.0
682,999.0,0.0,1996.0
960,50.0,0.0,1700.0
1384,60.0,0.0,1939.0
1100,60.0,0.0,1930.0


In [26]:
cols_to_use5 = [
    "BsmtQual",
    "FireplaceQu",
    "MSZoning",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "Street",
    "Alley",
    "SalePrice",
]

In [27]:
df4 = pd.read_csv("House_price.csv",usecols=cols_to_use5)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    df4.drop("SalePrice", axis=1),
    df4["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [29]:
features_numeric = ["BsmtUnfSF","LotFrontage","MasVnrArea",]
features_categoric = ["BsmtQual", "FireplaceQu", "MSZoning", "Street", "Alley"]

In [30]:
imputer = ColumnTransformer(transformers=[
                             ("numeric imputer",SimpleImputer(strategy="mean"),features_numeric),
                             ("category imputer",SimpleImputer(strategy="most_frequent"),features_categoric),
                            ])

In [32]:
imputer.set_output(transform="pandas")

In [33]:
imputer.fit(X_train)

In [34]:
X_train = imputer.transform(X_train)

In [35]:
X_test = imputer.transform(X_test)

In [36]:
X_train.isnull().sum()

numeric imputer__BsmtUnfSF       0
numeric imputer__LotFrontage     0
numeric imputer__MasVnrArea      0
category imputer__BsmtQual       0
category imputer__FireplaceQu    0
category imputer__MSZoning       0
category imputer__Street         0
category imputer__Alley          0
dtype: int64

**Add Missing Indicators**

In [37]:
import pandas as pd

from sklearn.impute import SimpleImputer, MissingIndicator

from sklearn.model_selection import train_test_split

In [38]:
cols_use = [
    "BsmtQual",
    "FireplaceQu",
    "MSZoning",
    "BsmtUnfSF",
    "LotFrontage",
    "MasVnrArea",
    "Street",
    "Alley",
    "SalePrice",
]

In [39]:
df5 = pd.read_csv("House_price.csv",usecols=cols_use)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    df5.drop("SalePrice", axis=1),
    df5["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [41]:
X_train.isnull().mean()

MSZoning       0.000000
LotFrontage    0.184932
Street         0.000000
Alley          0.939335
MasVnrArea     0.004892
BsmtQual       0.023483
BsmtUnfSF      0.000000
FireplaceQu    0.467710
dtype: float64

In [42]:
indicator = MissingIndicator(error_on_new=True,features="missing-only")
indicator.fit(X_train)

In [43]:
X_train.columns[indicator.features_]

Index(['LotFrontage', 'Alley', 'MasVnrArea', 'BsmtQual', 'FireplaceQu'], dtype='object')

In [44]:
tmp = indicator.transform(X_train)
tmp

array([[ True,  True, False, False,  True],
       [ True,  True, False, False, False],
       [False,  True, False, False,  True],
       ...,
       [False,  True, False,  True,  True],
       [ True,  True, False, False, False],
       [False,  True, False, False,  True]])

In [45]:
indicator.get_feature_names_out()

array(['missingindicator_LotFrontage', 'missingindicator_Alley',
       'missingindicator_MasVnrArea', 'missingindicator_BsmtQual',
       'missingindicator_FireplaceQu'], dtype=object)

In [46]:
X_train = pd.concat(
    [ X_train.reset_index(),pd.DataFrame(tmp, columns=indicator.get_feature_names_out()),
    ],axis=1,
)

In [48]:
X_train.head()

Unnamed: 0,index,MSZoning,LotFrontage,Street,Alley,MasVnrArea,BsmtQual,BsmtUnfSF,FireplaceQu,missingindicator_LotFrontage,missingindicator_Alley,missingindicator_MasVnrArea,missingindicator_BsmtQual,missingindicator_FireplaceQu
0,64,RL,,Pave,,573.0,Gd,318,,True,True,False,False,True
1,682,RL,,Pave,,0.0,Gd,288,Gd,True,True,False,False,False
2,960,RL,50.0,Pave,,0.0,TA,162,,False,True,False,False,True
3,1384,RL,60.0,Pave,,0.0,TA,356,,False,True,False,False,True
4,1100,RL,60.0,Pave,,0.0,TA,0,,False,True,False,False,True


In [49]:
tmp = indicator.transform(X_test)
X_test = pd.concat(
    [X_test.reset_index(),pd.DataFrame(tmp, columns=indicator.get_feature_names_out()),
    ],axis=1,
)


**Add indicators with the SimpleImputer**

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
    df5.drop("SalePrice", axis=1),
    df5["SalePrice"],
    test_size=0.3,
    random_state=0,
)

In [51]:
imputer = SimpleImputer(
    strategy="most_frequent",
    add_indicator=True,
).set_output(transform="pandas")

In [52]:
imputer.fit(X_train)

In [53]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
X_train.head()

Unnamed: 0,MSZoning,LotFrontage,Street,Alley,MasVnrArea,BsmtQual,BsmtUnfSF,FireplaceQu,missingindicator_LotFrontage,missingindicator_Alley,missingindicator_MasVnrArea,missingindicator_BsmtQual,missingindicator_FireplaceQu
64,RL,60.0,Pave,Pave,573.0,Gd,318,Gd,True,True,False,False,True
682,RL,60.0,Pave,Pave,0.0,Gd,288,Gd,True,True,False,False,False
960,RL,50.0,Pave,Pave,0.0,TA,162,Gd,False,True,False,False,True
1384,RL,60.0,Pave,Pave,0.0,TA,356,Gd,False,True,False,False,True
1100,RL,60.0,Pave,Pave,0.0,TA,0,Gd,False,True,False,False,True


**GRID_SEARCH_CV**

In [1]:
import pandas as pd
import numpy as np
# import classes for imputation
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# import classes for modelling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data=pd.read_csv("House_price.csv")

In [3]:
data.shape

(1460, 81)

In [4]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
feature_cat = [c for c in data.columns if data[c].dtype=="O"]
feature_num = [c for c in data.columns if data[c].dtype!="O" and c !="SalePrice"]

In [6]:
data[feature_num].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("SalePrice", axis=1),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((1022, 80), (438, 80))

We create the preprocessing pipelines for both numerical and categorical data

In [9]:
num_transformer = Pipeline(
    steps=[("imputer",SimpleImputer(strategy="median")),("scaler",StandardScaler())])
cat_transformer = Pipeline(
    steps=[("imputer",SimpleImputer(strategy="constant",fill_value="missing")),("one-hot",OneHotEncoder(handle_unknown="ignore"))])
preprocessor = ColumnTransformer(transformers=[
    ("numerical",num_transformer,feature_num),
    ("categorical",cat_transformer,feature_cat)
])

In [10]:
#Append classifier to preprocessing pipeline. Now we have a full prediction pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Lasso(max_iter=2000))]
)

In [11]:
param_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    "preprocessor__categorical__imputer__strategy": ["most_frequent", "constant"],
    "regressor__alpha": [10, 100, 200],
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring="r2")

In [12]:
grid_search.fit(X_train, y_train)

In [13]:
grid_search.best_estimator_

In [14]:
#best fit parameters
grid_search.best_params_

{'preprocessor__categorical__imputer__strategy': 'constant',
 'preprocessor__numerical__imputer__strategy': 'median',
 'regressor__alpha': 100}

In [15]:
# here we can see all the combinations evaluated during the grid search
grid_search.cv_results_["params"]

[{'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'mean',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 10},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 100},
 {'preprocessor__categorical__imputer__strategy': 'most_frequent',
  'preprocessor__numerical__imputer__strategy': 'median',
  'regressor__alpha': 200},
 {'preprocessor__categorical__imputer__strategy': 'constant',
  'preprocessor__numerical__

In [17]:
# and here the scores for each of one of the above combinations
grid_search.cv_results_["mean_test_score"]

array([0.84746254, 0.86624908, 0.86552764, 0.84739594, 0.86621021,
       0.8654755 , 0.84814964, 0.86646886, 0.86525292, 0.8481309 ,
       0.86651035, 0.86523714])

In [19]:
print(
    (
        "best linear regression from grid search: %.3f"
        % grid_search.score(X_train, y_train)
    )
)

best linear regression from grid search: 0.933


In [18]:
# and finally let's check the performance over the test set
print(
    (
        "best linear regression from grid search: %.3f"
        % grid_search.score(X_test, y_test)
    )
)

best linear regression from grid search: 0.738


In [None]:
#This model overfits to the train set, look at the r2 of 0.93 obtained for the train set vs 0.738 for the test set.