### Adding a variable to capture NaN
#### Either add a binary indicator column (1 = value was missing, 0 = value was present), or replace the NaN with a new category name

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Raw string: the backslashes in this Windows path must be taken literally.
# In a normal string literal, sequences such as "\R" or "\d" are invalid
# escapes and trigger a SyntaxWarning on recent Python versions.
DATA_PATH = r"D:\Rptech\Technical\DS_stats\dataset\house_price_loan.csv"

# Load only the three categorical columns studied here plus the target.
df = pd.read_csv(DATA_PATH, usecols=['BsmtQual', 'FireplaceQu', 'GarageType', 'SalePrice'])
df.head()
# df.isnull().sum()  # uncomment to inspect the per-column missing counts

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [3]:
# Flag rows whose BsmtQual is null: 1 where the value is missing, 0 otherwise.
bsmt_is_missing = df['BsmtQual'].isna()
df['BsmtQual_Var'] = np.where(bsmt_is_missing, 1, 0)
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_Var
0,Gd,,Attchd,208500,0
1,Gd,TA,Attchd,181500,0
2,Gd,TA,Attchd,223500,0
3,TA,Gd,Detchd,140000,0
4,Gd,TA,Attchd,250000,0


In [4]:
# The nulls in BsmtQual will be replaced with its most frequent category.
# Series.mode() returns the most common value(s); take the first one.
bsmt_modes = df['BsmtQual'].mode()
frequent = bsmt_modes[0]
frequent

'TA'

In [5]:
# Fill the nulls in BsmtQual with the most frequent category.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: in-place methods on a column selection are chained
# assignment, which warns in recent pandas and does not update `df` under
# Copy-on-Write.
df['BsmtQual'] = df['BsmtQual'].fillna(frequent)
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_Var
0,Gd,,Attchd,208500,0
1,Gd,TA,Attchd,181500,0
2,Gd,TA,Attchd,223500,0
3,TA,Gd,Detchd,140000,0
4,Gd,TA,Attchd,250000,0


In [6]:
# Most frequent FireplaceQu category (first entry of the mode result).
fireplace_modes = df['FireplaceQu'].mode()
freq_fire = fireplace_modes[0]
freq_fire

'Gd'

In [7]:
# Record which rows were missing BEFORE filling, otherwise the indicator
# would be all zeros: 1 where FireplaceQu is null, 0 otherwise.
df['FireplaceQu_var'] = np.where(df['FireplaceQu'].isnull(), 1, 0)

# Fill the nulls with the most frequent category. Assign the result back
# rather than using fillna(..., inplace=True) on the selected column, which is
# chained assignment and does not update `df` under pandas Copy-on-Write.
df['FireplaceQu'] = df['FireplaceQu'].fillna(freq_fire)
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_Var,FireplaceQu_var
0,Gd,Gd,Attchd,208500,0,1
1,Gd,TA,Attchd,181500,0,0
2,Gd,TA,Attchd,223500,0,0
3,TA,Gd,Detchd,140000,0,0
4,Gd,TA,Attchd,250000,0,0


In [8]:
# Count the remaining nulls in each column.
df.isna().sum()

BsmtQual            0
FireplaceQu         0
GarageType         81
SalePrice           0
BsmtQual_Var        0
FireplaceQu_var     0
dtype: int64

### Technique 2: Replace the NaN with a new category, e.g. "Missing"

In [4]:
def impute_nan(df, variable, fill_value="Missing"):
    """Add a ``<variable>_newvar`` column in which null entries are labelled.

    The frame is modified in place: a new column named
    ``variable + "_newvar"`` is written, while the source column ``variable``
    itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to process; mutated in place.
    variable : str
        Name of the column whose null entries should be labelled.
    fill_value : str, default "Missing"
        Label written wherever ``df[variable]`` is null.

    Returns
    -------
    None
    """
    is_missing = df[variable].isnull()
    # Keep the original value where present; use the label where it is null.
    df[variable + "_newvar"] = np.where(is_missing, fill_value, df[variable])

In [5]:
# Create a "<column>_newvar" companion for each categorical column.
# NOTE(review): columns whose nulls were already filled with the mode in
# earlier cells have nothing left to label; reload `df` first if the
# "Missing" category is wanted for them - confirm intended run order.
categorical_features = ['BsmtQual', 'FireplaceQu', 'GarageType']
for feature in categorical_features:
    impute_nan(df, feature)

In [7]:
# Preview the first five rows, including the new "_newvar" columns.
df.head(n=5)

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice,BsmtQual_newvar,FireplaceQu_newvar,GarageType_newvar
0,Gd,,Attchd,208500,Gd,Missing,Attchd
1,Gd,TA,Attchd,181500,Gd,TA,Attchd
2,Gd,TA,Attchd,223500,Gd,TA,Attchd
3,TA,Gd,Detchd,140000,TA,Gd,Detchd
4,Gd,TA,Attchd,250000,Gd,TA,Attchd


In [8]:
# The "_newvar" columns supersede the originals, so remove the originals.
superseded_columns = ['BsmtQual', 'FireplaceQu', 'GarageType']
df = df.drop(columns=superseded_columns)

In [9]:
# Preview the first five rows after dropping the original columns.
df.head(n=5)

Unnamed: 0,SalePrice,BsmtQual_newvar,FireplaceQu_newvar,GarageType_newvar
0,208500,Gd,Missing,Attchd
1,181500,Gd,TA,Attchd
2,223500,Gd,TA,Attchd
3,140000,TA,Gd,Detchd
4,250000,Gd,TA,Attchd
