In [651]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [652]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [653]:
# (rows, columns) of each table, shown as the cell output.
(train.shape, test.shape)

((1460, 81), (1459, 80))

###### LotFrontage: Linear feet of street connected to property
###### Alley: Type of alley access
###### FireplaceQu: Fireplace quality
###### PoolQC: Pool quality
###### Fence: Fence quality
###### MiscFeature: Miscellaneous feature not covered in other categories

In [654]:
# Columns of train with more than 1000 missing values (nulls counted once, then filtered).
train.isnull().sum().loc[lambda null_counts: null_counts > 1000]

Alley          1369
PoolQC         1453
Fence          1179
MiscFeature    1406
dtype: int64

In [655]:
# Columns of test with more than 1000 missing values (nulls counted once, then filtered).
test.isnull().sum().loc[lambda null_counts: null_counts > 1000]

Alley          1352
PoolQC         1456
Fence          1169
MiscFeature    1408
dtype: int64

# IMPUTATION

In [656]:
# Remove the same four columns from both tables.
# Note: re-running this cell without reloading the data raises KeyError,
# because the columns are already gone from train/test.
sparse_cols = ['Alley','PoolQC','Fence','MiscFeature']
train = train.drop(columns=sparse_cols)
test = test.drop(columns=sparse_cols)

In [658]:
# Median-impute the numeric columns listed below: the imputer is fitted on
# train and the fitted values are then applied to test without re-fitting.
num_imp_median = SimpleImputer(strategy='median')

median_cols = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
    'GarageCars', 'GarageArea',
]

train[median_cols] = num_imp_median.fit_transform(train[median_cols])
test[median_cols] = num_imp_median.transform(test[median_cols])

In [659]:
# Fill missing categorical values with each column's most frequent category.
# The imputer is fitted on train only and then applied to test, matching the
# fit-on-train / transform-on-test pattern used for num_imp_median.
# Previously the imputer was re-fitted on test (fit_transform), so test was
# filled from its own modes instead of the ones learned from train.
cat_col = SimpleImputer(strategy='most_frequent')

cat_impute_cols = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'SaleType', 'Electrical',
]

train[cat_impute_cols] = cat_col.fit_transform(train[cat_impute_cols])
# transform (not fit_transform): reuse the categories learned from train.
test[cat_impute_cols] = cat_col.transform(test[cat_impute_cols])

In [680]:
# Pick the modelling columns: object-dtype columns with fewer than 5 distinct
# values, plus every numeric column of each table.
# NOTE(review): the test list also measures cardinality on train (not test);
# this keeps both lists aligned, but confirm it is intentional.
train_cardinality = train.nunique()

low_cardinality_cols_train = [
    cname
    for cname in train.columns
    if train_cardinality[cname] < 5 and train[cname].dtype == "object"
]
low_cardinality_cols_test = [
    cname
    for cname in test.columns
    if train_cardinality[cname] < 5 and test[cname].dtype == "object"
]

num_col_train = train.select_dtypes(include='number').columns
num_col_test = test.select_dtypes(include='number').columns

# Sizes of the four selections, shown as the cell output.
(
    len(low_cardinality_cols_train),
    len(low_cardinality_cols_test),
    len(num_col_train),
    len(num_col_test),
)

(14, 14, 38, 37)

In [681]:
# For each table, join the low-cardinality column names and the numeric
# column names into a single array of selected columns.
total_train_col, total_test_col = (
    np.concatenate(name_groups)
    for name_groups in (
        (low_cardinality_cols_train, num_col_train),
        (low_cardinality_cols_test, num_col_test),
    )
)

In [682]:
# Restrict each table to its selected columns and display the resulting shapes.
ftrain, ftest = train[total_train_col], test[total_test_col]
(ftrain.shape, ftest.shape)

((1460, 52), (1459, 51))

# ENCODING

In [684]:
# Total number of distinct values across the object-dtype columns of ftest.
ftest_object_cols = ftest.select_dtypes(include='object')
ftest_object_cols.nunique().sum()

46

In [None]:
ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
ohe = 