### Regularization Project - Backward Elimination

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

### Step 1: Reading the dataset

In [2]:
import pandas as pd
df = pd.read_csv("training_set.csv", na_values=['', 'NA'], keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Step 2: Basic data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
n = df.isna().sum()
n[n>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
df.duplicated().sum()

0

### Step 3: Seperate X and Y (SalePrice)

In [6]:
X = df.drop(columns=['Id','SalePrice'])
Y = df[['SalePrice']]

In [7]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [8]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


### Step 4: Feature Selection Pipeline

In [9]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [10]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [11]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [12]:
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [13]:
num_pipe1 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='mean')),
                   ('scaler', StandardScaler())])

In [14]:
cat_pipe1 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='most_frequent')),
                            ('ordinal', OrdinalEncoder())])

In [15]:
pre1 = ColumnTransformer([('num', num_pipe1, con),
                        ('cat', cat_pipe1, cat)]).set_output(transform='pandas')

In [16]:
X_pre1 = pre1.fit_transform(X)
X_pre1.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [18]:
lr = LinearRegression()
sel = SequentialFeatureSelector(estimator=lr, n_features_to_select='auto', direction='backward')

sel.fit_transform(X_pre1, Y)
imp_cols = sel.get_feature_names_out()

In [19]:
imp_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
       'num__BsmtUnfSF', 'num__LowQualFinSF', 'num__GrLivArea',
       'num__BsmtFullBath', 'num__BsmtHalfBath', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__EnclosedPorch',
       'num__ScreenPorch', 'num__PoolArea', 'num__YrSold',
       'cat__MSZoning', 'cat__Street', 'cat__LandContour',
       'cat__BldgType', 'cat__HouseStyle', 'cat__RoofMatl',
       'cat__Exterior1st', 'cat__MasVnrType', 'cat__ExterQual',
       'cat__ExterCond', 'cat__Foundation', 'cat__BsmtQual',
       'cat__BsmtCond', 'cat__BsmtExposure', 'cat__BsmtFinType2',
       'cat__CentralAir', 'cat__KitchenQual', 'cat__Functional',
       'cat__GarageCond', 'cat__PavedDrive', 'cat__MiscFeature',
       'cat__SaleCondition'], dtype=object)

In [20]:
imp_cols[0].split('__')

['num', 'MSSubClass']

In [21]:
sel_cols = []
for i in imp_cols:
    s = i.split('__')[1]
    sel_cols.append(s)

In [22]:
sel_cols

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtUnfSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'MSZoning',
 'Street',
 'LandContour',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType2',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'GarageCond',
 'PavedDrive',
 'MiscFeature',
 'SaleCondition']

In [23]:
X_sels = X[sel_cols]
X_sels.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtUnfSF,LowQualFinSF,GrLivArea,BsmtFullBath,...,BsmtCond,BsmtExposure,BsmtFinType2,CentralAir,KitchenQual,Functional,GarageCond,PavedDrive,MiscFeature,SaleCondition
0,60,8450,7,5,2003,196.0,150,0,1710,1,...,TA,No,Unf,Y,Gd,Typ,TA,Y,,Normal
1,20,9600,6,8,1976,0.0,284,0,1262,0,...,TA,Gd,Unf,Y,TA,Typ,TA,Y,,Normal
2,60,11250,7,5,2001,162.0,434,0,1786,1,...,TA,Mn,Unf,Y,Gd,Typ,TA,Y,,Normal
3,70,9550,7,5,1915,0.0,540,0,1717,1,...,Gd,No,Unf,Y,Gd,Typ,TA,Y,,Abnorml
4,60,14260,8,5,2000,350.0,490,0,2198,1,...,TA,Av,Unf,Y,Gd,Typ,TA,Y,,Normal


In [24]:
X_sels.shape

(1460, 40)

### Step 5: Final Pipeline with One Hot Encoder 

In [25]:
cat_sel = list(X_sels.columns[X_sels.dtypes=='object'])
con_sel = list(X_sels.columns[X_sels.dtypes!='object'])

In [26]:
cat_sel

['MSZoning',
 'Street',
 'LandContour',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType2',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'GarageCond',
 'PavedDrive',
 'MiscFeature',
 'SaleCondition']

In [27]:
con_sel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtUnfSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
num_pipe2 = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])
cat_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [30]:
pre2 = ColumnTransformer([('num', num_pipe2, con_sel),
                          ('cat', cat_pipe2, cat_sel)]).set_output(transform='pandas')

In [31]:
X_sel_pre = pre2.fit_transform(X_sels)
X_sel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__LowQualFinSF,num__GrLivArea,num__BsmtFullBath,...,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.511418,-0.944591,-0.120242,0.370333,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.57441,-0.641228,-0.120242,-0.482512,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.32306,-0.301643,-0.120242,0.515013,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.57441,-0.06167,-0.120242,0.383659,1.10781,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,1.36457,-0.174865,-0.120242,1.299326,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Step 6: Apply Train Test Split 

In [32]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_sel_pre, Y, test_size=0.2, random_state=24)

In [33]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__LowQualFinSF,num__GrLivArea,num__BsmtFullBath,...,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
307,-0.163109,-0.260259,-0.071836,1.280685,-1.698028,-0.57441,-0.561991,-0.120242,-0.208383,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
477,0.073375,0.318322,2.09811,-0.5172,1.150356,3.702426,3.590001,-0.120242,2.14646,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1203,-0.872563,-0.076853,0.651479,-0.5172,0.951632,0.37292,2.405982,-0.120242,0.21804,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
332,-0.872563,0.013848,1.374795,-0.5172,1.050994,1.065413,2.344856,-0.120242,0.216136,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1424,-0.872563,-0.101608,-0.795151,-0.5172,-0.43944,-0.57441,-0.847243,-0.120242,-0.326411,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
ytrain.head()

Unnamed: 0,SalePrice
307,89500
477,380000
1203,213000
332,284000
1424,144000


In [35]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__LowQualFinSF,num__GrLivArea,num__BsmtFullBath,...,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
1000,-0.872563,-0.031152,-2.241782,-2.315085,-0.638164,-0.57441,-1.284176,-0.120242,-1.08788,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
314,0.309859,-0.091886,0.651479,1.280685,-1.532424,-0.57441,0.327721,-0.120242,0.23898,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
518,0.073375,-0.098802,-0.071836,-0.5172,0.88539,-0.57441,-1.084952,-0.120242,0.534049,1.10781,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1363,0.073375,-0.202231,-0.071836,-0.5172,1.150356,-0.57441,0.110387,-0.120242,-0.196961,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
537,-0.872563,0.22231,-1.518467,-0.5172,0.024251,-0.57441,-0.686506,-0.120242,-1.240174,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [36]:
xtrain.shape

(1168, 133)

In [37]:
xtest.shape

(292, 133)

In [38]:
model = LinearRegression()
model.fit(xtrain, ytrain)

### Step 7: Evaluate the model 

In [39]:
model.score(xtrain, ytrain)

0.8880553082393154

In [40]:
model.score(xtest, ytest)

-2.462717064590638e+18

In [41]:
ypred_train = model.predict(xtrain)
ypred_test = model.predict(xtest)

In [42]:
ytrain.head()

Unnamed: 0,SalePrice
307,89500
477,380000
1203,213000
332,284000
1424,144000


In [43]:
ypred_train[0:5]

array([[126226.],
       [366256.],
       [208768.],
       [239256.],
       [138500.]])

In [44]:
ytest.head()

Unnamed: 0,SalePrice
1000,82000
314,178000
518,211000
1363,156932
537,111250


In [45]:
ypred_test[0:5]

array([[ 16762.],
       [191564.],
       [197640.],
       [176926.],
       [112784.]])

In [46]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,  mean_absolute_percentage_error, r2_score, root_mean_squared_error

In [47]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)
    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"Mean Squared Error : {mse:.2f}")
    print(f"Root Mean Squared Error : {rmse:.2f}")
    print(f"Mean Absolute Error : {mae:.2f}")
    print(f"Mean Absolute Percentage Error : {mape:.4f}")
    print(f"R2 Score : {r2:.4f}")


In [48]:
evaluate_model(model, xtrain, ytrain)

Mean Squared Error : 672036729.33
Root Mean Squared Error : 25923.67
Mean Absolute Error : 16383.46
Mean Absolute Percentage Error : 0.0955
R2 Score : 0.8881


In [49]:
evaluate_model(model, xtest, ytest)

Mean Squared Error : 18460950520402752785283547136.00
Root Mean Squared Error : 135871080515328.03
Mean Absolute Error : 13099225830706.88
Mean Absolute Percentage Error : 64660568.5648
R2 Score : -2462717064590638080.0000


### Step 8: Perform out of sample prediction 

In [50]:
xnew = pd.read_csv('sample_set.csv', na_values=['','NA'], keep_default_na=False)
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [51]:
xnew_pre = pre2.transform(xnew)
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__LowQualFinSF,num__GrLivArea,num__BsmtFullBath,...,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,-0.872563,0.110763,-0.795151,0.381743,-0.340077,-0.574410,-0.672923,-0.120242,-1.179256,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.375850,-0.071836,0.381743,-0.439440,0.023903,-0.365032,-0.120242,-0.354966,-0.819964,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.332053,-0.795151,-0.517200,0.852269,-0.574410,-0.974021,-0.120242,0.216136,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,-0.054002,-0.071836,0.381743,0.885390,-0.463612,-0.550672,-0.120242,0.168544,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-0.552407,1.374795,-0.517200,0.686666,-0.574410,1.018211,-0.120242,-0.448246,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-0.859988,-1.518467,1.280685,-0.041991,-0.574410,-0.048086,-0.120242,-0.806136,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,2.438219,-0.864197,-1.518467,-0.517200,-0.041991,-0.574410,-0.618589,-0.120242,-0.806136,-0.819964,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,-0.872563,0.950423,-0.795151,1.280685,-0.373198,-0.574410,-1.284176,-0.120242,-0.554851,1.107810,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,0.664586,-0.007600,-0.795151,-0.517200,0.686666,-0.574410,0.017567,-0.120242,-1.038384,-0.819964,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [52]:
preds = model.predict(xnew_pre)
preds

array([[110668.],
       [148442.],
       [174992.],
       ...,
       [167878.],
       [118044.],
       [221100.]])

In [53]:
df_final = xnew[['Id', 'YrSold', 'SaleCondition']]
df_final

Unnamed: 0,Id,YrSold,SaleCondition
0,1461,2010,Normal
1,1462,2010,Normal
2,1463,2010,Normal
3,1464,2010,Normal
4,1465,2010,Normal
...,...,...,...
1454,2915,2006,Normal
1455,2916,2006,Abnorml
1456,2917,2006,Abnorml
1457,2918,2006,Normal


In [54]:
df_final['Sale_Prediction'] = preds
df_final

Unnamed: 0,Id,YrSold,SaleCondition,Sale_Prediction
0,1461,2010,Normal,110668.0
1,1462,2010,Normal,148442.0
2,1463,2010,Normal,174992.0
3,1464,2010,Normal,179204.0
4,1465,2010,Normal,192472.0
...,...,...,...,...
1454,2915,2006,Normal,67752.0
1455,2916,2006,Abnorml,53638.0
1456,2917,2006,Abnorml,167878.0
1457,2918,2006,Normal,118044.0


In [55]:
df_final.to_csv("Backward_Elimination.csv", index=False)