Goal is to predict the sales price for each house. For each Id in the test set, need to predict the value of the SalePrice variable

Steps to perform on Training Set

1. Read Dataset
2. Preview of Dataset and features
3. cols_to_drop from B with less statistical importance
4. Missing data Treatment
5. Skew if any
6. Finding important features for model and specify Y
7. Preprocessing
8. Train test split and preparing Model for prediction


### Read Data and FilterWarnings

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
import pandas as pd
A = pd.read_csv("Desktop/DataSets/Housing project/training_set.csv")

pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

### Preview of Dataset

In [3]:
A.shape

(1460, 81)

In [4]:
A.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
A.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Dropping Column with less statistical importance

In [6]:
A.nunique()

Id               1460
MSSubClass         15
MSZoning            5
LotFrontage       110
LotArea          1073
Street              2
Alley               2
LotShape            4
LandContour         4
Utilities           2
LotConfig           5
LandSlope           3
Neighborhood       25
Condition1          9
Condition2          8
BldgType            5
HouseStyle          8
OverallQual        10
OverallCond         9
YearBuilt         112
YearRemodAdd       61
RoofStyle           6
RoofMatl            8
Exterior1st        15
Exterior2nd        16
MasVnrType          4
MasVnrArea        327
ExterQual           4
ExterCond           5
Foundation          6
BsmtQual            4
BsmtCond            4
BsmtExposure        4
BsmtFinType1        6
BsmtFinSF1        637
BsmtFinType2        6
BsmtFinSF2        144
BsmtUnfSF         780
TotalBsmtSF       721
Heating             6
HeatingQC           5
CentralAir          2
Electrical          5
1stFlrSF          753
2ndFlrSF          417
LowQualFin

In [7]:
A = A.drop(labels="Id",axis=1)

### Missing Data Treatment

In [8]:
A.isna().sum()

MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

In [9]:
A.Alley = A.Alley.fillna("No alley access")
A.BsmtQual = A.BsmtQual.fillna("No Basement")
A.BsmtCond = A.BsmtCond.fillna("No Basement")
A.BsmtExposure = A.BsmtExposure.fillna("No Basement")
A.BsmtFinType1 = A.BsmtFinType1.fillna("No Basement")
A.BsmtFinType2 = A.BsmtFinType2.fillna("No Basement")
A.FireplaceQu = A.FireplaceQu.fillna("No Fireplace")
A.GarageType = A.GarageType.fillna("No Garage")
A.GarageFinish = A.GarageFinish.fillna("No Garage")
A.GarageQual = A.GarageQual.fillna("No Garage")
A.GarageCond = A.GarageCond.fillna("No Garage")
A.PoolQC = A.PoolQC.fillna("No Pool")
A.Fence = A.Fence.fillna("No Fence")
A.MiscFeature = A.MiscFeature.fillna("None")

In [10]:
A.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
Alley              0
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         8
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual           0
BsmtCond           0
BsmtExposure       0
BsmtFinType1       0
BsmtFinSF1         0
BsmtFinType2       0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Heating            0
HeatingQC          0
CentralAir         0
Electrical         1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath 

In [11]:
for i in A:
    if A[i].dtypes == "object":
        x = A[i].mode()[0]
        A[i] = A[i].fillna(x)
    else:
        x = A[i].mean()
        A[i] = A[i].fillna(x)

In [12]:
A.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 

### Skew

In [13]:
A.skew().sort_values()

GarageYrBlt      -0.668175
YearBuilt        -0.613461
YearRemodAdd     -0.503562
GarageCars       -0.342549
FullBath          0.036562
YrSold            0.096269
GarageArea        0.179981
BedroomAbvGr      0.211790
MoSold            0.212053
OverallQual       0.216944
BsmtFullBath      0.596067
Fireplaces        0.649565
HalfBath          0.675897
TotRmsAbvGrd      0.676341
OverallCond       0.693067
2ndFlrSF          0.813030
BsmtUnfSF         0.920268
GrLivArea         1.366560
1stFlrSF          1.376757
MSSubClass        1.407657
TotalBsmtSF       1.524255
WoodDeckSF        1.541376
BsmtFinSF1        1.685503
SalePrice         1.882876
OpenPorchSF       2.364342
LotFrontage       2.384950
MasVnrArea        2.676412
EnclosedPorch     3.089872
BsmtHalfBath      4.103403
ScreenPorch       4.122214
BsmtFinSF2        4.255261
KitchenAbvGr      4.488397
LowQualFinSF      9.011341
3SsnPorch        10.304342
LotArea          12.207688
PoolArea         14.828374
MiscVal          24.476794
d

In [14]:
P = A.skew()
cols_with_skew = list(P[(P>2)].index)
cols_with_skew

['LotFrontage',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF2',
 'LowQualFinSF',
 'BsmtHalfBath',
 'KitchenAbvGr',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal']

In [15]:
import numpy as np
#A[cols_with_skew].apply(np.log)  # this results infinity which cols contain 0 , coz log(0)= ∞

In [16]:
for j in cols_with_skew:
    
    W = []
    for i in A[j]:
        if (i != 0):
            W.append(np.log(i))
        else:
            W.append(i)
        
    A[j] = W

# Finding important features and Y

In [17]:
Y = A[["SalePrice"]]

In [18]:
cat = []
con = []

for i in A.columns:
    if A[i].dtypes == "object":
        cat.append(i)
    else:
        con.append(i)

SalePrice Vs CON ----> through Correlation

In [19]:
Q = A.corr()["SalePrice"].sort_values()
imp_con = list(Q[(Q<1) & (Q>0.3)].index)
imp_con

['2ndFlrSF',
 'WoodDeckSF',
 'LotFrontage',
 'BsmtFinSF1',
 'LotArea',
 'MasVnrArea',
 'OpenPorchSF',
 'Fireplaces',
 'GarageYrBlt',
 'YearRemodAdd',
 'YearBuilt',
 'TotRmsAbvGrd',
 'FullBath',
 '1stFlrSF',
 'TotalBsmtSF',
 'GarageArea',
 'GarageCars',
 'GrLivArea',
 'OverallQual']

SalePrice Vs CAT ----> Using ANOVA

In [20]:
def ANOVA(df,cat,con):
    from statsmodels.formula.api import ols
    relation = con + " ~ " + cat
    model = ols(relation,df).fit()
    from statsmodels.stats.anova import anova_lm
    anova_results = anova_lm(model)
    return round(anova_results.iloc[0,4],4)

imp_cat = []
Q = list(cat)
for i in Q:
    print("==============================")
    print("SalePrice vs",i)
    w = ANOVA(A,i,"SalePrice")
    print(w)
    
    if (w < 0.05):
        imp_cat.append(i)
        

SalePrice vs MSZoning
0.0
SalePrice vs Street
0.117
SalePrice vs Alley
0.0
SalePrice vs LotShape
0.0
SalePrice vs LandContour
0.0
SalePrice vs Utilities
0.5847
SalePrice vs LotConfig
0.0
SalePrice vs LandSlope
0.1414
SalePrice vs Neighborhood
0.0
SalePrice vs Condition1
0.0
SalePrice vs Condition2
0.0434
SalePrice vs BldgType
0.0
SalePrice vs HouseStyle
0.0
SalePrice vs RoofStyle
0.0
SalePrice vs RoofMatl
0.0
SalePrice vs Exterior1st
0.0
SalePrice vs Exterior2nd
0.0
SalePrice vs MasVnrType
0.0
SalePrice vs ExterQual
0.0
SalePrice vs ExterCond
0.0
SalePrice vs Foundation
0.0
SalePrice vs BsmtQual
0.0
SalePrice vs BsmtCond
0.0
SalePrice vs BsmtExposure
0.0
SalePrice vs BsmtFinType1
0.0
SalePrice vs BsmtFinType2
0.0
SalePrice vs Heating
0.0008
SalePrice vs HeatingQC
0.0
SalePrice vs CentralAir
0.0
SalePrice vs Electrical
0.0
SalePrice vs KitchenQual
0.0
SalePrice vs Functional
0.0005
SalePrice vs FireplaceQu
0.0
SalePrice vs GarageType
0.0
SalePrice vs GarageFinish
0.0
SalePrice vs Garage

In [21]:
imp_cat

['MSZoning',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

### Preprocessing

In [22]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X1 = pd.DataFrame(ss.fit_transform(A[imp_con]),columns=imp_con)

In [23]:
X2 = pd.get_dummies(A[imp_cat])

### FInal features to be used for Model

In [24]:
X = X2.join(X1)

In [25]:
X.head(3)

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_Grvl,Alley_No alley access,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,...,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_No Pool,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,2ndFlrSF,WoodDeckSF,LotFrontage,BsmtFinSF1,LotArea,MasVnrArea,OpenPorchSF,Fireplaces,GarageYrBlt,YearRemodAdd,YearBuilt,TotRmsAbvGrd,FullBath,1stFlrSF,TotalBsmtSF,GarageArea,GarageCars,GrLivArea,OverallQual
0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1.161852,-0.752176,-0.084142,0.575425,-0.133231,1.193663,0.845412,-0.951226,1.021157,0.878668,1.050994,0.91221,0.789741,-0.793434,-0.459303,0.351,0.311725,0.370333,0.651479
1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,-0.795163,1.626195,0.563447,1.171992,0.113442,-0.815039,-1.071354,0.600495,-0.104483,-0.429577,0.156734,-0.318683,0.789741,0.25714,0.466465,-0.060731,0.311725,-0.482512,-0.071836
2,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1.189351,-0.752176,0.05658,0.092907,0.420061,1.121157,0.671399,0.600495,0.937776,0.830215,0.984752,-0.318683,0.789741,-0.627826,-0.313369,0.631726,0.311725,0.515013,0.651479


In [26]:
X.shape

(1460, 278)

In [27]:
X.columns

Index(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'Alley_Grvl', 'Alley_No alley access', 'Alley_Pave',
       'LotShape_IR1', 'LotShape_IR2',
       ...
       'YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object', length=278)

### Spitting data into train test and Create a OLS Model

In [28]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.2,random_state=21)

In [29]:
from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.rsquared_adj

0.921609346267031

In [30]:
p = model.pvalues
q = model.pvalues.max()
col_to_drop = list(p[p == q].index)[0]

In [47]:
col_to_drop = list(model.pvalues.sort_values().index)[-2]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  ExterCond_Gd


0.921609346267031

In [48]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  SaleType_Oth


0.921609346267031

In [49]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior2nd_CBlock


0.921609346267031

In [50]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior1st_CBlock


0.921609346267031

In [51]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Condition2_RRAn


0.921609346267031

In [32]:
Xnew.columns

Index(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'Alley_Grvl', 'Alley_No alley access', 'Alley_Pave',
       'LotShape_IR1', 'LotShape_IR2',
       ...
       'YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object', length=277)

In [52]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  TotRmsAbvGrd


0.9216926993778252

In [54]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior1st_BrkComm


0.9217758765194194

In [55]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Fence_MnWw


0.9217758765194194

In [56]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  BsmtQual_Fa


0.9217758765194194

In [57]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  LotShape_IR1


0.9217758765194194

In [59]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  HeatingQC_Ex


0.9217758765194194

In [60]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  FullBath


0.9218588005068327

In [61]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior2nd_AsphShn


0.9219414399389521

In [62]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior1st_HdBoard


0.9220238456246723

In [63]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Condition2_Artery


0.9221060735361561

In [64]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  SaleType_Con


0.9221881034376556

In [65]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  FireplaceQu_Fa


0.9221881034376556

In [66]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  ExterCond_Fa


0.9222699437997004

In [68]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  RoofStyle_Mansard


0.9222699437997004

In [69]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Exterior2nd_HdBoard


0.9223512220667232

In [70]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  GarageYrBlt


0.9224323262035865

In [71]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  Neighborhood_MeadowV


0.9224323262035865

In [72]:
model.pvalues.sort_values()

col_to_drop = list(model.pvalues.sort_values().index)[-1]

col_to_drop = list(model.pvalues.sort_values().index)[-1]
print("Dropped column: ",col_to_drop)
Xnew = Xnew.drop(labels=[col_to_drop],axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=21)

from statsmodels.api import OLS,add_constant
xconst = add_constant(xtrain)
ols = OLS(ytrain,xconst)
model = ols.fit()
model.summary()
model.rsquared_adj

Dropped column:  RoofStyle_Flat


0.9225126820853012

In [73]:
Xnew.shape

(1460, 254)

After running the code for several times we found the best rsquared_adj as 0.9225126820853012

In [74]:
Xnew.columns

Index(['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'Alley_Grvl', 'Alley_No alley access', 'Alley_Pave',
       'LotShape_IR2', 'LotShape_IR3',
       ...
       'OpenPorchSF', 'Fireplaces', 'YearRemodAdd', 'YearBuilt', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object', length=254)

### Create Predictions on test data

In [75]:
A = pd.read_csv("Desktop/DataSets/Housing project/testing_set.csv")

In [76]:
A.isna().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           44
BsmtCond           45
BsmtExposure       44
BsmtFinType1       42
BsmtFinSF1          1
BsmtFinType2       42
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
Heating             0
HeatingQC           0
CentralAir          0
Electrical          0
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [77]:
A.Alley = A.Alley.fillna("No alley access")
A.BsmtQual = A.BsmtQual.fillna("No Basement")
A.BsmtCond = A.BsmtCond.fillna("No Basement")
A.BsmtExposure = A.BsmtExposure.fillna("No Basement")
A.BsmtFinType1 = A.BsmtFinType1.fillna("No Basement")
A.BsmtFinType2 = A.BsmtFinType2.fillna("No Basement")
A.FireplaceQu = A.FireplaceQu.fillna("No Fireplace")
A.GarageType = A.GarageType.fillna("No Garage")
A.GarageFinish = A.GarageFinish.fillna("No Garage")
A.GarageQual = A.GarageQual.fillna("No Garage")
A.GarageCond = A.GarageCond.fillna("No Garage")
A.PoolQC = A.PoolQC.fillna("No Pool")
A.Fence = A.Fence.fillna("No Fence")
A.MiscFeature = A.MiscFeature.fillna("None")

In [78]:
for i in A:
    if A[i].dtypes == "object":
        x = A[i].mode()[0]
        A[i] = A[i].fillna(x)
    else:
        x = A[i].mean()
        A[i] = A[i].fillna(x)

In [79]:
A.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr

In [80]:
from pm6 import preprocessing
tsd = preprocessing(A)
tsd

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_No alley access,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,...,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_No Garage,GarageFinish_Fin,GarageFinish_No Garage,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_No Garage,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_No Garage,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Gd,PoolQC_No Pool,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No Fence,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.730864,-0.874711,0.555587,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.570108,0.063295,0.517348,-0.650619,-0.370808,-0.654561,-0.775254,-0.080483,-1.215588,-0.819568,-0.258526,-1.028720,-0.751040,-1.029543,-0.20391,-0.918335,-0.898055,-0.650488,-0.988013,1.185945,0.366678,-0.701628,-0.360738,-0.088827,1.818960,-0.057227,-0.092244,-0.038281,1.713905,0,0,1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,-1.728490,-0.874711,0.604239,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.041273,1.063392,-0.297903,-0.339378,0.639144,0.433298,-0.775254,-0.080483,-0.323539,-0.819568,-0.258526,-1.028720,1.237648,0.175997,-0.20391,-0.255371,-0.898055,-0.767194,-0.988013,-0.741213,2.347867,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,19.730438,-0.038281,1.713905,0,0,0,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,-1.726115,0.061351,0.263676,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.570108,0.773254,-0.297903,-0.954994,-0.266876,-0.574165,0.891944,-0.080483,0.294508,-0.819568,-0.258526,0.773083,1.237648,0.175997,-0.20391,-0.255371,0.647066,0.749983,0.301623,0.042559,0.930495,-0.207871,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-1.140614,1.713905,0,0,0,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,-1.723741,0.061351,0.458284,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.456889,0.357829,-0.297903,-0.527038,-0.271395,-0.579190,0.837243,-0.080483,0.243004,-0.819568,-0.258526,0.773083,1.237648,0.175997,-0.20391,0.407593,0.647066,0.788885,0.301623,-0.012766,2.089451,-0.178826,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.038281,1.713905,0,0,0,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
4,-1.721367,1.465443,-1.244533,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.570108,-0.387298,-0.297903,1.058917,0.528434,0.310192,-0.775254,-0.080483,-0.424487,-0.819568,-0.258526,0.773083,-0.751040,-1.029543,-0.20391,-0.918335,-0.898055,0.555473,0.301623,0.153210,-0.729632,0.489198,-0.360738,-0.088827,2.243060,-0.057227,-0.092244,-1.875504,1.713905,0,0,0,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.721367,2.401505,-2.314875,-1.591330,-1.447325,1.298950,-0.044694,-0.646813,-0.570108,-0.965376,-0.297903,-0.018983,-1.129968,-1.533893,0.523306,-0.080483,-0.811797,-0.819568,-0.258526,-1.028720,1.237648,0.175997,-0.20391,-0.918335,-0.898055,0.000000,-2.277648,-2.179665,-0.729632,-0.701628,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.038281,-1.359958,0,0,0,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1455,1.723741,2.401505,-2.314875,-1.599808,-1.447325,-0.497418,-0.044694,-0.646813,-0.570108,-0.411477,-0.297903,-0.595694,-1.129968,-1.533893,0.523306,-0.080483,-0.811797,-0.819568,-0.258526,-1.028720,1.237648,0.175997,-0.20391,-0.255371,-0.898055,-0.300371,-0.988013,-0.861084,-0.729632,-0.353093,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,-0.773170,-1.359958,0,0,0,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1456,1.726115,-0.874711,4.447740,2.055150,-0.751101,1.298950,-0.373861,0.584059,-0.570108,1.724994,-0.297903,-1.268524,0.401907,0.169499,-0.775254,-0.080483,-0.539856,1.066863,-0.258526,-1.028720,-0.751040,1.381537,-0.20391,0.407593,0.647066,-0.689390,0.301623,0.475939,2.982161,-0.701628,-0.360738,-0.088827,-0.301543,-0.057227,-0.092244,1.064053,-1.359958,0,0,0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
1457,1.728490,0.646389,-0.320147,0.125527,-0.751101,-0.497418,0.679475,0.394694,-0.570108,-0.224645,-0.297903,0.047384,-0.303026,-0.468645,-0.775254,-0.080483,-1.063136,-0.819568,3.706446,-1.028720,-0.751040,0.175997,-0.20391,-0.255371,-0.898055,0.000000,-2.277648,-2.179665,-0.103169,-0.236915,-0.360738,-0.088827,-0.301543,-0.057227,1.017827,0.329164,-1.359958,0,0,0,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [81]:
Q = list(Xnew)
#tsd[Xnew.columns]

for i in ['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'HouseStyle_2.5Fin', 'RoofMatl_ClyTile', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other', 'Heating_Floor', 'Heating_OthW', 'Electrical_Mix', 'GarageQual_Ex', 'PoolQC_Fa', 'MiscFeature_TenC']:
    tsd[i] = 0

In [82]:
tsd[Xnew.columns]

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_Grvl,Alley_No alley access,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,...,GarageCond_Fa,GarageCond_Gd,GarageCond_No Garage,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Fa,PoolQC_Gd,PoolQC_No Pool,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_No Fence,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_COD,SaleType_CWD,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,2ndFlrSF,WoodDeckSF,LotFrontage,BsmtFinSF1,LotArea,MasVnrArea,OpenPorchSF,Fireplaces,YearRemodAdd,YearBuilt,1stFlrSF,TotalBsmtSF,GarageArea,GarageCars,GrLivArea,OverallQual
0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,-0.775254,0.366678,0.555587,0.063295,0.363929,-0.570108,-0.701628,-0.898055,-1.072885,-0.340945,-0.654561,-0.370808,1.185945,-0.988013,-1.215588,-0.751101
1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,-0.775254,2.347867,0.604239,1.063392,0.897861,0.041273,-0.178826,-0.898055,-1.214908,-0.439695,0.433298,0.639144,-0.741213,-0.988013,-0.323539,-0.054877
2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.891944,0.930495,0.263676,0.773254,0.809646,-0.570108,-0.207871,0.647066,0.678742,0.844059,-0.574165,-0.266876,0.042559,0.301623,0.294508,-0.751101
3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.837243,2.089451,0.458284,0.357829,0.032064,-0.456889,-0.178826,0.647066,0.678742,0.876976,-0.579190,-0.271395,-0.012766,0.301623,0.243004,-0.054877
4,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,-0.775254,-0.729632,-1.244533,-0.387298,-0.971808,-0.570108,0.489198,-0.898055,0.394694,0.679475,0.310192,0.528434,0.153210,0.301623,-0.424487,1.337571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0.523306,-0.729632,-2.314875,-0.965376,-1.591330,-0.570108,-0.701628,-0.898055,-0.646813,-0.044694,-1.533893,-1.129968,-2.179665,-2.277648,-0.811797,-1.447325
1455,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0.523306,-0.729632,-2.314875,-0.411477,-1.599808,-0.570108,-0.353093,-0.898055,-0.646813,-0.044694,-1.533893,-1.129968,-0.861084,-0.988013,-0.811797,-1.447325
1456,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,-0.775254,2.982161,4.447740,1.724994,2.055150,-0.570108,-0.701628,0.647066,0.584059,-0.373861,0.169499,0.401907,0.475939,0.301623,-0.539856,-0.751101
1457,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,-0.775254,-0.103169,-0.320147,-0.224645,0.125527,-0.570108,-0.236915,-0.898055,0.394694,0.679475,-0.468645,-0.303026,-2.179665,-2.277648,-1.063136,-0.751101


In [83]:
final = tsd[Q]

In [84]:
final.shape

(1459, 254)

In [85]:
Xnew.shape

(1460, 254)

In [86]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
model = lm.fit(Xnew,Y)
predicted = model.predict(final)

In [87]:
T = A[["Id"]]
T["SalePrice"] = predicted

In [88]:
T.to_csv("Desktop/sample_submission.csv",index=None)