In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the housing-price dataset; the file uses ';' as field separator.
csv_path = "data_housing_prices.csv"
data_house_prices = pd.read_csv(csv_path, sep=";")

In [39]:
# Preview the first rows to sanity-check that the columns were parsed as expected.
data_house_prices.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1,1.0,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,2.0,2008.0,WD,Normal,208500.0
3,2.0,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,,,,0.0,5.0,2007.0,WD,Normal,181500.0
5,3.0,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,9.0,2008.0,WD,Normal,223500.0
7,4.0,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,2.0,2006.0,WD,Abnorml,140000.0
9,5.0,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,,,,0.0,12.0,2008.0,WD,Normal,250000.0


In [40]:
# (number of rows, number of columns) of the loaded frame.
data_house_prices.shape

(1460, 81)

In [41]:
# Remove the first row of the frame, in place.
# WARNING: this cell is not idempotent - every re-run removes one more row,
# so execute it exactly once after loading the data.
first_row_label = data_house_prices.index[:1]
data_house_prices.drop(index=first_row_label, inplace=True)

### Defining X, y

In [42]:
# Target: natural logarithm of the sale price.
y = np.log(data_house_prices.loc[:, 'SalePrice'])
# Features: every column except the row identifier and the target itself.
X = data_house_prices.drop(['Id', 'SalePrice'], axis='columns')

### Data Splitting

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Re-wrap both feature sets as DataFrames carrying the columns of X.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

In [23]:
# Summary statistics of the training features.
# NOTE(review): the recorded output reports count 1022, while later cells report
# 1021 training rows - this output looks like it predates the first-row drop;
# re-run the notebook top to bottom to refresh it.
X_train.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1022.0,832.0,1022.0,1022.0,1022.0,1022.0,1022.0,1019.0,1022.0,1022.0,...,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0
mean,57.059687,70.375,10745.437378,6.12818,5.564579,1970.995108,1984.757339,105.26104,446.176125,42.368885,...,477.120352,97.548924,49.52544,21.303327,3.881605,15.565558,2.74364,41.565558,6.363992,2007.81409
std,42.669715,25.533607,11329.753423,1.371391,1.110557,30.748816,20.747109,172.707705,459.971174,151.210531,...,208.443296,131.691873,69.205968,61.53438,32.547722,56.445749,39.671531,338.132358,2.650115,1.325807
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7564.25,5.0,5.0,1953.0,1966.0,0.0,0.0,0.0,...,350.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,70.0,9600.0,6.0,5.0,1972.0,1994.0,0.0,390.0,0.0,...,484.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11692.5,7.0,6.0,2001.0,2004.0,170.0,724.0,0.0,...,576.0,170.5,73.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,5644.0,1127.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,8300.0,12.0,2010.0


### Variance Threshold Method

In [24]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features

In [45]:
# VarianceThreshold only handles numerical input, so keep numeric columns only.
numeric_kinds = [np.number]
X_train = X_train.select_dtypes(include=numeric_kinds)
X_test = X_test.select_dtypes(include=numeric_kinds)

In [46]:
#display(X_train)
# Shape (rows, columns) of the numeric training set, followed by a blank line.
print("Initial number of numerical columns: ", X_train.shape, end="\n\n")

Initial number of numerical columns:  (1021, 36)



In [47]:
### Default threshold value is 0
### Features with a training-set variance lower than this threshold will be removed.
variance_cutoff = 100
selector = VarianceThreshold(threshold=variance_cutoff).fit(X_train)
selector

VarianceThreshold(threshold=100)

In [48]:
### With indices=True, get_support() returns the integer positions of the
### features that survived the variance filter.
kept_features_indexes = selector.get_support(indices=True)
# Translate those positions into column names.
kept_features = X_train.columns[kept_features_indexes].tolist()

In [49]:
# Apply the selector fitted on the training data to both sets.
X_train, X_test = selector.transform(X_train), selector.transform(X_test)

In [50]:
# Re-attach the surviving column names to the transformed data.
# Note: the new frames get a fresh 0..n-1 row index, so their row labels no
# longer match the labels carried by y_train / y_test.
X_train, X_test = (
    pd.DataFrame(values, columns=kept_features) for values in (X_train, X_test)
)

In [51]:
# Shape after variance filtering (followed by a blank line), then the reduced frame.
print("Final number of numerical columns: ", X_train.shape, end="\n\n")
X_train

Final number of numerical columns:  (1021, 23)



Unnamed: 0,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GrLivArea,GarageYrBlt,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,20.0,,10355.0,1967.0,1967.0,196.0,695.0,0.0,519.0,1214.0,...,1214.0,1967.0,318.0,0.0,111.0,0.0,0.0,0.0,0.0,0.0
1,180.0,35.0,3675.0,2005.0,2005.0,80.0,547.0,0.0,0.0,547.0,...,1072.0,2005.0,525.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0
2,20.0,57.0,9245.0,1994.0,1995.0,0.0,686.0,0.0,304.0,990.0,...,990.0,1996.0,672.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60.0,42.0,26178.0,1989.0,1990.0,293.0,965.0,0.0,245.0,1210.0,...,2519.0,1989.0,628.0,320.0,27.0,0.0,0.0,0.0,0.0,0.0
4,50.0,40.0,4400.0,1920.0,1950.0,0.0,0.0,0.0,648.0,648.0,...,1118.0,1990.0,440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016,70.0,60.0,6882.0,1914.0,2006.0,0.0,0.0,0.0,684.0,684.0,...,1355.0,,0.0,136.0,0.0,115.0,0.0,0.0,0.0,0.0
1017,20.0,63.0,10712.0,1991.0,1992.0,0.0,212.0,0.0,762.0,974.0,...,974.0,,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0
1018,20.0,70.0,8400.0,1968.0,1968.0,168.0,1016.0,0.0,36.0,1052.0,...,1052.0,1968.0,288.0,356.0,0.0,0.0,0.0,0.0,0.0,0.0
1019,190.0,75.0,11625.0,1965.0,1965.0,0.0,841.0,0.0,198.0,1039.0,...,1039.0,1965.0,504.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Correlation Matrix

In [52]:
import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
# Absolute pairwise correlations between the numerical columns.
# The frame also holds string columns; recent pandas versions no longer skip
# them silently in corr(), so restrict to numeric columns explicitly.
c = data_house_prices.select_dtypes(include=np.number).corr().abs()

In [56]:
# Large square canvas so the annotated correlation cells stay readable.
fig, ax = plt.subplots(figsize=(14, 14))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(c, annot=True, ax=ax)

<AxesSubplot:>

In [55]:
# Minimum absolute correlation with SalePrice for a column to be kept.
c_thr = .6
# Columns ranked by absolute correlation with the target, strongest first.
c_last = c['SalePrice'].sort_values(ascending=False)
strong_columns = c_last[c_last > c_thr].index.tolist()
# The top-ranked entry is moved to the end of the list (per the recorded
# output this is SalePrice itself), so the target ends up as the last column.
columns_to_keep = strong_columns[1:] + [strong_columns[0]]
print(columns_to_keep)

data_house_prices[columns_to_keep]

['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'SalePrice']


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,SalePrice
3,6.0,1262.0,2.0,460.0,1262.0,1262.0,181500.0
5,7.0,1786.0,2.0,608.0,920.0,920.0,223500.0
7,7.0,1717.0,3.0,642.0,756.0,961.0,140000.0
9,8.0,2198.0,3.0,836.0,1145.0,1145.0,250000.0
11,5.0,1362.0,2.0,480.0,796.0,796.0,143000.0
...,...,...,...,...,...,...,...
2911,6.0,1647.0,2.0,460.0,953.0,953.0,175000.0
2913,6.0,2073.0,2.0,500.0,1542.0,2073.0,210000.0
2915,7.0,2340.0,1.0,252.0,1152.0,1188.0,266500.0
2917,5.0,1078.0,1.0,240.0,1078.0,1078.0,142125.0


### Recursive feature elimination

In [60]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Start again from a fresh split: the X_train / X_test left by the previous
# sections have already been reduced to a subset of columns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Keep numerical features only. (Do not re-wrap the result with
# columns=X.columns: that would re-add every non-numeric column filled with NaN.)
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

# Drop every column that has at least one missing value in the training set.
### Too drastic, but made on purpose for quick filtering (don't do this in production!!)
cols_to_drop = X_train.columns[X_train.isna().any()]

X_train = X_train.drop(columns=cols_to_drop)
X_test  = X_test.drop(columns=cols_to_drop)

#display(X_train)

lm = LinearRegression()

### Step is how many features are removed at each iteration
selector = RFE(lm, n_features_to_select=8, step=1, verbose=1)
selector.fit(X_train, y_train)

### get_support() returns a boolean mask marking the features that were not removed
kept_features = list(X_train.columns[selector.get_support()])

# transform() returns plain arrays; re-attach the selected column names.
# Note: the new frames get a fresh 0..n-1 row index, so their row labels no
# longer match the labels carried by y_train / y_test.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features)
X_test  = pd.DataFrame(selector.transform(X_test), columns=kept_features)

print("Final selected features: ")
display(X_train)

Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Final selected features: 


Unnamed: 0,OverallQual,BsmtFullBath,BsmtHalfBath,FullBath,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars
0,5.0,0.0,0.0,2.0,1.0,5.0,1.0,1.0
1,5.0,1.0,0.0,1.0,1.0,5.0,0.0,2.0
2,5.0,0.0,1.0,1.0,1.0,5.0,0.0,2.0
3,7.0,1.0,0.0,2.0,1.0,9.0,2.0,2.0
4,6.0,0.0,0.0,1.0,1.0,6.0,0.0,2.0
...,...,...,...,...,...,...,...,...
1016,6.0,0.0,0.0,1.0,1.0,7.0,0.0,0.0
1017,5.0,0.0,0.0,1.0,1.0,5.0,0.0,0.0
1018,5.0,1.0,0.0,1.0,1.0,5.0,0.0,1.0
1019,5.0,1.0,0.0,1.0,1.0,6.0,0.0,2.0


### Embedded Methods

In [61]:
# Fresh 70/30 split (same seed as before) for the embedded-method experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Keep numerical features only.
numeric_kinds = [np.number]
X_train = X_train.select_dtypes(include=numeric_kinds)
X_test = X_test.select_dtypes(include=numeric_kinds)

In [67]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression 

### model=Lasso() 
### model.fit(X_train, y_train) 