In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, f_regression, SelectKBest
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [18]:
# Read the prepared table from disk, then display its (rows, columns) shape.
df_numeric = pd.read_csv(filepath_or_buffer='data/df_numeric.csv')
df_numeric.shape

(1458, 60)

In [19]:
# Split the target from the features.
# Bracket access is used instead of attribute access so the lookup cannot be
# confused with a DataFrame attribute, and the drop returns a new frame rather
# than mutating the loaded one in place.
y = df_numeric["SalePrice"]
df_numeric = df_numeric.drop(columns="SalePrice")

## Removing Features With Small Variance

In [20]:
# Low-variance filter with a threshold of 0.1: fit it on the features, then apply it.
# The transformed result is an unlabeled array; column names are restored in a later cell.
vt = VarianceThreshold(threshold=0.1)
vt.fit(df_numeric)
df_transformed = vt.transform(df_numeric)

In [21]:
# (rows, columns) left after the variance filter.
df_transformed.shape

(1458, 50)

In [22]:
# get_support() on the fitted VarianceThreshold returns a boolean mask with one
# entry per input column: True for every column the filter kept.
kept_mask = vt.get_support()
selected_columns = df_numeric.columns[kept_mask]

# The transformed array carries no labels, so the kept column names are
# re-attached here. The original row index is passed as well so the features
# stay aligned with y even if the data does not use a default 0..n-1 index.
df_transformed = pd.DataFrame(
    df_transformed, columns=selected_columns, index=df_numeric.index
)

## Removing Correlated Features

The goal of this part is to remove one feature from each highly correlated pair.

We are going to do this in 3 steps:

1. Calculate a correlation matrix
2. Get pairs of highly correlated features
3. Remove one feature from each correlated pair

In [25]:
# Step 1: absolute pairwise correlations between the remaining features.
df_corr = df_transformed.corr().abs()

# Step 2: collect every highly correlated pair exactly once.
# Only positions above the diagonal (row < col) are kept, which skips both the
# self-correlations on the diagonal and the mirrored duplicate of each pair.
CORR_THRESHOLD = 0.8
row_pos, col_pos = np.where(df_corr > CORR_THRESHOLD)
indices = [
    (df_corr.index[row], df_corr.columns[col])
    for row, col in zip(row_pos, col_pos)
    if row < col
]

# Step 3: drop the second feature of each pair.
# A feature can be the second member of several pairs, so the names are
# de-duplicated (order preserved) and removed in one non-mutating call instead
# of dropping in place pair by pair and swallowing the resulting KeyError.
columns_to_drop = list(dict.fromkeys(second for _, second in indices))
df_transformed = df_transformed.drop(columns=columns_to_drop)

In [26]:
# Show the (kept, dropped) feature-name pairs found in the correlation step.
print(indices)

[('TotalBsmtSF', '1stFlrSF'), ('GrLivArea', 'TotRmsAbvGrd'), ('GrLivArea', '1stFlr_2ndFlr_SF'), ('TotRmsAbvGrd', '1stFlr_2ndFlr_SF'), ('GarageCars', 'GarageArea'), ('GarageQual', 'GarageCond')]


In [27]:
# (rows, columns) left after removing the correlated features.
df_transformed.shape

(1458, 45)

## Forward Regression

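We have removed the features with no information and the correlated features so far. The last thing we will do before modeling is to select the k best features in terms of their relationship with the target variable. Note that `SelectKBest` with `f_regression` is a univariate filter method rather than a forward (stepwise) wrapper: each feature is scored against the target independently with an F-test, and the `k` highest-scoring features are kept: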

In [29]:
# Score every feature against y with f_regression and keep the 10 highest-scoring ones.
skb = SelectKBest(score_func=f_regression, k=10)
skb.fit(df_transformed, y)
X = skb.transform(df_transformed)

In [30]:
# Boolean mask over df_transformed's columns: True for the 10 features SelectKBest kept.
top_mask = skb.get_support()
top_columns = df_transformed.columns[top_mask]

# Re-attach the selected column names and the row index to the unlabeled array,
# so X stays aligned with y.
X = pd.DataFrame(X, columns=top_columns, index=df_transformed.index)
X.shape

(1458, 10)

# Linear Regression

## statsmodels

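statsmodels does not add an intercept automatically, so we have to add a constant column to our feature matrix if we want the intercept to be estimated. If we don't, the model is fit with the intercept fixed at 0 (a regression through the origin).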

In [34]:
# Put a constant column in front of the features so OLS can estimate an intercept.
X = sm.add_constant(X, prepend=True)

In [35]:
# Build the (not yet fitted) OLS model: y is the response, X the design matrix.
lin_reg = sm.OLS(endog=y, exog=X)

In [36]:
# Inspect the class of the model object created above.
type(lin_reg)

statsmodels.regression.linear_model.OLS

In [37]:
# Estimate the coefficients, then render the regression report as plain text.
model = lin_reg.fit()
print_model = model.summary()
print(print_model.as_text())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     732.0
Date:                Wed, 17 Nov 2021   Prob (F-statistic):               0.00
Time:                        20:34:27   Log-Likelihood:                -17206.
No. Observations:                1458   AIC:                         3.443e+04
Df Residuals:                    1447   BIC:                         3.449e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const        -8.992e+05   8.93e+04    -10.069   

## sklearn

In [39]:
# sklearn fits its own intercept (fit_intercept=True is the default, spelled out here).
# NOTE: X still contains the 'const' column added for statsmodels, so that column
# receives a coefficient of 0 in regressor.coef_.
regressor = LinearRegression(fit_intercept=True)
regressor.fit(X, y)

LinearRegression()

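In sklearn, we don't have to add a constant column to the dataset: `LinearRegression` estimates the intercept itself when its `fit_intercept` parameter is `True`, which is the default. Because `X` here still contains the `const` column added for statsmodels, sklearn assigns that column a coefficient of 0 and stores the intercept separately in `regressor.intercept_`.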

In [41]:
# One coefficient per column of X, in column order; the intercept is not included here.
print(regressor.coef_)

[     0.           5507.54189138    392.2863556   14466.78601472
    920.78618122     42.13854481     66.85496149 -11218.59562134
  11469.89475761   9314.43585305   1078.19597724]


In [42]:
# R^2 of the fitted model, evaluated on the same data it was trained on.
regressor.score(X,y)

0.83494920713919