---

<center> <h1> House Prices - Advanced Regression Techniques </h1> </center>
<br>

---


In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

---
##### Data Wrangling, Exploration and EDA

In [91]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [92]:
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

In [95]:
df_train["SalePrice"]

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1456, dtype: float64

In [93]:
df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train['SalePrice']<300000)].index, inplace=True)

In [85]:
target = df_train.SalePrice.copy()

In [14]:
df = pd.concat((df_train, df_test)).reset_index(drop=True)
df.drop(['SalePrice'], axis=1, inplace=True)

all_data size is : (2915, 80)


In [16]:
for col in ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"] :
    df[col] = df[col].fillna("No")

In [17]:
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    df[col] = df[col].fillna('No')

In [18]:
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    df[col] = df[col].fillna(0)

In [19]:
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
    df[col] = df[col].fillna(0)

In [20]:
for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    df[col] = df[col].fillna('No')

In [21]:
df["MasVnrType"] = df["MasVnrType"].fillna("None")
df["MasVnrArea"] = df["MasVnrArea"].fillna(0)

In [23]:
df['MSZoning'] = df['MSZoning'].fillna(df['MSZoning'].mode()[0])

In [24]:
df["Functional"] = df["Functional"].fillna("Typ")

In [25]:
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

In [27]:
df['KitchenQual'] = df['KitchenQual'].fillna(df['KitchenQual'].mode()[0])

In [28]:
df['MSSubClass'] = df['MSSubClass'].fillna("No")
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0])
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

In [29]:
df.drop(["Utilities"], axis=1, inplace=True)

In [30]:
for col in ["MSSubClass", "OverallCond", "YrSold", "MoSold"]:
    df[col] = df[col].apply(str)

In [31]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(df[c].values)) 
    df[c] = lbl.transform(list(df[c].values))

In [32]:
numeric_feats = df.dtypes[df.dtypes != "object"].index
skewed_feats = df[numeric_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
skewed_features = skewness.index
lam = 0.15
for col in skewed_features:
    df[col] = boxcox1p(df[col], lam)

Unnamed: 0,Skew
MiscVal,21.932147
PoolArea,18.701829
LotArea,13.123758
LowQualFinSF,12.080315
3SsnPorch,11.368094
LandSlope,4.97135
KitchenAbvGr,4.298845
BsmtFinSF2,4.142863
EnclosedPorch,4.000796
ScreenPorch,3.943508


In [34]:
df.drop(["Id"], axis=1, inplace=True)

In [35]:
df = pd.get_dummies(df)

In [36]:
train = df[:df_train.shape[0]]
test = df[df_train.shape[0]:]

In [37]:
num_vars = train.select_dtypes(include=['int64','float64']).columns

In [38]:
scaler = StandardScaler()
train[num_vars] = scaler.fit_transform(train[num_vars])
test[num_vars] = scaler.transform(test[num_vars])

---

### Training Models:

In [96]:
target

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1456, dtype: float64

In [39]:
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42)

---
### Trying  Ridge Regression:

In [40]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
                 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
                 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100]
         }


ridge = Ridge()

folds = 5
model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_squared_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)    

model_cv.fit(X_train, y_train) 

Fitting 5 folds for each of 26 candidates, totalling 130 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 130 out of 130 | elapsed:    2.0s finished


GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3,
                                   0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
                                   4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50,
                                   100]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [41]:
ridge = model_cv.best_estimator_

In [42]:
y_train_pred_ridge = ridge.predict(X_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred_ridge))

0.9373187358598052


In [43]:
y_test_pred_ridge = ridge.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred_ridge))

0.9071329515522366


In [44]:
print ('RMSE Validation is: \n', mean_squared_error(y_test, y_test_pred_ridge))

RMSE Validation is: 
 0.014667279857542213


---

### Trying Lasso Regression

In [45]:
params = {'alpha': [0.00005, 0.0001, 0.001, 0.008, 0.01]}
lasso = Lasso()

model_cv_l = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'neg_mean_squared_error', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

model_cv_l.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.8s finished


GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [5e-05, 0.0001, 0.001, 0.008, 0.01]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [46]:
lasso = model_cv_l.best_estimator_

In [47]:
y_train_pred_lasso = lasso.predict(X_train)
print(r2_score(y_true=y_train, y_pred=y_train_pred_lasso))

0.9314932638688753


In [48]:
y_test_pred_lasso = lasso.predict(X_test)
print(r2_score(y_true=y_test, y_pred=y_test_pred_lasso))

0.9053962604610747


In [49]:
print ('RMSE Validation is: \n', mean_squared_error(y_test, y_test_pred_lasso))

RMSE Validation is: 
 0.014941570197182978


----

### Preparing submission:

In [101]:
test.columns[40:60]

Index(['GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscVal', 'MoSold', 'YrSold', 'MSZoning_C (all)',
       'MSZoning_FV'],
      dtype='object')

In [50]:
preds = np.exp(ridge.predict(test))

In [51]:
predictions = pd.DataFrame({'Id': df_test['Id'] ,'SalePrice': preds })

In [52]:
predictions.to_csv("preds.csv",index=False)

In [53]:
predictions.SalePrice

0       116228.506408
1       155733.849045
2       181324.042705
3       192517.325108
4       190389.584537
            ...      
1454     85492.716876
1455     81710.933270
1456    169910.576369
1457    121431.552289
1458    223529.422606
Name: SalePrice, Length: 1459, dtype: float64