# Kaggle Competition: House Prices: Advanced Regression Techniques
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
## Part 1: Machine Learning
### Outline:
1. Preparation
2. Testing and Selecting Base Models
3. Finetuning of the Best Model
4. Comparison with a simple Deep Learning Network (DLN)

In [None]:
# Preparation: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, Normalizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score, make_scorer
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from kaggle_scorer import rmsle, rmsle_validation
import keras
from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten,Dropout
import tensorflow as tf

In [None]:
# Preparation: Load the processed training set, using the 'Id' column as row index
path = 'train_processed.csv'
read_options = {'index_col': 'Id'}
df = pd.read_csv(path, **read_options)

# Trailing expression: the notebook renders the first rows as a quick check
df.head()

In [None]:
# Preparation: Separate the target column from the features and hold out 20 %
# of the rows for testing
target_column = 'SalePrice_log'
X = np.array(df.drop(columns=target_column))
y = np.array(df[target_column])

# train_test_split returns the four arrays in the order unpacked below
split_parts = train_test_split(X, y, train_size=0.80, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Preparation: Prepare Pipelines
from sklearn.base import clone

# Unfitted pre-processing templates. The module-level names are kept because
# encoder, norm and scaler are used again when the DLN pipeline is built.
poly_fit = PolynomialFeatures(degree=2)
encoder = OneHotEncoder(handle_unknown='ignore')
norm = Normalizer()
scaler = StandardScaler(with_mean=False)

# Base regressors to compare
lin_reg = LinearRegression()
rand_for = RandomForestRegressor()
ada_boost = AdaBoostRegressor()
grad_boost = GradientBoostingRegressor()


def _make_pipe(step_name, estimator):
    """Return a pipeline: one-hot encode -> normalize -> scale -> ``estimator``.

    Each call clones the pre-processing templates so that every pipeline owns
    its own transformer instances. Without this, all pipelines would hold the
    very same objects and fitting one pipeline would silently refit the
    transformers used by the others.

    ``step_name`` is the name of the final step; it is the prefix used to
    address the estimator's hyperparameters (e.g. ``'GradientBoosting__...'``).
    """
    return Pipeline([('OneHotEncoder', clone(encoder)),
                     ('Normalizer', clone(norm)),
                     ('StandardScaler', clone(scaler)),
                     (step_name, estimator)])


lin_pipe = _make_pipe('LinearRegression', lin_reg)
rand_for_pipe = _make_pipe('RandomForest', rand_for)
ada_boost_pipe = _make_pipe('AdaBoost', ada_boost)
grad_boost_pipe = _make_pipe('GradientBoosting', grad_boost)

In [None]:
# Testing and Selecting Base Models: Defining test function
def test_model(pipeline,X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test):
    """Fit ``pipeline`` and score it on the training and the test split.

    Parameters
    ----------
    pipeline
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Data used for fitting and for the training score. The defaults are
        the module-level arrays as they exist when this function is defined.
    X_test, y_test
        Data used for the test score; same note on the defaults.

    Returns
    -------
    numpy.ndarray
        ``(fit_time, train_score, test_score)`` where ``fit_time`` is the
        wall-clock duration of ``fit`` in seconds and both scores are the
        values returned by ``rmsle``.
    """
    # Time only the fit call, not the predictions
    fit_started = dt.datetime.now()
    pipeline.fit(X_train, y_train)
    fit_time = (dt.datetime.now() - fit_started).total_seconds()

    train_score = rmsle(y_train, pipeline.predict(X_train))
    test_score = rmsle(y_test, pipeline.predict(X_test))

    return np.array((fit_time, train_score, test_score))


In [None]:
# Testing and Selecting Base Models: Testing
pipe_list = [lin_pipe,rand_for_pipe,ada_boost_pipe,grad_boost_pipe]

# One row per pipeline (same order as pipe_list);
# columns are fit time, training score and test score
results = np.vstack([test_model(pipe) for pipe in pipe_list])

In [None]:
# Testing and Selecting Base Models: Visualize Test Results
names = ['Linear','RFR','ABR','GBR']
titles = ['Training Time','Training Accuracy','Test Accuracy']
y_titles = ['Seconds','RMSLE','RMSLE']

# Create the three panels in one call. Previously a single full-figure axes
# was created first and never drawn on, and the loop variable reused the name
# of the `titles` list.
fig, axes = plt.subplots(1, 3, figsize=(10,5))

# Column `column` of `results` is plotted in the panel at the same position
for column, (ax, title, y_title) in enumerate(zip(axes, titles, y_titles)):
    ax.bar(x=names, height=results[:, column])
    ax.set_title(title)
    ax.set_ylabel(y_title)

fig.tight_layout()
plt.show()

It is apparent that the linear model takes the least training time, thanks to its simplicity. Although it performs well on both the training and the test set, it is the least well-fitting model. Luckily, the second-fastest model (GBR) is also the best model in terms of test accuracy, which is why we will go with that model for now.

In [None]:
# Finetuning of the best model: Define hyperparameters to be tuned
# Keys are '<pipeline step name>__<parameter>', so every entry targets the
# 'GradientBoosting' step of grad_boost_pipe.
param_grid = dict(
    GradientBoosting__learning_rate=[0.001, 0.005, 0.01, 0.1],
    GradientBoosting__n_estimators=[50, 100, 200],
    GradientBoosting__max_depth=[1, 3, 5],
)

# Exhaustive search over the grid with 3-fold cross-validation
grad_boost_pipe_cv = GridSearchCV(
    estimator=grad_boost_pipe,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)

In [None]:
# Finetuning of the best model: Fit final model
# test_model() calls fit() on the object it receives, so the grid search is
# fitted (and timed) there. A separate fit beforehand would run the whole
# search twice without changing the outcome.
final_results = test_model(grad_boost_pipe_cv)

In [None]:
# Finetuning of the best model: Display final results
# final_results holds (fit time, training score, test score) in that order,
# so the templates below are paired with it position by position.
report_templates = ('Total Runtime: {}',
                    'Training Score (RMSLE): {}',
                    'Test Score (RMSLE) {}')

for template, value in zip(report_templates, final_results):
    print(template.format(value))

In [None]:
# Best model versus Deep Learning Network: Special pipeline
# Same pre-processing as the ML pipelines, followed by a reduction to
# 7 components instead of a final estimator.
dln_pipe = Pipeline([('OneHotEncoder',encoder),
                     ('Normalizer',norm),
                     ('StandardScaler',scaler),
                     ('TruncatedSVD',TruncatedSVD(n_components=7))])

# Fit the pre-processing on the training data only
input_train = dln_pipe.fit_transform(X_train)
labels_train = y_train.reshape(-1,1)

# Apply the already-fitted steps to the test data. Refitting here would
# produce test features in a different space than the one the network is
# trained on and would leak test data into the pre-processing.
input_test = dln_pipe.transform(X_test)
labels_test = y_test.reshape(-1,1)

In [None]:
# Fully connected regression network: 128 -> 256 -> 256 -> 256 -> 1,
# with a Dropout(0.2) layer after every hidden layer
model = Sequential()

# Input layer sized to the number of columns in input_train
model.add(Dense(128, kernel_initializer='normal',
                input_dim=input_train.shape[1], activation='relu'))
model.add(Dropout(0.2))

# Three identical hidden blocks
for _ in range(3):
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.2))

# Single linear output unit for the regression target
model.add(Dense(1, kernel_initializer='normal', activation='linear'))

model.compile(loss='mean_absolute_error',
              optimizer='adam',
              metrics=['mean_absolute_error'])

model.summary()

In [None]:
# Train the network silently; 20 % of the training rows are used for validation
fit_options = dict(epochs=500,
                   batch_size=32,
                   validation_split=0.2,
                   verbose=0,
                   shuffle=True)
model.fit(input_train, labels_train, **fit_options)

In [None]:
# Predict on the transformed test inputs and score the predictions against
# the reshaped test labels with the same rmsle helper used in test_model
yhat = model.predict(input_test)
dln_score = rmsle(labels_test,yhat)

In [None]:
# Compare results of best untuned ML model, best tuned ML model and deep learning network.
# results[-1,-1] is the test score (last column) of the last pipeline in
# pipe_list; the previous slice results[-1:-1] was always empty.
print('Best untuned ML model: {}'.format(results[-1,-1]))
print('Best tuned ML model: {}'.format(final_results[-1]))
print('Best Deep Learning model: {}'.format(dln_score))

In [None]:
# TODO: Use test_processed.csv to make submission-ready predictions.