# Colab preparation
## Skip this section if you are running on your own PC and don't need it

In [None]:
# Install the pinned fastai release; the imports below use `fastai.imports` and `fastai.structured`.
!pip install fastai==0.7.0

In [None]:
# Install Keras, used later to build the neural network.
# NOTE(review): no version is pinned here, unlike the fastai install — consider pinning for reproducibility.
!pip install keras

In [None]:
# Install the Kaggle command-line client, used later to download the competition data.
# NOTE(review): no version is pinned here, unlike the fastai install — consider pinning for reproducibility.
!pip install kaggle

In [None]:
# Enable the autoreload extension in mode 2 so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [None]:
#Importing Libraries for data preparation
import pandas as pd
import numpy as np
from fastai.imports import *
from fastai.structured import *

In [None]:
# Import kaggle.json from google drive
# This snippet will output a link which needs authentication from any google account
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Interactive step: asks the user to authenticate with a Google account.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Search Drive for files named exactly 'kaggle.json'; only the file ids are requested.
results = drive_service.files().list(
    q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with an explicit message instead of an IndexError on the lookup below.
    raise FileNotFoundError("No file named 'kaggle.json' was found in Google Drive")

filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# When several files match, the first search hit is used.
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])

# The context manager closes the file once the download loop finishes (or fails).
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# Owner read/write only. The mode must be an octal literal: decimal 600 equals 0o1130.
os.chmod(filename, 0o600)

In [None]:

!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Download the "bluebook-for-bulldozers" competition files with the Kaggle CLI.
# The following cells expect Train.zip to end up in the current working directory.
!kaggle competitions download -c bluebook-for-bulldozers

In [None]:
# List the working directory to check what the download produced.
!ls

In [None]:
!unzip Train.zip

In [None]:
# List the working directory again to confirm the archive was extracted (Train.csv is read next).
!ls

### Read Data

In [None]:
# Load the raw training data; 'saledate' is parsed into datetimes at read time.
df_raw = pd.read_csv(
    "Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)

### Look at the data

In [None]:
def display_all(df):
    """Display `df` with pandas' row and column display limits temporarily raised to 1000.

    The limits apply only inside this call; `pd.option_context` restores the
    previous settings when the block exits.
    """
    wide_view = pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    )
    with wide_view:
        display(df)

In [None]:
# Show the last rows of the raw data, transposed so that each column is listed as a row.
display_all(df_raw.tail().T)

In [None]:
# NOTE(review): `sample` from the stdlib `random` module is not used by any cell visible here;
# the sampling in the next code cell uses the DataFrame method instead. Candidate for removal.
from random import sample
# Interactive help lookup for `df_raw.sample` — a development aid, not part of the analysis.
?df_raw.sample

We take a random sample of 100,000 records to speed up the training process.

In [None]:
# Draw a fixed-seed subsample so the rows, and every result derived from them, are the
# same on each run. The seed matches the random_state used for the train/validation split.
df_raw = df_raw.sample(n=100000, random_state=42)

In [None]:
# Check the size of the subsample. This must inspect `df_raw`: `df` is only created
# further down by `proc_df`, so referencing it here fails on a fresh kernel.
df_raw.shape

Remember from the Kaggle competition that we're interested in the Log of the SalePrice

In [None]:
# Replace the target column with its natural logarithm.
# Note: this overwrites the column it reads, so re-running the cell applies the log again.
df_raw["SalePrice"] = np.log(df_raw["SalePrice"])

### Data Cleaning and preparation
Add extra date attributes to our dataset

In [None]:
# fastai helper (via `from fastai.structured import *`) applied to the 'saledate' column.
# NOTE(review): the return value is not used, so this presumably modifies df_raw in place — confirm.
add_datepart(df_raw, 'saledate')

Change Strings to categories

In [None]:
# fastai helper (via `from fastai.structured import *`) applied to the whole frame.
# NOTE(review): the return value is not used, so this presumably converts df_raw's
# string columns to categories in place — confirm.
train_cats(df_raw)

Convert categories to their numerical values.

Handle missing values: 
- Take the median for missing continuous variables
- Add extra attributes for missing categorical variables

In [None]:
# Split the prepared frame into the feature table `df` and the target `y` ('SalePrice').
# NOTE(review): `nas` is the helper's third return value, presumably describing how
# missing values were filled — confirm against the fastai documentation.
df, y, nas = proc_df(df_raw, 'SalePrice')

In [None]:
# Show the last rows of the processed features, transposed so that each column is listed as a row.
display_all(df.tail().T)

Create a validation set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state keeps the split stable.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    y,
    test_size=0.30,
    random_state=42,
)

### Train Models
First, let's train our random forest

In [None]:
#import necessary libraries to build model
import random
from sklearn.ensemble import RandomForestRegressor

# Seeds only Python's stdlib RNG; it has no effect on scikit-learn estimators.
random.seed(42)

# `random_state` is what makes the forest itself reproducible.
rf = RandomForestRegressor(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

# Score (R^2) on the held-out validation rows only. Scoring on the full `df` would
# include the rows the forest was trained on and overstate its performance.
rf.score(X_valid, y_valid)

Check how many attributes we have, to create our Neural Network

In [None]:
# (rows, columns) of the processed feature table; the column count is the network's input width.
df.shape

We have 66 attributes, so 66 is our input dimension

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

# Define model
# Read the input width from the training data instead of hard-coding it, so the network
# still matches the features if preprocessing yields a different number of columns.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(100, input_dim=n_features, activation="relu"))
model.add(Dense(50, activation="relu"))
# Single output unit with no activation: the network predicts one continuous value.
model.add(Dense(1))
model.summary() #Print model Summary

We use Mean Squared Error as the loss function, because we're dealing with a regression problem.

We use the Adam optimizer, which uses an adaptive learning rate. Check Keras' documentation for details.

In [None]:
# Compile model: mean squared error is both the training loss and the reported metric.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mean_squared_error"],
)

Train the Neural Network.

The large discrepancy in the loss function can be explained by the adaptive learning rate. At first, the learning rate is too large, resulting in no convergence. At a certain epoch, the learning rate is at a level which is useful to train the Neural Network.

Compared to traditional Machine Learning techniques like Random Forests and Boosting, Neural Networks are a black box. We're not really sure why it works the way it does. This makes understanding our training process harder. 

It's more difficult to use our understanding of the data to optimize a Neural Network.

In [None]:
# Fit Model
# Trains on the training split for 100 epochs; no validation data is passed in this call.
model.fit(X_train, y_train, epochs=100)

In [None]:
# Evaluation while fitting the model
# Runs 10 more epochs, this time also passing the validation split.
# NOTE(review): `model` was already fit in the previous cell, so this presumably continues
# from those weights rather than starting from scratch — confirm that is intended.
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

### Model comparison
Compare the root mean squared error (RMSE) on the validation set for the Random Forest and the Neural Network

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Random Forest: predict on the validation split and report the root mean squared error.
y_valid_rf = rf.predict(X_valid)
mse_rf = mean_squared_error(y_valid, y_valid_rf)
score = np.sqrt(mse_rf)
print(score)

In [None]:
# Neural Network: predict on the validation split and report the root mean squared error.
y_valid_nn = model.predict(X_valid)
mse_nn = mean_squared_error(y_valid, y_valid_nn)
score = np.sqrt(mse_nn)
print(score)

We see that, out of the box, the random forest performs a lot better than the neural network.