## Preparation

You can skip the preparation: click <a href="#here">here</a> to jump to the `ml_forest` part.

### Load Data

In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Training split: this is the frame handed to PipeInit later in the notebook.
train_path = "./data/train.csv"
kaggle_train_df = pd.read_csv(train_path)

# Test split, read from the sibling CSV in the same data directory.
test_path = "./data/test.csv"
kaggle_test_df = pd.read_csv(test_path)

### Select Some Columns

In [2]:
# Numerical features: each one gets its own standardizing node further down.
# GrLivArea (above-ground living area in the Kaggle House Prices data) is a
# continuous measurement, so it is listed here rather than among the
# categorical columns, where it would be one-hot encoded value by value.
num_cols = [
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    'GrLivArea'
]

# Categorical features: each one gets its own one-hot encoding node further down.
# `Id` is deliberately not selected: it is a row identifier, so one-hot
# encoding it would produce one indicator column per row and carry no signal
# that generalizes to unseen rows.
cat_cols = [
    "CentralAir", "ExterQual", "LandSlope", "Condition2", "ExterCond",
    "LandContour", "HouseStyle", "BldgType", "RoofStyle", "Foundation",
    "RoofMatl", "Condition1"
]

# Maps every selected column name to a single-element list holding that name;
# this dict is what gets passed to PipeInit as `col_selected`.
col_selected = {col: [col] for col in num_cols + cat_cols}

# Name of the target column.
col_y = "SalePrice"

### Specify Where the Files Are to Be Saved

In [None]:
# Everything produced by this notebook is filed under this project name.
project = "ml_forest_dev_sample"

# Both storage locations live directly under the current working directory.
cwd = os.getcwd()
dbhost = "/".join((cwd, "local_documents"))
home_path = "/".join((cwd, "local_storage"))

# `db` and `filepaths` are handed to PipeInit further down.
db = dict(host=dbhost, project=project)
filepaths = [dict(home=home_path, project=project)]

### Load the Sklearn Classes to Be Used

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

## Modeling <p><a name="here"></a></p>

### Wrap Up the Sklearn Classes for Usage in `ml_forest`

- Wrapping the classes this way lets the models on the first layer and their predictions be saved and indexed.

In [None]:
from ml_forest.core.elements.ftrans_base import SklearnRegressor
from ml_forest.core.elements.ftrans_base import SklearnUnsupervised

class GenerateStandardScalor(SklearnUnsupervised):
    """Wrap scikit-learn's ``StandardScaler`` as an ``ml_forest`` unsupervised transformer.

    The wrapper takes no arguments; it only tells the ``SklearnUnsupervised``
    base class which estimator class to use, via ``model_type``.
    """
    def __init__(self):
        super(GenerateStandardScalor, self).__init__(model_type=StandardScaler)
        # Python name-mangles this to ``_GenerateStandardScalor__essentials``.
        # NOTE(review): presumably read by the ml_forest base classes when the
        # transformer is saved/indexed -- confirm against the library.
        self.__essentials = {}
    
class GenerateOneHotEncode(SklearnUnsupervised):
    """Wrap scikit-learn's ``OneHotEncoder`` as an ``ml_forest`` unsupervised transformer.

    The wrapper takes no arguments. It passes ``model_type=OneHotEncoder`` and
    the fixed keyword ``sparse=False`` to the ``SklearnUnsupervised`` base class.
    """
    def __init__(self):
        # NOTE(review): ``sparse=False`` is presumably forwarded to OneHotEncoder
        # to get dense output -- confirm. Newer scikit-learn releases renamed
        # this argument to ``sparse_output``.
        super(GenerateOneHotEncode, self).__init__(model_type=OneHotEncoder, sparse=False)
        # Python name-mangles this to ``_GenerateOneHotEncode__essentials``.
        # NOTE(review): presumably read by the ml_forest base classes when the
        # transformer is saved/indexed -- confirm against the library.
        self.__essentials = {}

class GenerateLasso(SklearnRegressor):
    """Wrap scikit-learn's ``Lasso`` as an ``ml_forest`` regressor.

    Args:
        **kwargs: Forwarded unchanged to ``SklearnRegressor.__init__`` together
            with ``model_type=Lasso``. Presumably these end up as ``Lasso``
            hyper-parameters (the notebook later passes ``alpha``) -- confirm.
    """
    def __init__(self, **kwargs):
        super(GenerateLasso, self).__init__(model_type=Lasso, **kwargs)
        # Python name-mangles this to ``_GenerateLasso__essentials``.
        # NOTE(review): presumably read by the ml_forest base classes when the
        # model is saved/indexed -- confirm against the library.
        self.__essentials = {}
        
class GenerateSVR(SklearnRegressor):
    """Wrap scikit-learn's ``SVR`` as an ``ml_forest`` regressor.

    Args:
        **kwargs: Forwarded unchanged to ``SklearnRegressor.__init__`` together
            with ``model_type=SVR``. Presumably these end up as ``SVR``
            hyper-parameters (the notebook later passes ``degree`` and ``C``)
            -- confirm.
    """
    def __init__(self, **kwargs):
        
        super(GenerateSVR, self).__init__(model_type=SVR, **kwargs)
        # Python name-mangles this to ``_GenerateSVR__essentials``.
        # NOTE(review): presumably read by the ml_forest base classes when the
        # model is saved/indexed -- confirm against the library.
        self.__essentials = {}
        
class GenerateRF(SklearnRegressor):
    """Wrap scikit-learn's ``RandomForestRegressor`` as an ``ml_forest`` regressor.

    Args:
        **kwargs: Forwarded unchanged to ``SklearnRegressor.__init__`` together
            with ``model_type=RandomForestRegressor``. Presumably these end up
            as random-forest hyper-parameters -- confirm.
    """
    def __init__(self, **kwargs):
        
        super(GenerateRF, self).__init__(model_type=RandomForestRegressor, **kwargs)
        # Python name-mangles this to ``_GenerateRF__essentials``.
        # NOTE(review): presumably read by the ml_forest base classes when the
        # model is saved/indexed -- confirm against the library.
        self.__essentials = {}

### Initialize the Pipe

In [None]:
from ml_forest.pipeline.pipe_init import PipeInit

# Entry point of the pipeline: binds the training frame, the target column,
# the selected feature columns and the storage locations together.
train_init = PipeInit(
    data=kaggle_train_df,
    col_y=col_y,
    col_selected=col_selected,
    lst_layers=[2, 3],  # NOTE(review): meaning of these two numbers is defined by PipeInit -- confirm
    db=db,
    filepaths=filepaths,
)

# Handles that every node built below is constructed from.
core_docs, init_fnodes, init_lnode = (
    train_init.core,
    train_init.init_fnodes,
    train_init.init_lnode,
)

### Code Below Creates the Following Stacking Structure 

<img src="./img/img1.png" height=250 width=250>

In [None]:
from ml_forest.pipeline.nodes.stacking_node import FNode, LNode

# One node per categorical column; every node gets its own, freshly created
# one-hot encoder wrapper and the column's initial feature node as sole input.
dummied_fnodes = [
    FNode(core_docs, [init_fnodes[col]], GenerateOneHotEncode())
    for col in cat_cols
]

# One node per numerical column; every node gets its own, freshly created
# standardizer wrapper and the column's initial feature node as sole input.
num_fnodes = [
    FNode(core_docs, [init_fnodes[col]], GenerateStandardScalor())
    for col in num_cols
]

# Node that generates a new feature from the one-hot encoded categorical features by lasso
lasso = GenerateLasso()
lasso_node = FNode(core_docs, dummied_fnodes, lasso, init_lnode)

# Node that generates a new feature from the standardized numerical features by rf
# NOTE(review): no random_state is passed, so the forest is presumably not
# reproducible from run to run -- confirm whether ml_forest seeds it.
rf = GenerateRF()
rf_node = FNode(core_docs, num_fnodes, rf, init_lnode)

# Node that makes the final output by SVR. Its inputs are the two nodes built
# above (rf_node and lasso_node), not the standardized / one-hot encoded
# features themselves.
svr = GenerateSVR()
svr_node = FNode(core_docs, [rf_node, lasso_node], svr, init_lnode)

### Train the Whole Stacking Models Defined Above

- **Notice that the metadata of all newly trained models will be saved when the code in the cell below is executed**

In [None]:
import time
from ml_forest.pipeline.links.knitor import Knitor

kn = Knitor()

# Time the whole knit of the stack that ends in svr_node.
start = time.time()
# f_knit returns a pair; note that the second element rebinds the name `svr`.
svr_feature, svr = kn.f_knit(svr_node)
elapsed = time.time() - start
print(elapsed)

# Last expression of the cell: displayed as the cell output.
svr_feature.values

### Now Retrain the Stacking Structure with Only `lasso` and `svr` Changed

<img src="./img/img2.png" height=250 width=250>

- **Notice that the metadata of `lasso` and `svr` will be saved when the code in the cell below is executed**

In [None]:
# Same one-hot encoded inputs as before, but a Lasso with an explicit alpha.
lasso = GenerateLasso(alpha=0.01)
lasso_node = FNode(core_docs, dummied_fnodes, lasso, init_lnode)

# rf_node is reused unchanged; only the lasso input and the SVR settings differ.
# NOTE(review): scikit-learn's SVR only uses `degree` with the 'poly' kernel,
# and no kernel is set here -- confirm that `degree=1` is intended.
svr = GenerateSVR(degree=1, C=10)
svr_node = FNode(core_docs, [rf_node, lasso_node], svr, init_lnode)

kn = Knitor()

# Time the knit of the modified stack.
start = time.time()
# f_knit returns a pair; note that the second element rebinds the name `svr`.
svr_feature, svr = kn.f_knit(svr_node)
elapsed = time.time() - start
print(elapsed)

# Last expression of the cell: displayed as the cell output.
svr_feature.values

### Now Retrain the Stacking Structure with Only `svr` Changed

<img src="./img/img3.png" height=250 width=250>

- **Notice that the metadata of `svr` will be saved when the code in the cell below is executed**

In [None]:
# New SVR settings on top of the already-built nodes.
# NOTE(review): the inputs here are the standardized numerical nodes plus
# lasso_node, whereas the two earlier cells fed [rf_node, lasso_node] --
# confirm this matches the structure shown in the picture above.
# NOTE(review): scikit-learn's SVR only uses `degree` with the 'poly' kernel,
# and no kernel is set here -- confirm that `degree=1` is intended.
svr = GenerateSVR(degree=1, C=100)
svr_node = FNode(core_docs, num_fnodes + [lasso_node], svr, init_lnode)

kn = Knitor()

# Time the knit of the modified stack.
start = time.time()
# f_knit returns a pair; note that the second element rebinds the name `svr`.
svr_feature, svr = kn.f_knit(svr_node)
elapsed = time.time() - start
print(elapsed)

# Last expression of the cell: displayed as the cell output.
svr_feature.values