In [None]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSVs from the notebook-relative ./data directory.
# `path` is reused for both reads, so after this cell it refers to the test file.
path = "./data/train.csv"
kaggle_train_df = pd.read_csv(path)
path = "./data/test.csv"
kaggle_test_df = pd.read_csv(path)

### Select Some Columns

In [None]:
# Columns handled as numbers; every entry gets its own standardizing node later on.
num_cols = [
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    # Above-grade living area is a continuous square-footage measurement, so it
    # belongs with the scaled numeric columns rather than the one-hot encoded ones.
    'GrLivArea',
]

# Columns handled as categories; every entry gets its own one-hot encoding node later on.
# `Id` is intentionally not listed: in the Kaggle House Prices data it is a per-row
# identifier, and one-hot encoding it would create roughly one indicator column per
# row, letting the downstream model memorize individual rows.
cat_cols = [
    "CentralAir", "ExterQual", "LandSlope", "Condition2", "ExterCond",
    "LandContour", "HouseStyle", "BldgType", "RoofStyle", "Foundation",
    "RoofMatl", "Condition1"
]

# Each selected feature maps its name to the single source column it is built from.
col_selected = {key: [key] for key in num_cols + cat_cols}
# Name of the target column.
col_y = "SalePrice"

### Specify Where the Files Are to Be Saved

In [None]:
# Both storage roots sit directly under the directory the notebook is run from.
cwd = os.getcwd()
dbhost = "/".join([cwd, "local_documents"])
home_path = "/".join([cwd, "local_storage"])

# Project name shared by the document store and the file storage settings.
project = "ml_forest_dev_sample"

# Settings handed to the pipeline: where documents live and where files are saved.
db = dict(host=dbhost, project=project)
filepaths = [dict(home=home_path, project=project)]

### Load the Sklearn Classes to Be Used

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

### Wrap Up the Sklearn Classes for Usage in `ml_forest`

In [None]:
from ml_forest.core.elements.ftrans_base import SklearnRegressor
from ml_forest.core.elements.ftrans_base import SklearnUnsupervised

class GenerateStandardScalor(SklearnUnsupervised):
    """`ml_forest` unsupervised transform configured with sklearn's `StandardScaler`."""

    def __init__(self):
        # The sklearn class itself (not an instance) is passed to the base initializer.
        super().__init__(model_type=StandardScaler)
        # Stored under the name-mangled attribute `_GenerateStandardScalor__essentials`.
        # NOTE(review): presumably inspected by ml_forest when documenting the
        # transform — confirm against the base class.
        self.__essentials = {}
    
class GenerateOneHotEncode(SklearnUnsupervised):
    """`ml_forest` unsupervised transform configured with sklearn's `OneHotEncoder`."""

    def __init__(self):
        # `sparse=False` is passed through the base initializer together with the class.
        # NOTE(review): scikit-learn 1.2 renamed this option to `sparse_output` and
        # later removed the old name — confirm the scikit-learn version in use.
        super().__init__(model_type=OneHotEncoder, sparse=False)
        # Stored under the name-mangled attribute `_GenerateOneHotEncode__essentials`.
        # NOTE(review): presumably inspected by ml_forest — confirm against the base class.
        self.__essentials = {}

class GenerateLasso(SklearnRegressor):
    """`ml_forest` regressor transform configured with sklearn's `Lasso`.

    Any keyword arguments are passed on to the base initializer unchanged.
    """

    def __init__(self, **kwargs):
        super().__init__(model_type=Lasso, **kwargs)
        # Stored under the name-mangled attribute `_GenerateLasso__essentials`.
        # NOTE(review): presumably inspected by ml_forest — confirm against the base class.
        self.__essentials = {}
        
class GenerateSVR(SklearnRegressor):
    """`ml_forest` regressor transform configured with sklearn's `SVR`.

    Any keyword arguments are passed on to the base initializer unchanged.
    """

    def __init__(self, **kwargs):
        super().__init__(model_type=SVR, **kwargs)
        # Stored under the name-mangled attribute `_GenerateSVR__essentials`.
        # NOTE(review): presumably inspected by ml_forest — confirm against the base class.
        self.__essentials = {}

### Initialize the Pipe

In [None]:
from ml_forest.pipeline.pipe_init import PipeInit

# Initialize the pipeline from the training frame, the target column name and the
# feature-to-columns mapping, using the storage settings defined earlier.
# NOTE(review): the meaning of lst_layers=[2,3] is defined by PipeInit — presumably
# per-layer fold counts for stacking; confirm against ml_forest.
train_init = PipeInit(
    data=kaggle_train_df, col_y=col_y, col_selected=col_selected,
    lst_layers=[2,3],
    db=db, filepaths = filepaths
)

# Handles reused by every later cell: the core object, the initial feature nodes
# (indexed by column name below) and the initial label node.
core_docs = train_init.core
init_fnodes = train_init.init_fnodes
init_lnode = train_init.init_lnode

### Creating Nodes

In [None]:
from ml_forest.pipeline.nodes.stacking_node import FNode, LNode

# One node per categorical column: the column's initial node is fed to its own
# freshly constructed one-hot encoder.
dummied_fnodes = [
    FNode(core_docs, [init_fnodes[col]], GenerateOneHotEncode())
    for col in cat_cols
]

# One node per numerical column: the column's initial node is fed to its own
# freshly constructed standardizer.
num_fnodes = [
    FNode(core_docs, [init_fnodes[col]], GenerateStandardScalor())
    for col in num_cols
]

# Lasso node built on top of all one-hot encoded categorical nodes; the initial
# label node is passed as the last argument.
lasso = GenerateLasso()
lasso_node = FNode(core_docs, dummied_fnodes, lasso, init_lnode)

# Final SVR node: its inputs are the standardized numerical nodes plus the lasso
# node above, again with the initial label node as the last argument.
svr = GenerateSVR()
svr_node = FNode(core_docs, num_fnodes + [lasso_node], svr, init_lnode)

### Connect the Nodes to Output Features

In [None]:
from ml_forest.pipeline.links.knitor import Knitor

# Knit the final node; f_knit returns a pair whose first item is the output feature.
# Note that `svr` is rebound here: it previously held the GenerateSVR instance and
# now holds the second item returned by f_knit.
kn = Knitor()
svr_feature, svr = kn.f_knit(svr_node)

In [None]:
# Display the values held by the knitted output feature.
svr_feature.values

In [None]:
# Rebuild the lasso node with alpha=0.01; the one-hot nodes from the earlier cell are reused.
lasso = GenerateLasso(alpha=0.01)
lasso_node = FNode(core_docs, dummied_fnodes, lasso, init_lnode)

# Rebuild the final SVR node on top of the new lasso node.
# NOTE(review): if these keyword arguments reach sklearn's SVR unchanged, `degree`
# only affects the 'poly' kernel and is ignored by the default 'rbf' kernel — confirm
# whether degree=1 is intended to have an effect.
svr = GenerateSVR(degree=1, C=10)
svr_node = FNode(core_docs, num_fnodes+[lasso_node], svr, init_lnode)

# Knit again and display the resulting values; `svr` is rebound to the second
# item returned by f_knit.
kn = Knitor()
svr_feature, svr = kn.f_knit(svr_node)
svr_feature.values

In [None]:
# Same structure with C=100; `lasso_node` is whichever lasso node the most recently
# run cell left in the namespace, so this cell depends on execution order.
svr = GenerateSVR(degree=1, C=100)
svr_node = FNode(core_docs, num_fnodes+[lasso_node], svr, init_lnode)

# Knit and display the resulting values; `svr` is rebound to the second item
# returned by f_knit.
kn = Knitor()
svr_feature, svr = kn.f_knit(svr_node)
svr_feature.values

### Summarise Models

In [None]:
from ml_forest.core.utils.docs_init import root_database
from bson.objectid import ObjectId

# Handles on this project's "Feature" and "FTransform" collections; the helper
# functions below iterate over them to look documents up by `_id`.
f_collection = root_database[project]["Feature"]
ft_collection = root_database[project]["FTransform"]

def get_content(doc, core, ft_docs=None):
    """Return a short label describing where a feature document came from.

    If the document references a transform (a truthy ``essentials["f_transform"]``),
    the label is derived from the ``essentials["type"]`` of the matching transform
    document: the text after the last ``"."`` of its string form, keeping only
    alphabetic characters. Otherwise the label is the first key in
    ``core.init_features`` whose value equals the document's ``_id``.

    Args:
        doc: Feature document with ``"_id"`` and ``"essentials"`` entries.
        core: Object exposing an ``init_features`` mapping of name -> feature id.
        ft_docs: Optional iterable of transform documents to search. Defaults to
            the module-level ``ft_collection``.

    Returns:
        The transform label, the initial feature name, or ``None`` when the
        document has no transform and is not one of the initial features.

    Raises:
        ValueError: If the referenced transform document cannot be found.
    """
    essentials = doc["essentials"]
    ft_id = essentials["f_transform"] if "f_transform" in essentials else None

    if ft_id:
        # Fall back to the global collection only when it is actually needed.
        if ft_docs is None:
            ft_docs = ft_collection

        ft_doc = next((d for d in ft_docs if d["_id"] == ft_id), None)
        if ft_doc is None:
            # Previously this surfaced as a bare IndexError from `found[0]`.
            raise ValueError(
                "No FTransform document found for id {!r}".format(ft_id)
            )

        # e.g. "<class '__main__.GenerateLasso'>" -> "GenerateLasso'>" -> "GenerateLasso"
        raw_name = str(ft_doc["essentials"]["type"]).split(".")[-1]
        return ''.join(ch for ch in raw_name if ch.isalpha())

    # No transform: look the feature up among the initial features by id.
    f_id = doc["_id"]
    return next(
        (name for name, feature_id in core.init_features.items() if feature_id == f_id),
        None,
    )

def collection_single_f(f_id, core):
    """Summarize the single feature document whose ``_id`` equals ``f_id``.

    Args:
        f_id: Identifier of the feature document to look up in ``f_collection``.
        core: Passed through to ``get_content`` to resolve initial feature names.

    Returns:
        A dict with ``"_id"`` (the id passed in), ``"child"`` (a set built from the
        document's ``essentials["lst_fed"]``, empty when that entry is falsy) and
        ``"content"`` (the label returned by ``get_content``).

    Raises:
        ValueError: If no document, or more than one document, has this id.
    """
    found = [d for d in f_collection if d["_id"] == f_id]
    if not found:
        # Previously this case fell through to `found[0]` and raised IndexError.
        raise ValueError(
            "There is no document with the objectid you passed: {!r}".format(f_id)
        )
    if len(found) > 1:
        raise ValueError("There are more than one document with the objectid you passed")

    f_doc = found[0]

    content = get_content(f_doc, core)
    # A falsy `lst_fed` (None or empty) means the feature has no children to follow.
    child_f_id = set(f_doc["essentials"]["lst_fed"] or ())

    return {"_id": f_id, "child": child_f_id, "content": content}


def collect4fid(f_id, core):
    """Walk from a feature down through its children, one layer at a time.

    Starting from ``f_id``, each pass summarizes every id in the current layer with
    ``collection_single_f`` and gathers the union of their ``"child"`` sets as the
    next layer. The walk stops when a layer produces no children.

    Args:
        f_id: Identifier of the feature to start from.
        core: Passed through to ``collection_single_f``.

    Returns:
        A dict mapping layer index (0 for the starting feature) to the list of
        summaries produced for that layer.
    """
    layers = {}
    frontier = {f_id}

    while frontier:
        summaries = [collection_single_f(current_id, core) for current_id in frontier]

        # Collect every child referenced by this layer; duplicates collapse in the set.
        next_frontier = set()
        for summary in summaries:
            next_frontier = next_frontier.union(set(summary["child"]))

        # Layers are numbered consecutively from 0, so the next index is the current size.
        layers[len(layers)] = summaries
        frontier = next_frontier

    return layers

In [None]:
# Summarize, layer by layer, everything the most recently knitted SVR feature was
# built from, starting at that feature's own id, and display the result.
f_id =  svr_feature.obj_id
docs = collect4fid(f_id, core_docs)
docs