In [1]:
import os

In [2]:
# Work from the parent directory so that `flows` is importable and the relative
# data paths used below resolve. The guard makes the cell idempotent: a bare
# os.chdir('..') climbs one more level every time the cell is re-run.
# NOTE(review): assumes the target directory is the one containing the `flows`
# package imported below - confirm against the repository layout.
if not os.path.isdir('flows'):
    os.chdir('..')

<img src="flow_0.png">

In [3]:
import numpy as np
from flows.flows import Flows

[1m[35mWelcome to the Data Science Package. First create an object as follows:[m[m
[1m[35mFor example, use the code below to import the flow 0:[m[m
[32m[40mflow = Flows(0)[m
[1m[35mYou can define the `categorical_threshold` which is the maximum number of categories that a categorical feature should have before considering it as continuous numeric feature. The default value is 50[m[m
[1m[35mFor example, use the code below to import the flow 0 with defining the categorical_threshold as 50[m[m
[32m[40mflow = Flows(flow_id=0, categorical_threshold=50)[m


In [4]:
# Create the flow-0 helper object. Per the package banner printed on import,
# `categorical_threshold` is the maximum number of categories a categorical
# feature may have before it is treated as a continuous numeric feature;
# 10 is used here instead of the documented default of 50.
flow = Flows(flow_id=0, categorical_threshold=10)

[1m[35mPlease use the following function to read the data[m[m
[32m[40mdataframe_dict, columns_set = flow.load_data(path : str, files_list : list)[m
[1m[35mFor example: [m[32m[40mpath = './data'[m[m
[1m[35mIf your data is in a nested directory, it is better to os.path.join. For example: [m[32m[40mpath = os.path.join('data', 'flow_0')[m[m
[1m[35mFor example: [m[32m[40mfiles_list = ['train.csv','test.csv'][m[m
[1m[35mThe output is a dictionary that contains dataframes e.g.  [m[m
[34mdataframe_dict, columns_set = {'train': train_dataframe,'test': test_dataframe}[m
[1m[35mIf you want to explore the data you can run one of the following functions: [m[m
[1m[35m1 . [m[32m[40mflow.exploring_data(dataframe_dict: dict, key_i: str)[m[m
[1m[35mFor example: [m[32m[40mflow.exploring_data(dataframe_dict, 'train')[m[m
[1m[35m2 . [m[32m[40mflow.comparing_statistics(dataframe_dict: dict)[m[m
[1m[35mFor example: [m[32m[40mflow.comparing_stati

In [None]:
# CSV files to load and the directory (relative to the working directory)
# that holds them.
files_list = ["train.csv", "test.csv"]
path = os.path.join('data', 'flow_0')

In [None]:
# Read the files named in `files_list` from `path`. Per the package banner, the
# result is a dictionary of dataframes (e.g. keys 'train' and 'test') together
# with a `columns_set` object; later cells index `columns_set` by dataset name
# and then by column kind (e.g. "continuous").
dataframe_dict, columns_set = flow.load_data(path, files_list)

In [None]:
 columns_set["train"].keys()

In [None]:
# Encode the categorical features of every dataframe in `dataframe_dict`.
# NOTE: both names are rebound to the call's results, so re-running this cell
# passes the already-processed frames back in - restart from the load cell if
# a clean state is needed.
dataframe_dict, columns_set = flow.encode_categorical_feature(dataframe_dict)

In [None]:
# Columns handed to the scaling step as "ignore" columns: `Id` and the
# `SalePrice` column that is later used as the training target.
ignore_columns = ["Id", "SalePrice"]

In [None]:
# Scale the data, passing `ignore_columns` along - presumably those columns are
# left unscaled (TODO confirm against flows.scale_data).
# NOTE: both names are rebound to the results, so re-running this cell passes
# the already-processed frames back in.
dataframe_dict, columns_set = flow.scale_data(dataframe_dict, ignore_columns)

In [None]:
# Explore the (now encoded and scaled) training dataframe; the second argument
# is the key of the dataframe inside `dataframe_dict`.
flow.exploring_data(dataframe_dict, "train")

In [None]:
# Compare statistics across the dataframes held in `dataframe_dict`
# (here the train and test sets loaded above).
flow.comparing_statistics(dataframe_dict)

In [None]:
# Columns that must not be fed to the models: `Id` and the target `SalePrice`.
ignore_columns = ["Id", "SalePrice"]

# Build the feature list once, from the *train* column set, and reuse it for both
# frames so train and test share the same columns in the same order.
# (The original repeated this comprehension twice and also bound an unused
# `columns` variable, which has been dropped.)
feature_columns = [
    column
    for column in columns_set["train"]["continuous"]
    if column not in ignore_columns
]
train_dataframe = dataframe_dict["train"][feature_columns]
test_dataframe = dataframe_dict["test"][feature_columns]

# Regression target taken from the training frame.
train_target = dataframe_dict["train"]["SalePrice"]

In [None]:
# Training configuration for the Ridge linear-regression run.
#   data    - training features plus the target as a NumPy array
#   split   - 5-fold cross-validation; the original inline notes list
#             "split_ratios": 0.3 or "split_ratios": (0.3, 0.2) as alternatives
#   model   - "alpha" is set to the string "optimize" (presumably the package
#             tunes it - confirm in flows)
#   metrics - scores to report
#   predict - frames to produce predictions for after training
parameters = dict(
    data={
        "train": {"features": train_dataframe, "target": train_target.to_numpy()},
    },
    split={"method": "kfold", "fold_nr": 5},
    model={
        "type": "Ridge linear regression",
        "hyperparameters": {"alpha": "optimize"},
    },
    metrics=["r2_score"],
    predict={"test": {"features": test_dataframe}},
)

In [None]:
# Run training with the Ridge configuration. The call returns three values,
# bound here as `model_index_list`, `save_models_dir` and `y_test`; their exact
# contents are defined by flows.training (not shown in this notebook).
model_index_list, save_models_dir, y_test = flow.training(parameters)

In [None]:
# Training configuration for the LightGBM run: 5-fold cross-validation,
# mean squared error as the reported metric, predictions on the test frame.
# (The variable name keeps its original spelling because the training cell
# below refers to it.)
parameters_lighgbm = dict(
    data={
        "train": {"features": train_dataframe, "target": train_target.to_numpy()},
    },
    split={"method": "kfold", "fold_nr": 5},
    model={
        "type": "lightgbm",
        # Passed through as LightGBM parameters.
        "hyperparameters": {
            "objective": 'regression',
            "metric": 'root_mean_squared_error',
            "num_leaves": 5,
            "boost_from_average": True,
            "learning_rate": 0.05,
            "bagging_fraction": 0.99,
            "feature_fraction": 0.99,
            "max_depth": -1,
            "num_rounds": 10000,
            "min_data_in_leaf": 10,
            "boosting": 'dart',
        },
    },
    metrics=["mean_squared_error"],
    predict={"test": {"features": test_dataframe}},
)

In [None]:
# Run training with the LightGBM configuration. This rebinds
# `model_index_list`, `save_models_dir` and `y_test`, replacing the values
# produced by the previous training cell.
model_index_list, save_models_dir, y_test = flow.training(parameters_lighgbm)

In [None]:
# Training configuration for the XGBoost run: 5-fold cross-validation,
# both R^2 and mean squared error reported, predictions on the test frame.
parameters_xgboost = dict(
    data={
        "train": {"features": train_dataframe, "target": train_target.to_numpy()},
    },
    split={"method": "kfold", "fold_nr": 5},
    model={
        "type": "xgboost",
        "hyperparameters": dict(max_depth=5, eta=1, eval_metric="rmse", num_round=100),
    },
    metrics=["r2_score", "mean_squared_error"],
    predict={"test": {"features": test_dataframe}},
)

In [None]:
# Run training with the XGBoost configuration. The third return value is bound
# to `y_test`, the name every other training cell in this notebook uses
# (this cell previously bound it to `ytest`).
model_index_list, save_models_dir, y_test = flow.training(parameters_xgboost)

In [None]:
# Training configuration for a scikit-learn RandomForestRegressor tuned by grid search.
#   split           - single hold-out split with ratio 0.2, no stratification
#   params_grid     - values explored by the grid search
#   params_fixed    - estimator arguments held constant
#   params_cv       - cross-validation settings used during the search
#   grid_search_scoring - scorers evaluated by the search
parameters_sklearn = {
    "data": {
        "train": {"features": train_dataframe, "target": train_target.to_numpy()},
    },
    "split": {
        "method": "split",
        "split_ratios": 0.2,
        "stratify": False  # set to True only for classification tasks
    },
    "model": {
        "type": "sklearn.ensemble.RandomForestRegressor",
        "hyperparameters": {
            'params_grid': {
                # NOTE(review): "mse"/"mae" were renamed to "squared_error"/
                # "absolute_error" in newer scikit-learn releases - confirm
                # against the installed version.
                'criterion': ["mse", "mae"],
                'min_samples_leaf': [4, 1],
                # 'max_depth' used to be listed twice in this dict literal; Python
                # keeps only the last value for a repeated key, so the earlier
                # [5, 10, 15, 999] was silently ignored. Only the effective
                # grid is kept.
                'max_depth': [4, 8, 12],
            },
            'params_fixed': {
                'min_samples_split': 10,
                'random_state': 11
            },
            'params_cv': {
                'n_splits': 5,
                'shuffle': True,
                'random_state': 11
            },
            'objective': 'regression',  # 'classification'
            "grid_search_scoring": ['r2', 'neg_mean_squared_error']
        },
    },
    "metrics": ["r2_score", "mean_squared_error"],
    "predict": {
        "test": {"features": test_dataframe}
    }
}

# Fit and evaluate the random forest with the configuration above.
model_index_list, save_models_dir, y_test = flow.training(parameters_sklearn)

In [None]:
# Training configuration for a scikit-learn ElasticNet tuned by grid search.
#   split           - single hold-out split with ratio 0.2, no stratification
#   params_grid     - values explored by the grid search
#   params_fixed    - estimator arguments held constant
#   params_cv       - cross-validation settings used during the search
#   grid_search_scoring - scorers evaluated by the search
parameters_sklearn = {
    "data": {
        "train": {"features": train_dataframe, "target": train_target.to_numpy()},
    },
    "split": {
        "method": "split",
        "split_ratios": 0.2,
        "stratify": False  # set to True only for classification tasks
    },
    "model": {
        "type": "sklearn.linear_model.ElasticNet",
        "hyperparameters": {
            'params_grid': {
                # Seven regularisation strengths from 1e-3 to 1e3.
                'alpha': np.logspace(-3, 3, 7),
                # The previous expression, np.linspace(0, 0., num=4) + 0.01, produced
                # four copies of 0.01, so the search refitted the same l1_ratio four
                # times. Span the valid range instead, starting at 0.01 as before.
                # NOTE(review): the intended upper bound is a guess - adjust if a
                # narrower range was meant.
                'l1_ratio': np.linspace(0.01, 1.0, num=4)
            },
            'params_fixed': {
                # NOTE(review): `normalize` was deprecated and later removed from
                # scikit-learn linear models - confirm against the installed version.
                'normalize': True,
                'max_iter': 2000,
                'random_state': 11
            },
            'params_cv': {
                'n_splits': 5,
                'shuffle': True,
                'random_state': 11
            },
            'objective': 'regression',  # 'classification'
            "grid_search_scoring": ['r2', 'neg_mean_squared_error']
        },
    },
    "metrics": ["r2_score", "mean_squared_error"],
    "predict": {
        "test": {"features": test_dataframe}
    }
}

# Fit and evaluate the elastic net with the configuration above.
model_index_list, save_models_dir, y_test = flow.training(parameters_sklearn)