In [269]:

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import json
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import (
    LinearRegression, LogisticRegression,
    Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import GridSearchCV
import train_utils
from train_utils import *

In [270]:
data_path = ""
data = ""
with open('jsonFile.json', 'r') as json_file:
    loadData = json.load(json_file)
    data = loadData['design_state_data']
    data_path = data['session_info'].get('dataset')
    target = data['target']['target']
    task_type = data['target']['type']
    
print(data_path)
print(f'target : {target}')
print(f'task type : {task_type}')

iris_modified.csv
target : petal_width
task type : regression


In [271]:
#read the data
df = pd.read_csv(f'data/{data_path}')
print(f'shape = {df.shape}')
df.head()

shape = (150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [272]:
for key in data.keys():
    print(key)

session_info
target
train
metrics
feature_handling
feature_generation
feature_reduction
hyperparameters
weighting_stratergy
probability_calibration
algorithms


### FEATURE HANDLING
- handling missing values

In [273]:
def feature_handling(df, config):
    df_copy = df.copy() #make a copy of df
    processed = {}

    for col, val in config.items():
        if not val["is_selected"]:
            continue
        details = val["feature_details"]
        col_type = val["feature_variable_type"]

        # Imputation for numerical feature
        if col_type == "numerical":
            computation_type = details.get("impute_with")
            if computation_type == "Average of values":
                df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
            elif computation_type == "custom":
                df_copy[col] = df_copy[col].fillna(details.get("impute_value"))

            # Rescaling if that exist 
            rescale = details.get("rescaling", "No rescaling")
            if rescale == "StandardScaler":
                df_copy[col] = StandardScaler().fit_transform(df_copy[[col]])
            elif rescale == "MinMaxScaler":
                df_copy[col] = MinMaxScaler().fit_transform(df_copy[[col]])

            processed[col] = df_copy[col]

        # Encoding for text
        elif col_type == "text":
            encoded = LabelEncoder().fit_transform(df_copy[col].astype(str))
            processed[col] = encoded

    return pd.DataFrame(processed)


In [274]:
x = feature_handling(df, data['feature_handling'])
print(x.shape)
x

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### Feature Reduction

In [275]:

def feature_reduction(X, y, cfg, task="regression"):
    if cfg.get("feature_reduction_method") == "Tree-based":
        model = RandomForestRegressor if task == "regression" else RandomForestClassifier
        rf = model(
            n_estimators=int(cfg["num_of_trees"]),
            max_depth=int(cfg["depth_of_trees"]),
            random_state=42
        )
        rf.fit(X, y)
        idx = rf.feature_importances_.argsort()[-int(cfg["num_of_features_to_keep"]):]
        return X.iloc[:, idx]

    return X


In [276]:
print(x.isnull().sum())

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64


In [277]:
target_col = data["target"]["target"]
y = df[target_col]
print(x.shape, y.shape)

(150, 5) (150,)


## DEFINE THE PIPELINE

In [281]:
algo = data['algorithms']
def Pipeline(df, data, model_registry, get_param_grid):
    #step1: feature handling
    feature_handling(df, data['feature_handling'])
    
    #step2: feature reduction
    feature_reduction(x, y, data['feature_reduction'], task_type)
    
    #step3: data splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size= 0.2,
                                                    random_state= data['train']['random_seed'])
    
    #Step4: Hyoerparameter tuning
    train_selected_model(x_train, x_test, y_train, y_test, algo, model_registry, get_param_grid, data['hyperparameters'])


In [282]:
Pipeline(df, data, model_registry, get_param_grid)

Here, we are using RandomForestRegressor model.
Fitting 6 folds for each of 8 candidates, totalling 48 fits
 Best Params for RandomForestRegressor: {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 20}
 Test Score: 0.9967578922208405

