In [36]:
import pandas as pd
import numpy as np
import json
import re
# %pip install striprtf
from striprtf.striprtf import rtf_to_text

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet, SGDClassifier, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, TimeSeriesSplit
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Parse JSON file from RTF
with open("algoparams_from_ui.json.rtf", "r") as file:
    rtf_content = file.read()

text = rtf_to_text(rtf_content)
json_string = re.search(r'\{.*\}', text, re.DOTALL)[0]
data = json.loads(json_string)


In [11]:
import os
# Change to the Downloads folder
os.chdir(os.path.expanduser("~\\Downloads"))

### Reading the target and type of regression 

In [28]:
# Loading csv file (iris dataset)
csv_file = data["design_state_data"]["session_info"]["dataset"]
df = pd.read_csv(csv_file)

target_info = data["design_state_data"]["target"]
target_variable = target_info.get("target")
prediction_type = target_info.get("prediction_type").lower()

### Reading the features (which are column names in the csv) and figure out what missing imputation applied to the columns loaded in a dataframe
### Created a custom class called Custom_Feature_Handling

In [30]:
# feature handling code
class Custom_Feature_Handling(BaseEstimator, TransformerMixin):
    def __init__(self, feat_handling):
        self.feat_handling = feat_handling

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        feat_handling = self.feat_handling

        for feat_name, feat_info in feat_handling.items():
            if not feat_info.get("is_selected", False):
                continue

            feat_type = feat_info.get("feature_variable_type")

            # Imputing for numerical variable according to extracted impute_strategy and impute_value
            
            if feat_type == "numerical":
                feat_details = feat_info["feature_details"]
                impute_strategy = feat_details.get("impute_with")
                impute_value = feat_details.get("impute_value")

                if impute_strategy == "Average of values":
                    mean_val = df[feat_name].mean()
                    df[feat_name].fillna(mean_val, inplace=True)

                elif impute_strategy == "custom":
                    df[feat_name].fillna(impute_value, inplace=True)

            # Imputing for text variable according to extracted impute_strategy and impute_value
            elif feat_type == "text":
                feat_details = feat_info.get("feature_details", {})
                text_handling = feat_details.get("text_handling", "")
                hash_columns = feat_details.get("hash_columns", 4) or 4  # Default to 4 if 0 or None

                # Fill missing_text in place of missing 
                df[feat_name].fillna("missing_text", inplace=True)

                # Converting text data into fixed-length numeric features using hashing
                if text_handling == "Tokenize and hash":

                    vectorizer = HashingVectorizer(n_features=hash_columns, alternate_sign=False)
                    hashed = vectorizer.fit_transform(df[feat_name]).toarray()

                    # Creating new column names for hashed output
                    hashed_df = pd.DataFrame(hashed, columns=[f"{feat_name}_hash_{i}" for i in range(hash_columns)])
                    hashed_df.index = df.index  # Align index

                    # Droping original text column and adding hashed columns
                    df.drop(columns=[feat_name], inplace=True)
                    df = pd.concat([df, hashed_df], axis=1)

        return df

### Computing feature reduction based on input where there can be No Reduction, Corr with Target, Tree-based, PCA.
### Created a custom class called Custom_Feature_Reduction  

In [32]:
# feature reduction code
class Custom_Feature_Reduction(BaseEstimator, TransformerMixin):
    def __init__(self, feat_reduction, prediction_type):
        self.feat_reduction = feat_reduction
        self.prediction_type = prediction_type.lower()
        self.columns_ = None
        self.reduction_model_ = None
        self.selected_columns_ = None

    def fit(self, X, y=None):
        method = self.feat_reduction["feature_reduction_method"]
        self.columns_ = X.columns

        if method == "Correlation with target" and y is not None:
            k = self.feat_reduction.get("Correlation with target", {}).get("num_of_features_to_keep", 4)
            self.reduction_model_ = SelectKBest(score_func=f_regression, k=k)
            self.reduction_model_.fit(X, y)

        elif method == "Tree-based" and y is not None:
            k = int(self.feat_reduction.get("num_of_features_to_keep", 4))
            depth = int(self.feat_reduction.get("depth_of_trees", 5))
            trees = int(self.feat_reduction.get("num_of_trees", 10))
            rf = RandomForestRegressor(n_estimators=trees, max_depth=depth, random_state=42)
            rf.fit(X, y)
            importances = rf.feature_importances_
            top_k_idx = importances.argsort()[-k:][::-1]
            self.selected_columns_ = X.columns[top_k_idx]

        elif method == "Principal Component Analysis":
            k = self.feat_reduction.get("Principal Component Analysis", {}).get("num_of_features_to_keep", 4)
            self.reduction_model_ = PCA(n_components=int(k))
            self.reduction_model_.fit(X)

        elif method == "No Reduction":
            k = self.feat_reduction.get("No Reduction", {}).get("num_of_features_to_keep", X.shape[1])
            self.selected_columns_ = X.columns[:k]

        return self

    def transform(self, X):
        method = self.feat_reduction["feature_reduction_method"]

        if method == "Correlation with target":
            return pd.DataFrame(self.reduction_model_.transform(X), index=X.index)

        elif method == "Tree-based":
            return X[self.selected_columns_]

        elif method == "Principal Component Analysis":
            pca_data = self.reduction_model_.transform(X)
            return pd.DataFrame(pca_data, index=X.index, columns=[f"PCA_{i+1}" for i in range(pca_data.shape[1])])

        elif method == "No Reduction":
            return X[self.selected_columns_]

        else:
            return X

### MODEL_tune(): Mapping of models to hyperparameters

In [38]:
# tuning hyperparameters of each models
MODEL_tune = {
    "RandomForestRegressor": (RandomForestRegressor, {
        "n_estimators": "range(min_trees,max_trees)",
        "max_depth": "range(min_depth,max_depth)",
        "min_samples_leaf": "range(min_samples_per_leaf_min_value,min_samples_per_leaf_max_value)"
    }),
    "RandomForestClassifier": (RandomForestClassifier, {
        "n_estimators": "range(min_trees,max_trees)",
        "max_depth": "range(min_depth,max_depth)",
        "min_samples_leaf": "range(min_samples_per_leaf_min_value,min_samples_per_leaf_max_value)"
    }),
    "GBTRegressor": (GradientBoostingRegressor, {
        "n_estimators": "range(min_iter,max_iter)",
        "max_depth": "range(min_depth,max_depth)",
        "learning_rate": "range_float(min_stepsize,max_stepsize,0.1)"
    }),
    "GBTClassifier": (GradientBoostingClassifier, {
        "n_estimators": "range(min_iter,max_iter)",
        "max_depth": "range(min_depth,max_depth)",
        "learning_rate": "range_float(min_stepsize,max_stepsize,0.1)"
    }),
    "LinearRegression": (LinearRegression, {}),
    "LogisticRegression": (LogisticRegression, {
        "max_iter": "range(min_iter,max_iter)"
    }),
    "RidgeRegression": (Ridge, {
        "alpha": "range_float(min_regparam,max_regparam,0.1)"
    }),
    "LassoRegression": (Lasso, {
        "alpha": "range_float(min_regparam,max_regparam,0.1)"
    }),
    "ElasticNetRegression": (ElasticNet, {
        "alpha": "range_float(min_regparam,max_regparam,0.1)",
        "l1_ratio": "range_float(min_elasticnet,max_elasticnet,0.1)"
    }),
    "DecisionTreeRegressor": (DecisionTreeRegressor, {
        "max_depth": "range(min_depth,max_depth)",
        "min_samples_leaf": "from_list(min_samples_per_leaf)"
    }),
    "DecisionTreeClassifier": (DecisionTreeClassifier, {
        "max_depth": "range(min_depth,max_depth)",
        "min_samples_leaf": "from_list(min_samples_per_leaf)"
    }),
    "xg_boost": (XGBRegressor, {
        "max_depth": "from_list(max_depth_of_tree)",
        "learning_rate": "from_list(learningRate)",
        "reg_alpha": "from_list(l1_regularization)",
        "reg_lambda": "from_list(l2_regularization)",
        "gamma": "from_list(gamma)",
        "min_child_weight": "from_list(min_child_weight)",
        "subsample": "from_list(sub_sample)",
        "colsample_bytree": "from_list(col_sample_by_tree)"
    }),
    
    "SGD": (SGDClassifier, {
        "alpha": "from_list(alpha_value)",
        "max_iter": "range(max_iterations, max_iterations+100, 100)", 
        "tol": "from_list(tolerance)"
    }),

    "extra_random_trees": (ExtraTreesRegressor, {
        "n_estimators": "from_list(num_of_trees)",
        "max_depth": "from_list(max_depth)",
        "min_samples_leaf": "from_list(min_samples_per_leaf)"
    }),
    "SVM": (SVC, {
        "C": "from_list(c_value)"
    }),
    "KNN": (KNeighborsRegressor, {
        "n_neighbors": "from_list(k_value)"
    }),
    "neural_network": (MLPRegressor, {
        "hidden_layer_sizes": "from_list(hidden_layer_sizes)",
        "learning_rate_init": "range_float(initial_learning_rate, initial_learning_rate+1, 0.1)",
        "max_iter": "range(max_iterations,max_iterations+100,100)"
    }),
}

### build_pmt_grid(): Creates the actual parameter grid

In [40]:
def build_pmt_grid(pmt_definitions, model_value):
    
    pmt_grid = {}
    if model_value is None:
        print("model_value is None. Cannot build parameter grid.")
        return pmt_grid

    for pmt, definition in pmt_definitions.items():
        try:
            
            if definition.startswith("range("):
                # Handle integer ranges like "range(min_trees,max_trees)"
                match = re.match(r"range\((\w+),(\w+)\)", definition)
                if match:
                    min_val_key, max_val_key = match.groups()
                    min_val = model_value.get(min_val_key)
                    max_val = model_value.get(max_val_key)
                    if min_val is not None and max_val is not None:
                        pmt_grid[pmt] = list(range(int(min_val), int(max_val) + 1))
                    else:
                        print(f"Could not find min/max values for range for parameter {pmt}.")
                        param_grid[param] = [model_value.get(param)] # Fallback
                else:
                     print(f"Unexpected range format for parameter {pmt}.")
                     pmt_grid[pmt] = [model_value.get(pmt)] 

            elif definition.startswith("range_float("):
                # Handle float ranges like "range_float(min_stepsize,max_stepsize,0.1)"
                match = re.match(r"range_float\((\w+),(\w+),(\d*\.?\d+)\)", definition)
                if match:
                    min_val_key, max_val_key, step_str = match.groups()
                    min_val = model_value.get(min_val_key)
                    max_val = model_value.get(max_val_key)
                    step = float(step_str)
                    if min_val is not None and max_val is not None:
                         pmt_grid[pmt] = np.arange(min_val, max_val + step, step).tolist()
                    else:
                        print(f"Could not find min/max values for float range for parameter {pmt}.")
                        param_grid[param] = [model_value.get(param)] # Fallback
                else:
                     print(f"Warning: Unexpected float range format for parameter {param}.")
                     pmt_grid[pmt] = [model_value.get(pmt)] 

            elif definition.startswith("from_list("):
                # Handle parameters from a list like "from_list(min_samples_per_leaf)"
                match = re.match(r"from_list\((\w+)\)", definition)
                if match:
                    list_key = match.group(1)
                    pmt_list = model_value.get(list_key)
                    if pmt_list is not None:
                         pmt_grid[pmt] = pmt_list
                    else:
                        print(f"Could not find list for parameter {param}.")
                        pmt_grid[pmt] = [model_value.get(pmt)] 
                else:
                    print(f"Unexpected list format for parameter {param}.")
                    pmt_grid[pmt] = [model_value.get(pmt)] 

            else:
                # Handle direct values or other type of format of range
                pmt_grid[pmt] = [model_value.get(pmt)]

        except Exception as e:
            print(f"Could not process parameter definition for {pmt}: {e}")
            # error handling if processing fails
            pmt_grid[pmt] = [model_value.get(pmt)] # Using the default value 

    return pmt_grid

### Custom ModelSelectionWithGridSearch class automates model building, hyperparameter tuning, and selection using GridSearchCV

In [42]:
# grid search cv
class ModelSelectionWithGridSearch(BaseEstimator):
    def __init__(self, data, prediction_type):
        self.data = data
        self.prediction_type = prediction_type.lower()
        self.model_objects = []
        self.selected_models_ = []

    def fit(self, X, y):
        data = self.data
        pred_type = self.prediction_type
        algo = data['design_state_data']['algorithms']

        self.model_objects = []

        for algo_key, algo_val in algo.items():
            if not algo_val.get('is_selected', False):
                continue  # Skip unselected

            model_name = algo_val['model_name'].lower()


            # REGRESSION MODELS

            if pred_type == "regression":
                if algo_key == "RandomForestRegressor":
                    model = RandomForestRegressor(
                            n_estimators=algo_val.get("max_trees", 100),
                            max_depth=algo_val.get("max_depth", None),
                            min_samples_leaf=algo_val.get("min_samples_per_leaf_min_value", 1),
                            random_state=42
                        )
                elif algo_key == "GBTRegressor":
                    model = GradientBoostingRegressor(
                            n_estimators=algo_val.get("max_iter", 100),
                            max_depth=algo_val.get("max_depth", 3),
                            learning_rate=algo_val.get("min_stepsize", 0.1),
                            random_state=42
                        )
                elif algo_key == "LinearRegression":
                    model = LinearRegression()
                elif algo_key == "RidgeRegression":
                    model = Ridge(alpha=algo_val.get("min_regparam", 1.0))
                elif algo_key == "LassoRegression":
                    model = Lasso(alpha=algo_val.get("min_regparam", 1.0))
                elif algo_key == "ElasticNetRegression":
                    model = ElasticNet(
                        alpha=algo_val.get("min_regparam", 1.0),
                        l1_ratio=algo_val.get("min_elasticnet", 0.5)
                    )
                elif algo_key == "DecisionTreeRegressor":
                    min_samples_leaf_val = algo_val.get("min_samples_per_leaf", [1])
                    if isinstance(min_samples_leaf_val, list) and len(min_samples_leaf_val) > 0:
                        min_samples_leaf_val = min(min_samples_leaf_val)
                    else:
                        min_samples_leaf_val = 1  # Default value

                    model = DecisionTreeRegressor(
                            max_depth=algo_val.get("max_depth", None),
                            min_samples_leaf=min_samples_leaf_val,
                            random_state=42
                        )
                elif algo_key == "extra_random_trees":
                    model = ExtraTreesRegressor(
                            n_estimators=max(algo_val.get("num_of_trees", [100])),
                            max_depth=max(algo_val.get("max_depth", [None])),
                            min_samples_leaf=min(algo_val.get("min_samples_per_leaf", [1])),
                            random_state=42
                        )
                elif algo_key == "xg_boost":
                    model = XGBRegressor(
                            max_depth=max(algo_val.get("max_depth_of_tree", [3])),
                            learning_rate=min(algo_val.get("learningRate", [0.1])),
                            n_estimators=algo_val.get("max_num_of_trees", 100),
                            verbosity=0,
                            random_state=42
                        )
                else:
                    print(f"Skipping unknown regressor: {algo_key}")
                    continue

                self.model_objects.append((algo_key, model))


            # CLASSIFICATION MODELS

            elif pred_type == "classification":
                if algo_key == "RandomForestClassifier":
                    model = RandomForestClassifier(
                            n_estimators=algo_val.get("max_trees", 100),
                            max_depth=algo_val.get("max_depth", None),
                            min_samples_leaf=algo_val.get("min_samples_per_leaf_min_value", 1),
                            random_state=42
                        )
                elif algo_key == "GBTClassifier":
                    model = GradientBoostingClassifier(
                            n_estimators=algo_val.get("max_iter", 100),
                            max_depth=algo_val.get("max_depth", 3),
                            learning_rate=algo_val.get("min_stepsize", 0.1),
                            random_state=42
                        )
                elif algo_key == "LogisticRegression":
                    model = LogisticRegression(
                            penalty="elasticnet",
                            l1_ratio=algo_val.get("min_elasticnet", 0.5),
                            C=1.0 / max(algo_val.get("min_regparam", 1.0), 1e-4),
                            solver="saga",
                            max_iter=algo_val.get("max_iter", 100)
                        )
                elif algo_key == "DecisionTreeClassifier":
                    min_samples_leaf_val = algo_val.get("min_samples_per_leaf", [1])
                    if isinstance(min_samples_leaf_val, list) and len(min_samples_leaf_val) > 0:
                        min_samples_leaf_val = min(min_samples_leaf_val)
                    else:
                        min_samples_leaf_val = 1  # Default value

                    model = DecisionTreeClassifier(
                            max_depth=algo_val.get("max_depth", None),
                            min_samples_leaf=min_samples_leaf_val,
                            random_state=42
                        )
                elif algo_key == "extra_random_trees":
                    model = ExtraTreesClassifier(
                            n_estimators=max(algo_val.get("num_of_trees", [100])),
                            max_depth=max(algo_val.get("max_depth", [None])),
                            min_samples_leaf=min(algo_val.get("min_samples_per_leaf", [1])),
                            random_state=42
                        )
                elif algo_key == "xg_boost":
                    model = XGBClassifier(
                            max_depth=max(algo_val.get("max_depth_of_tree", [3])),
                            learning_rate=min(algo_val.get("learningRate", [0.1])),
                            n_estimators=algo_val.get("max_num_of_trees", 100),
                            use_label_encoder=False,
                            eval_metric='logloss',
                            verbosity=0,
                            random_state=42
                        )
                elif algo_key == "SVM":
                    model = SVC(
                            kernel='rbf',
                            C=max(algo_val.get("c_value", [1.0])),
                            gamma='scale',
                            probability=True
                        )
                elif algo_key == "KNN":
                    model = KNeighborsClassifier(
                            n_neighbors=max(algo_val.get("k_value", [5]))
                        )
                elif algo_key == "neural_network":
                    model = MLPClassifier(
                            hidden_layer_sizes=tuple(algo_val.get("hidden_layer_sizes", [100])),
                            max_iter=algo_val.get("max_iterations", 200),
                            solver=algo_val.get("solver", "adam"),
                            early_stopping=algo_val.get("early_stopping", True),
                            random_state=42
                        )
                elif algo_key == "SGD":
                    model = SGDClassifier(
                            penalty='elasticnet',
                            alpha=min(algo_val.get("alpha_value", [0.0001])),
                            max_iter=algo_val.get("max_iterations", 1000),
                            tol=algo_val.get("tolerance", 1e-3),
                            random_state=42
                        )
                else:
                    print(f" Skipping unknown classifier: {algo_key}")
                    continue

                self.model_objects.append((algo_key, model))

        # performing GridSearchCV
        self.selected_models_ = self.run_all_grid_search(X, y, data, pred_type)

        return self

    def transform(self, X):
        return X

    def run_all_grid_search(self, X, y, data, prediction_type):
        selected_models = []
        algorithms = data["design_state_data"]["algorithms"]

        for model_key, model_value in algorithms.items():
            if not model_value.get("is_selected", False):
                continue
            if model_key not in MODEL_tune:
                print(f"Model '{model_key}' not supported in mapping.")
                continue

            model_cls, pmt_definitions = MODEL_tune[model_key]
            pmt_grid = build_pmt_grid(pmt_definitions, model_value)

            print(f"\nRunning GridSearchCV for: {model_key}")
            print(f"Param Grid: {pmt_grid}")

            # Using KFold for cross-validation 
            cv = KFold(n_splits=5, shuffle=True, random_state=1)

            grid = GridSearchCV(estimator=model_cls(), param_grid=pmt_grid, cv=cv, n_jobs=-1)
            grid.fit(X, y)

            print(f"Best model for {model_key}: {grid.best_estimator_}")
            print(f"Best score: {grid.best_score_}\n")

            selected_models.append((model_key, grid.best_estimator_))

        return selected_models

### Pipeline

In [45]:
# Keep the target variable in X for feature handling
X = df.copy()
y = df[target_variable]

In [49]:
pipeline = Pipeline([('feature_handling', Custom_Feature_Handling(
                feat_handling=data['design_state_data']['feature_handling']
            )),
            ('feature_reduction', Custom_Feature_Reduction(
                feat_reduction=data['design_state_data']['feature_reduction'],
                prediction_type=prediction_type
            )),
            ('model_selector', ModelSelectionWithGridSearch(
                data=data,
                prediction_type=prediction_type
            ))
        ])

In [51]:
pipeline.fit(X, y)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feat_name].fillna(mean_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[feat_name].fillna(impute_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values


Running GridSearchCV for: RandomForestRegressor
Param Grid: {'n_estimators': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'max_depth': [20, 21, 22, 23, 24, 25], 'min_samples_leaf': [5, 6, 7, 8, 9, 10]}
Best model for RandomForestRegressor: RandomForestRegressor(max_depth=22, min_samples_leaf=5, n_estimators=12)
Best score: 0.9958558366100465

