In [595]:
# 1 Session_info
# 2 target
# 3 train
# 4 metrics
# 5 feature_handling
# 6 feature_generation
# 7 feature_reduction
# 8 hyperparameters
# 9 weighting_stratergy
# 10 probability_calibration
# 11 algorithms

In [596]:
import json
from striprtf.striprtf import rtf_to_text 

# Path of the UI-exported configuration; the JSON payload is wrapped in an RTF document.
json_file_path = "algoparams_from_ui.json.rtf"

with open(json_file_path, 'r') as file:
    # Strip the RTF markup so that only the plain JSON text remains.
    a = rtf_to_text(file.read())
    # Lower-case the ENTIRE document before parsing: every key and every string
    # value in `parsed` (file paths included) is therefore lowercase, and all
    # later look-ups must use lowercase keys.
    a = a.lower()
    parsed = json.loads(a)

In [597]:
### 1 Session_info
import pandas as pd

# Load the dataset named in the config; fall back to the local iris sample when
# the config entry is missing or the file cannot be read.
try:
    df = pd.read_csv(parsed["design_state_data"]["session_info"]["dataset"])
except Exception as exc:  # was a bare `except:`, which also swallowed KeyboardInterrupt
    # Make the fallback visible instead of silently switching data sources.
    print(f"Could not load the configured dataset ({exc!r}); falling back to iris.csv")
    df = pd.read_csv("iris.csv")

# Normalise headers (trim whitespace, lower-case) in a single pass so they
# match the lower-cased names stored in the parsed config.
df.columns = [name.strip().lower() for name in df.columns]
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [598]:
### 2 target
# Target definition: which column is predicted and what kind of problem it is.
_target_cfg = parsed["design_state_data"]["target"]
target_column = _target_cfg.get("target")
target_prediction_type = _target_cfg.get("prediction_type")

# Every column other than the target is treated as a predictor.
# independent_variables = parsed["design_state_data"]["train"].get("time_variable").split(",")
independent_variables = [name for name in df.columns if name != target_column]

In [599]:
### 3 train
from sklearn.model_selection import KFold
def kfold_dataset_creator(df, n_splits, randomly, random_state):
    """Materialise the K-fold partitions of ``df`` as a list.

    Parameters
    ----------
    df : object exposing ``.index``; only the index is handed to the splitter.
    n_splits : number of folds.
    randomly : when truthy the rows are shuffled (seeded with ``random_state``);
        otherwise the folds are consecutive blocks and ``random_state`` is ignored.
    random_state : seed used only for the shuffled variant.

    Returns
    -------
    list
        One entry per fold, exactly as yielded by ``KFold.split``.
    """
    if randomly:
        splitter = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    else:
        splitter = KFold(n_splits=n_splits)
    return list(splitter.split(df.index))

In [600]:
### 3 train
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

def train_test_sets(df, parsed_dict):
    """Split ``df`` according to the ``train`` section of the parsed config.

    Relies on the notebook globals ``independent_variables`` and
    ``target_column`` to pick the predictor and target columns.

    Returns
    -------
    tuple or None
        * ``(1, X_train, X_test, y_train, y_test)`` - single hold-out split.
        * ``(2, X_train, y_train)`` - policy is not "split the dataset"; the
          whole frame is used for training.
        * ``(3, ksets)`` - k-fold; ``ksets`` comes from ``kfold_dataset_creator``.
        * ``None`` - the sampling method is not supported yet (a message is printed).
    """
    train_cfg = parsed_dict["design_state_data"]["train"]

    if train_cfg.get("policy") != "split the dataset":
        print("Training on the whole datatset")
        return 2, df[independent_variables], df[target_column]

    if train_cfg.get("sampling_method") != "no sampling(whole data)":
        print("Create options for data sampling")
        return None

    # The ratio is a fraction, so it must be parsed as a float: int() truncated
    # every valid value (e.g. 0.8 -> 0) and the configured ratio was never used.
    try:
        train_ratio = float(train_cfg.get("train_ratio"))
    except (TypeError, ValueError):
        train_ratio = None
    if train_ratio is None or not 0 < train_ratio < 1:
        print("Changing the train_ratio to 0.8")
        train_ratio = 0.8

    random_seed = int(train_cfg.get("random_seed"))

    # A missing k_fold entry is treated like an explicit "off".
    if train_cfg.get("k_fold") in (False, "false", None):
        X_train, X_test, y_train, y_test = train_test_split(
            df[independent_variables],
            df[target_column],
            train_size=train_ratio,
            random_state=random_seed,
        )
        return 1, X_train, X_test, y_train, y_test

    # k folds train on (k-1)/k of the rows, hence k = ratio / (1 - ratio) + 1.
    # The small epsilon keeps float noise (e.g. 2.9999999999999996) from
    # truncating to one fold too few.
    n_splits = int(train_ratio / (1 - train_ratio) + 1 + 1e-9)
    if n_splits <= 1:
        print("Changing the train_ratio to 0.8")
        n_splits = 5
    randomly = train_cfg.get("split") == "randomly"
    ksets = kfold_dataset_creator(df=df,
                                  n_splits=n_splits,
                                  randomly=randomly,
                                  random_state=random_seed)
    return 3, ksets

In [601]:
from sklearn.preprocessing import StandardScaler

def standard_scale_column(X_train, X_test, column):
    """Standardise ``column`` with a scaler fitted on the training frame only.

    Parameters
    ----------
    X_train : DataFrame used to fit the scaler.
    X_test : DataFrame transformed with the fitted scaler, or any object
        without a ``columns`` attribute (``None``, ``-1``) when there is no
        test set.
    column : a single column label or an iterable of labels.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test, scaler)`` when a test frame is given,
        otherwise ``(scaled_train, scaler)``. The scaled values are 2-D arrays
        with one column per requested label.
    """
    # StandardScaler needs 2-D input; a bare label would select a 1-D Series.
    if hasattr(column, "__iter__") and not isinstance(column, str):
        columns = list(column)
    else:
        columns = [column]

    ss = StandardScaler()
    scaled_train_set = ss.fit_transform(X_train[columns])

    # Explicit "no test set" check replaces the former bare `except:`, which
    # also hid genuine failures such as a column missing from the test frame.
    if X_test is None or not hasattr(X_test, "columns"):
        return scaled_train_set, ss
    scaled_test_set = ss.transform(X_test[columns])
    return scaled_train_set, scaled_test_set, ss

def categorize_column(df_train, df_test, column_name):
    """Replace the values of ``column_name`` with integer codes learned on train.

    Codes start at 1 and follow the order of first appearance in ``df_train``.
    The column is overwritten in the frames that are passed in. Values that
    occur only in the test frame have no code and become NaN.

    Parameters
    ----------
    df_train : DataFrame the mapping is learned from.
    df_test : DataFrame encoded with the same mapping, or any object without a
        ``columns`` attribute (``None``, ``-1``) when there is no test set.
    column_name : label of the column to encode.

    Returns
    -------
    tuple
        ``(df_train, df_test, mapping)`` when a test frame is given, otherwise
        ``(df_train, mapping)``.
    """
    mapping_dict = {
        value: code
        for code, value in enumerate(df_train[column_name].unique(), 1)
    }
    df_train[column_name] = df_train[column_name].map(mapping_dict)

    # Explicit "no test set" check replaces the former bare `except:`, so real
    # errors (e.g. the column missing from the test frame) are no longer hidden.
    if df_test is None or not hasattr(df_test, "columns"):
        return df_train, mapping_dict
    df_test[column_name] = df_test[column_name].map(mapping_dict)
    return df_train, df_test, mapping_dict

In [602]:
### 5 feature_handling
def _config_flag(value):
    """Interpret a JSON boolean that may arrive as a real bool or as text."""
    if isinstance(value, str):
        return value.strip() == "true"
    return bool(value)


def _has_columns(obj):
    """True when ``obj`` is frame-like; callers pass -1/None for 'no test set'."""
    return obj is not None and hasattr(obj, "columns")


def _rescaling_requested(value):
    """Decide whether the ``rescaling`` entry asks for scaling.

    NOTE(review): the UI presumably sends a text such as "no rescaling" when
    the option is off - confirm against the exported JSON.
    """
    if isinstance(value, str):
        return value.strip() not in ("", "false", "none", "no rescaling")
    return bool(value)


def _impute_feature(df_train, df_test, name, details):
    """Fill the gaps of column ``name`` in place; ``df_test`` may be None."""
    strategy = str(details.get("impute_with") or "")
    frames = [df_train] if df_test is None else [df_train, df_test]
    if "average" in strategy:
        # Use the TRAINING mean for both frames so no test statistics leak in.
        fill_value = df_train[name].mean()
        for frame in frames:
            frame[name] = frame[name].fillna(fill_value)
    elif "forward" in strategy:
        for frame in frames:
            frame[name] = frame[name].ffill()
    elif "backward" in strategy:
        for frame in frames:
            frame[name] = frame[name].bfill()
    else:
        fill_value = details.get("impute_value")
        if fill_value is not None:
            for frame in frames:
                frame[name] = frame[name].fillna(fill_value)


def feature_handling(df_train, df_test, config=None):
    """Apply the per-feature settings of the config to the train/test frames.

    For every entry under ``design_state_data.feature_handling``:

    * not selected -> the column is dropped from both frames;
    * numerical, kept as a regular numerical feature -> optional imputation
      and optional standard scaling;
    * text with "tokenize and hash" -> integer encoding via ``categorize_column``;
    * anything else -> a message is printed and the column is left as is.

    Parameters
    ----------
    df_train : training DataFrame (not modified; a copy is processed).
    df_test : test DataFrame, or a non-frame placeholder (``None``, ``-1``)
        when there is no test set; a placeholder is returned unchanged.
    config : parsed configuration dict; defaults to the notebook-level ``parsed``.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` after processing.
    """
    config = parsed if config is None else config
    settings = (config.get("design_state_data") or {}).get("feature_handling") or {}

    # Work on copies so the caller's frames (and any displayed output) stay valid.
    df_train = df_train.copy()
    if _has_columns(df_test):
        df_test = df_test.copy()

    for column, spec in settings.items():
        name = spec.get("feature_name") or column
        details = spec.get("feature_details") or {}

        if not _config_flag(spec.get("is_selected")):
            df_train = df_train.drop(columns=[name], errors="ignore")
            if _has_columns(df_test):
                df_test = df_test.drop(columns=[name], errors="ignore")
            continue

        if name not in df_train.columns:
            print(f"Column {name!r} not found in the training data, skipping")
            continue
        # None when there is no usable test column for this feature.
        test_frame = df_test if _has_columns(df_test) and name in df_test.columns else None

        kind = spec.get("feature_variable_type")
        if kind == "numerical":
            # The original compared `feature_variable_type` a second time, which
            # could never match. NOTE(review): assumes the handling mode lives
            # under feature_details["numerical_handling"] - confirm with the JSON.
            if details.get("numerical_handling") == "keep as regular numerical feature":
                if details.get("missing_values") == "impute":
                    _impute_feature(df_train, test_frame, name, details)
                if _rescaling_requested(details.get("rescaling")):
                    # A list selects a 2-D block, which StandardScaler requires.
                    scaled = standard_scale_column(df_train, test_frame, [name])
                    df_train[name] = scaled[0][:, 0]
                    if test_frame is not None and len(scaled) == 3:
                        df_test[name] = scaled[1][:, 0]
        elif kind == "text":
            if details.get("text_handling") == "tokenize and hash":
                encoded = categorize_column(df_train, test_frame, name)
                df_train = encoded[0]
                if test_frame is not None and len(encoded) == 3:
                    df_test = encoded[1]
        else:
            print(f"Not a numerical feature, hence skipping feature handling for {name}")

    return df_train, df_test

In [603]:
def process_feature_handling(df, parsed):
    """Split ``df`` as configured and run feature handling on every split.

    ``parsed`` is forwarded to ``train_test_sets`` only; ``feature_handling``
    is called with the frames alone and reads its settings on its own.

    Returns
    -------
    tuple
        * ``(1, df_train, df_test)`` - single hold-out split.
        * ``(2, df_train)`` - whole dataset used for training.
        * ``(3, [(df_train, df_test), ...])`` - one processed pair per fold.

    Raises
    ------
    NotImplementedError
        When ``train_test_sets`` yields no split (unsupported sampling method).
    ValueError
        When ``train_test_sets`` reports an unknown mode code.
    """
    train_test_return = train_test_sets(df, parsed)
    if train_test_return is None:
        raise NotImplementedError(
            "train_test_sets produced no split for this configuration"
        )
    mode = train_test_return[0]

    if mode == 1:
        _, X_train, X_test, y_train, y_test = train_test_return
        df_train = X_train.merge(y_train, left_index=True, right_index=True)
        df_test = X_test.merge(y_test, left_index=True, right_index=True)
        df_train, df_test = feature_handling(df_train, df_test)
        return 1, df_train, df_test

    if mode == 2:
        _, X_train, y_train = train_test_return
        df_train = X_train.merge(y_train, left_index=True, right_index=True)
        # feature_handling always returns a pair; keep only the training frame
        # (the original stored the whole tuple as `df_train`).
        df_train, _ = feature_handling(df_train, -1)
        return 2, df_train

    if mode == 3:
        ksets = train_test_return[1]
        train_test_sets_final = []
        for train_idx, test_idx in ksets:
            # KFold yields positional indices, so select with iloc rather than
            # loc; copy so feature handling never writes into a slice of `df`.
            fold_train = df.iloc[train_idx].copy()
            fold_test = df.iloc[test_idx].copy()
            fold_train, fold_test = feature_handling(fold_train, fold_test)
            train_test_sets_final.append((fold_train, fold_test))
        return 3, train_test_sets_final

    raise ValueError(f"Unknown split mode returned by train_test_sets: {mode!r}")

In [604]:
# Split the data as configured and apply feature handling. The first element of
# the returned tuple is a mode code (1, 2 or 3) that the next cell branches on.
processed_data = process_feature_handling(df, parsed)

Changing the train_ratio to 0.8


In [605]:
# Unpack according to the mode code returned by process_feature_handling.
if processed_data[0]==1:
    df_train, df_test = processed_data[1], processed_data[2]
elif processed_data[0]==2:
    # Whole-dataset training has no hold-out frame. Bind df_test anyway so the
    # later feature_reduction(df_train, df_test, parsed) call does not fail
    # with a NameError.
    df_train, df_test = processed_data[1], None
    # Tolerate a (frame, placeholder) pair in the training slot.
    if isinstance(df_train, tuple):
        df_train = df_train[0]
elif processed_data[0]==3:
    ### need to perform
    # TODO: the per-fold frames in processed_data[1] are not consumed yet, so
    # df_train / df_test stay undefined in this mode.
    print("Multiple iterations")

In [606]:
# Peek at the first rows of the processed training frame.
df_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,species,petal_width
137,6.4,3.1,5.5,1,1.8
84,5.4,3.0,4.5,2,1.5
27,5.2,3.5,1.5,3,0.2
127,6.1,3.0,4.9,1,1.8
132,6.4,2.8,5.6,1,2.2


In [611]:
from sklearn import decomposition

def _reduction_selected(value):
    """Interpret ``is_selected``, which may be a real bool or the text 'true'."""
    if isinstance(value, str):
        return value.strip() == "true"
    return bool(value)


def _features_to_keep(method_cfg):
    """Requested feature count as an int, or None when absent/unusable.

    Both spellings used in this notebook are accepted.
    """
    raw = method_cfg.get("num_of_features_to_keep",
                         method_cfg.get("number_of_features_to_keep"))
    try:
        return max(int(raw), 0)
    except (TypeError, ValueError):
        return None


def feature_reduction(df_train, df_test, parsed, target=None):
    """Apply the selected feature-reduction method(s) to the frames.

    Only entries of ``design_state_data.feature_reduction`` whose value is a
    dict are treated as methods; each runs when its ``is_selected`` flag is set.
    If no such entry exists, "Not specified" is printed and the inputs are
    returned untouched.

    * "no reduction" - stops processing further methods.
    * "correlation" - keeps the features with the largest absolute correlation
      to the target (non-numeric columns are not ranked and are dropped), with
      the target as last column.
    * "tree-based" - placeholder, only prints a message.
    * "principal component analysis" - replaces the features with components
      named ``pca_0 ... pca_k`` fitted on the training frame.

    Parameters
    ----------
    df_train : training DataFrame including the target column.
    df_test : test DataFrame, or a non-frame placeholder (e.g. ``None``) which
        is returned unchanged.
    parsed : full parsed configuration dict.
    target : target column label; defaults to the notebook-level ``target_column``.

    Returns
    -------
    tuple
        ``(df_train, df_test)``.
    """
    target = target_column if target is None else target
    reduction_cfg = (parsed.get("design_state_data") or {}).get("feature_reduction") or {}

    # Type check instead of searching for "{" in the printed values.
    methods = {name: cfg for name, cfg in reduction_cfg.items() if isinstance(cfg, dict)}
    if not methods:
        print("Not specified")
        return df_train, df_test

    has_test = df_test is not None and hasattr(df_test, "columns")

    for each_method, method_cfg in methods.items():
        if not _reduction_selected(method_cfg.get("is_selected")):
            continue

        if "no reduction" in each_method:
            print("No reduction being done")
            break

        elif "correlation" in each_method:
            # Rank by |corr| with the target, excluding the target itself
            # (it always correlates perfectly with itself).
            strength = (
                df_train.corr(numeric_only=True)[target]
                .abs()
                .drop(labels=[target], errors="ignore")
                .sort_values(ascending=False, kind="stable")
            )
            keep = _features_to_keep(method_cfg)
            kept = set(strength.index if keep is None else strength.index[:keep])
            # Preserve the original column order and keep the target last.
            required_columns = [c for c in df_train.columns if c in kept] + [target]
            df_train = df_train[required_columns]
            if has_test:
                df_test = df_test[required_columns]

        elif "tree-based" in each_method:
            print("tree based reduction")

        elif "principal component analysis" in each_method:
            feature_columns = [c for c in df_train.columns if c != target]
            pca = decomposition.PCA(n_components=_features_to_keep(method_cfg))
            components = pca.fit_transform(df_train[feature_columns])
            component_names = [f"pca_{i}" for i in range(components.shape[1])]

            # Keep the original row index so the target re-attaches to the
            # right rows (a fresh RangeIndex misaligned the former merge).
            y_train = df_train[target]
            df_train = pd.DataFrame(components, index=df_train.index, columns=component_names)
            df_train[target] = y_train

            if has_test:
                y_test = df_test[target]
                df_test = pd.DataFrame(pca.transform(df_test[feature_columns]),
                                       index=df_test.index, columns=component_names)
                df_test[target] = y_test

    return df_train, df_test

In [612]:
# Run the configured feature-reduction step; both names are rebound to its results.
df_train, df_test = feature_reduction(df_train, df_test, parsed)

Not specified


In [613]:
# Registry of the supported estimator identifiers, grouped by prediction type.
all_models = dict(
    classification=[
        "randomforestclassifier",
        "gbtclassifier",
        "decisiontreeclassifier",
        "logisticregression",
        "knn",
    ],
    regression=[
        "randomforestregressor",
        "gbtregressor",
        "ridgeregression",
        "linearregression",
        "lassoregression",
        "elasticnetregression",
        "xgboost",
        "decisiontreeregressor",
        "svm",
        "sgd",
        "extrarandomtrees",
        "neuralnetwork",
    ],
)

In [594]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import ExtraTreesRegressor


def _hyperparam(cfg, key, cast):
    """Return ``cast(cfg[key])``, or None when the key is absent or not castable."""
    try:
        return cast(cfg.get(key))
    except (TypeError, ValueError):
        return None


def _given(**kwargs):
    """Drop unset (None) arguments so the estimator's own defaults apply."""
    return {name: value for name, value in kwargs.items() if value is not None}


def _build_random_forest(cfg):
    """Random forest; the sampling strategy maps to ``max_features``, not ``criterion``."""
    strategy = cfg.get("feature_sampling_stratergy")
    return RandomForestRegressor(**_given(
        n_estimators=_hyperparam(cfg, "max_trees", int),
        max_depth=_hyperparam(cfg, "max_depth", int),
        # Only forward values scikit-learn understands; anything else keeps the default.
        max_features=strategy if strategy in ("sqrt", "log2") else None,
    ))


def _build_gradient_boosting(cfg):
    """Gradient boosting; the lowercase key is tried first, then the original spelling."""
    stages = _hyperparam(cfg, "num_of_boostingstages", int)
    if stages is None:
        stages = _hyperparam(cfg, "num_of_BoostingStages", int)
    return GradientBoostingRegressor(**_given(
        n_estimators=stages,
        max_depth=_hyperparam(cfg, "max_depth", int),
    ))


# (accepted identifiers, factory) in the order models are trained and returned.
# The first identifier that has a dict entry under "algorithms" supplies the settings.
_REGRESSOR_BUILDERS = (
    (("randomforestregressor",), _build_random_forest),
    (("gbtregressor",), _build_gradient_boosting),
    (("ridgeregression",),
     lambda cfg: Ridge(**_given(max_iter=_hyperparam(cfg, "max_iter", int)))),
    (("linearregression",),
     lambda cfg: LinearRegression(**_given(n_jobs=_hyperparam(cfg, "n_jobs", int)))),
    (("lassoregression",),
     lambda cfg: Lasso(**_given(max_iter=_hyperparam(cfg, "max_iter", int)))),
    (("elasticnetregression",),
     lambda cfg: ElasticNet(**_given(max_iter=_hyperparam(cfg, "max_iter", int)))),
    (("decisiontreeregressor",),
     lambda cfg: DecisionTreeRegressor(**_given(max_depth=_hyperparam(cfg, "max_depth", int)))),
    # "linear" is the scikit-learn name of the kernel the original asked for.
    (("svm",), lambda cfg: SVR(kernel="linear")),
    (("sgd",),
     lambda cfg: SGDRegressor(**_given(alpha=_hyperparam(cfg, "alpha", float)))),
    # Both spellings are accepted: the config lookup uses "extra_random_trees",
    # the notebook's model registry lists "extrarandomtrees".
    (("extra_random_trees", "extrarandomtrees"),
     lambda cfg: ExtraTreesRegressor(**_given(n_estimators=_hyperparam(cfg, "num_of_trees", int)))),
)


def train_models(list_of_models, parsed, df_train, df_test):
    """Fit every requested regressor on ``df_train`` and return the fitted models.

    The last column of ``df_train`` is used as the target and all preceding
    columns as features. Hyper-parameters are read from
    ``parsed["design_state_data"]["algorithms"][<model name>]``; a value that is
    missing or cannot be converted to the expected number type is skipped, so
    the scikit-learn default applies.

    Parameters
    ----------
    list_of_models : container of model identifiers to train.
    parsed : full parsed configuration dict.
    df_train : training DataFrame with the target in the last column.
    df_test : accepted for interface compatibility; not used here.

    Returns
    -------
    list
        Fitted estimators, in the fixed order of ``_REGRESSOR_BUILDERS``.
    """
    algorithms = (parsed.get("design_state_data") or {}).get("algorithms") or {}
    features = df_train.iloc[:, :-1]
    target = df_train.iloc[:, -1]

    trained_models = []
    for names, build in _REGRESSOR_BUILDERS:
        if not any(name in list_of_models for name in names):
            continue
        cfg = next(
            (algorithms[name] for name in names if isinstance(algorithms.get(name), dict)),
            {},
        )
        model = build(cfg)
        model.fit(features, target)
        trained_models.append(model)

    return trained_models