# Screening Test: Dendrite.ai AI/ML startup
As part of the screening test, you will write code to parse the JSON file provided(algoparams_from_ui) and kick off in sequence the following machine learning steps programmatically. Keep in mind your final code should be able to parse any Json that follows this format. It is crucial you have a generic parse that can read the various steps like feature handling, feature generation and model building using Grid search after parsing hyper params.


### Issue 1
It seems like the JSON part is mixed with RTF formatting in "algoparams_from_ui.json.rtf"  file. 
This needs to be removed, however would the testing code provide  a clean JSON file or one similar to this? 

### Solution
1. "\par" seems to preceed text encoded with the correct JSON format :recognize and split at "\par" 
2. replace "\{" and "\}" strings with correct format 
3. ignore lines if they have "}{" character 

In [312]:
import json 
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [313]:
# JSON file loading and data parsing 
jsonFilePath = 'algoparams_from_ui.json.rtf'

def parseRTFJsonFile(jsonFilePath):
    JSONFILE = "{\n"
    with open(jsonFilePath)  as f:
        for line in f: 
            if line.startswith("\par"):
                data_comp = line.split("\par")[1]
                if "\{" in data_comp:
                    data_comp = data_comp.replace("\{" ,"{")
                elif "\}" in data_comp:
                    data_comp  = data_comp.replace("\}", "}")
                if not "}{" in data_comp:
                    JSONFILE += data_comp
    jsonData = json.loads(JSONFILE)
    return jsonData

jsonData = parseRTFJsonFile(jsonFilePath=jsonFilePath)
# reading in the csv file 
irisDf = pd.read_csv("iris.csv")

#### TASK 1 : Read the target and type of regression to be run

In [314]:
regData = jsonData['design_state_data']['target']
regType  = regData['type']
print("Prediction type: {}; Predict: {}".format(regType, regData['target']))

Prediction type: regression; Predict: petal_width


#### TASK 2 : Missing data processing for specific columns
From the task text description it is clear that only imputation needs to be specified  and rest of the processing can be ignored

In [315]:
availCols = irisDf.columns.tolist() 
processCols = jsonData['design_state_data']['feature_handling']
label_encoders = {}
for cols in processCols:
    if cols in availCols:
       if processCols[cols]['feature_variable_type'] == 'text':
            label_encoders[cols] = LabelEncoder()
            irisDf[cols] = label_encoders[cols].fit_transform(irisDf[cols])

       if "missing_values" in processCols[cols]['feature_details']:
        imputeData = processCols[cols]['feature_details']['missing_values']
        imputeType = processCols[cols]['feature_details']['impute_with']
        if imputeData == "Impute":
            if imputeType == 'Average of values':
                irisDf[cols] = irisDf[cols].fillna(irisDf[cols].mean())
                print(f"Impute data {imputeType} impute value {irisDf[cols].mean()}")
            else:
                irisDf[cols] = irisDf[cols].fillna(processCols[cols]['feature_details']['impute_value'])
                print(f"Impute data {imputeType} impute value {processCols[cols]['feature_details']['impute_value']}")

        
    else:
        print("The impute column is not found in the iris dataframe")

Impute data Average of values impute value 5.843333333333334
Impute data custom impute value -1
Impute data Average of values impute value 3.758666666666666
Impute data custom impute value -2


#### TASK 3 : Compute feature reduction based on input. 
There can be No Reduction, Corr with Target, Tree-based, PCA. 

In [316]:
featureRed =  jsonData['design_state_data']['feature_reduction']
reductionMethod  = featureRed['feature_reduction_method']

In [317]:
def performFeatureRedcution(method = reductionMethod, irisDf = irisDf, method_para = featureRed, predict =regData['target'] ):
     # Assuming 'data' is a pandas DataFrame containing your features and target
    X = irisDf.drop(columns=[predict])
    y = irisDf[predict]
    
    if method == 'No Reduction':
        return X, y
    
    elif method == 'Corr with Target':
        numToKeep = int(featureRed['num_of_features_to_keep'])
        # Select top k features based on ANOVA F-value between each feature and the target
        selector = SelectKBest(score_func=f_classif, k=numToKeep)
        selected_features = selector.fit_transform(X, y)
        print(f'Corr with target feature reduction; features to keep: {numToKeep}')
        return pd.DataFrame(selected_features, columns=X.columns[selector.get_support()]), y
    
    elif method == 'Tree-based':
        numToKeep = int(method_para['num_of_features_to_keep'])
        numTrees = int(method_para['num_of_trees'])
        depthTrees = int(method_para['depth_of_trees'])
        print(f'Random forest based feature reduction; features to keep: {numToKeep}, number of trees: {numTrees}, depth of tree: {depthTrees}')
    
        # Tree-based feature selection
        clf = RandomForestRegressor(max_depth=depthTrees, n_estimators=numTrees)
        clf.fit(X, y)
        
        # Threshold needs to be kept low to return sufficient values 
        feature_selector = SelectFromModel(clf, max_features= numToKeep, threshold=1e-5, prefit=True)
        selected_features = feature_selector.transform(X)
        return pd.DataFrame(selected_features, columns=X.columns[feature_selector.get_support()]), y
    
    elif method == 'PCA':
        # Principal Component Analysis (PCA) for dimensionality reduction
        numToKeep = int(featureRed['num_of_features_to_keep'])
        print(f'PCA feature reduction; features to keep: {numToKeep}')

        pca = PCA(n_components=numToKeep)
        reduced_features = pca.fit_transform(X)
        return pd.DataFrame(reduced_features, columns=['PCA1', 'PCA2', 'PCA3']), y

In [318]:
X, y = performFeatureRedcution()

Random forest based feature reduction; features to keep: 4, number of trees: 5, depth of tree: 6


#### TASK 4 : Create Models 

In [319]:
# import all the models 
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC, SVR
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import GridSearchCV

In [320]:
predictionType = jsonData['design_state_data']['target']
print(f"Hyper parameters for Grid Search CV : {jsonData['design_state_data']['hyperparameters']}")
algorithms = jsonData['design_state_data']['algorithms']

regression_models = {
    'RandomForestRegressor': RandomForestRegressor,
    'GBTRegressor': GradientBoostingRegressor,
    'LinearRegression': LinearRegression,
    'RidgeRegression': Ridge,
    'LassoRegression': Lasso,
    'ElasticNetRegression': ElasticNet,
    'xg_boost': xgb.XGBRegressor,
    'DecisionTreeRegressor': DecisionTreeRegressor,
    'SVM': SVR,
    'extra_random_trees': ExtraTreesRegressor,
    'neural_network': MLPRegressor,
    'SGD': SGDRegressor,
    'KNN': KNeighborsRegressor
}

classification_models = {
    'RandomForestClassifier': RandomForestClassifier,
    'GBTClassifier': GradientBoostingClassifier,
    'LogisticRegression': LogisticRegression,
    'extra_random_trees': ExtraTreesClassifier,
    'neural_network': MLPClassifier,
    'KNN': KNeighborsClassifier,
    'SGD': SGDClassifier
}


Hyper parameters for Grid Search CV : {'stratergy': 'Grid Search', 'shuffle_grid': True, 'random_state': 1, 'max_iterations': 2, 'max_search_time': 3, 'parallelism': 5, 'cross_validation_stratergy': 'Time-based K-fold(with overlap)', 'num_of_folds': 6, 'split_ratio': 0, 'stratified': True}


### Issue 2
The hyperparameters are not provided in a list like system which is essential for GridSearchCV. The solution 
is to read data manually, even if there was automated approach, it will require a lot of conditionals and might not work

In [326]:
parameters = {
    "RandomForestClassifier": {
        "n_estimators": [10, 30],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [20, 30],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 10],
        "bootstrap": [True, False]
    },
    "RandomForestRegressor": {
        "n_estimators": [10, 20],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [20, 25],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 10],
        "bootstrap": [True, False]
    },
    "GBTClassifier": {
        "n_estimators": [67, 89],
        "learning_rate": [0.1],
        "subsample": [1, 2],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_depth": [5, 7],
        "min_impurity_decrease": [0.0],
        "min_impurity_split": [None],
        "init": [None],
        "random_state": [None],
        "max_features": [None],
        "alpha": [0.9],
        "verbose": [0],
        "max_leaf_nodes": [None],
        "warm_start": [False],
        "presort": ["auto"],
        "validation_fraction": [0.1],
        "n_iter_no_change": [None],
        "tol": [1e-4]
    },
    "GBTRegressor": {
        "n_estimators": [67, 89],
        "learning_rate": [0.1],
        "subsample": [0.1, 0.2],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "max_depth": [5, 7],
        "alpha": [0.9],
        "validation_fraction": [0.1],
    },
    "LinearRegression": {
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "copy_X": [True, False]
    },
    "LogisticRegression": {
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
        "max_iter": [100, 200, 300]
    },
    "RidgeRegression": {
        "alpha": [0.5, 0.8],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "copy_X": [True, False],
        "max_iter": [30, 50],
        "tol": [1e-3, 1e-4]
    },
    "LassoRegression": {
        "alpha": [0.5, 0.8],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "precompute": [True, False],
        "copy_X": [True, False],
        "max_iter": [30, 50],
        "tol": [1e-3, 1e-4],
        "warm_start": [True, False],
        "positive": [True, False],
        "selection": ['cyclic', 'random']
    },
    "ElasticNetRegression": {
        "alpha": [0.5, 0.8],
        "l1_ratio": [0.5, 0.8],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "precompute": [True, False],
        "copy_X": [True, False],
        "max_iter": [30, 50],
        "tol": [1e-3, 1e-4],
        "warm_start": [True, False],
        "positive": [True, False],
        "selection": ['cyclic', 'random']
    },
    "xg_boost": {
        "n_estimators": [56,89],
        "learning_rate": [0.1, 0.001],
        "booster": ['gbtree','dart'],
        "random_state": [0],
    },
    "DecisionTreeRegressor": {
        "criterion": ["mse", "friedman_mse", "mae"],
        "splitter": ["best", "random"],
        "max_depth": [4, 7],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [12, 6],
        "min_weight_fraction_leaf": [0.0],
        "max_features": [None, "auto", "sqrt", "log2"],
        "random_state": [None],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.0],
        "min_impurity_split": [None],
        "ccp_alpha": [0.0]
    },
    "DecisionTreeClassifier": {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": [4, 7],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [12, 6],
        "min_weight_fraction_leaf": [0.0],
        "max_features": [None, "auto", "sqrt", "log2"],
        "random_state": [None],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.0],
        "min_impurity_split": [None],
        "ccp_alpha": [0.0],
        "class_weight": [None, "balanced"],
        "presort": [False, True]
    },
    "SVM": {
        "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
        "degree": [3],
        "gamma": ['scale', 'auto'],
        "coef0": [0.0],
        "shrinking": [True, False],
        "tol": [7],
        "max_iter": [7],
    },
    "SGD": {
        "penalty": ['l2', 'l1', 'elasticnet'],
        "alpha": [79, 56],
        "l1_ratio": [0.15, 0.25],
        "fit_intercept": [True, False],
        "max_iter": [1000],
        "tol": [56],
        "shuffle": [True, False],
        "verbose": [False],
        "epsilon": [0.1],
        "learning_rate": ['constant', 'optimal', 'invscaling', 'adaptive'],
        "power_t": [0.5],
        "early_stopping": [False],
        "validation_fraction": [0.1],
        "n_iter_no_change": [5],
        "warm_start": [False],
        "average": [False],
    },
    "KNN": {
        "n_neighbors": [78],
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        "leaf_size": [30],
        "p": [2],
        "metric": ['minkowski', 'euclidean', 'manhattan', 'chebyshev'],
        "metric_params": [None],
        "n_jobs": [None]
    },
    "extra_random_trees": {
        "n_estimators": [45, 489],
        "criterion": ["mse", "mae"],
        "max_depth": [12, 45],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [78, 56],
        "min_weight_fraction_leaf": [0.0],
        "bootstrap": [True, False],
    },
    "neural_network": {
        "hidden_layer_sizes": [(67,), (89,)],
        "solver": [ 'sgd', 'adam'],
        "alpha": [0.0001],
        "batch_size": ['auto'],
        "learning_rate_init": [0.001],
        "power_t": [0.5],
        "max_iter": [200],
        "shuffle": [True],
        "random_state": [None],
        "tol": [1e-4],
        "verbose": [False],
        "warm_start": [False],
        "momentum": [0.9],
        "nesterovs_momentum": [True],
        "early_stopping": [False],
        "validation_fraction": [0.1],
        "beta_1": [0.9],
        "beta_2": [0.999],
        "epsilon": [1e-8],
        "n_iter_no_change": [10],
        "max_fun": [15000]
    }
}

In [327]:
# Choose the appropriate dictionary based on the predictionType
if regType == 'regression':
    models = regression_models
else:
    models = classification_models

# Instantiate the model based on algo
for algo in models:
    # Is it selected 
    if algorithms[algo]['is_selected']:
        model_class = models[algo]
        model = model_class()
        params = algorithms[algo]
        output_params = parameters[algo]

        grid_search = GridSearchCV(model, output_params, cv=5, scoring='neg_mean_squared_error')
        # Step 4: Fit the models
        grid_search.fit(X, y)
        # Step 5: Report results
        print("Best parameters found: ", grid_search.best_params_)
        print("Best estimator found: ", grid_search.best_estimator_)
        print("Best score found: ", grid_search.best_score_)

Best parameters found:  {'bootstrap': False, 'criterion': 'mse', 'max_depth': 45, 'min_samples_leaf': 56, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 45}
Best estimator found:  ExtraTreesRegressor(max_depth=45, min_samples_leaf=56, min_samples_split=5,
                    n_estimators=45)
Best score found:  -0.7146832235311743
