In [34]:
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import f1_score, auc, roc_auc_score, confusion_matrix
import logging

logger = logging.getLogger(__name__)

In [35]:
# Reading Json file.
# I have done some changes in the json file as per the screening test word file.
myfile=open('algoparam.json','r')
data=myfile.read()
#print(type(data),'\n',data)

In [36]:
# converting data variable to json object.
obj=json.loads(data)

In [37]:
#Extracting only dataset object to get the file name.
file_path=obj['design_state_data']['session_info']['dataset']
print(file_path)
nw_pd_file=pd.read_csv(obj['design_state_data']['session_info']['dataset'])

iris_modified.csv


In [38]:
# 1) reading the target and type of the problem.
def read_target_and_prdict_type(obj,nw_pd_file):
    print("**===========================================================================================================================================**")
    print("Step 1: Reading target and prediction type...")
    global target_col,predic_type
    predic_type=obj['design_state_data']['target']['prediction_type']
    target_col=obj['design_state_data']['target']['target']
    print('Prediction Type: ',predic_type,'\nTarget Column: ',target_col)
    print("**===========================================================================================================================================**")
    return predic_type,target_col
    

In [39]:
# Ensure my function is working or not
read_target_and_prdict_type(obj,nw_pd_file)

Step 1: Reading target and prediction type...
Prediction Type:  classification 
Target Column:  species


('classification', 'species')

In [40]:
# 2) Feature Handling and imputation based on JSON file instructions and parameter.
def feature_handle_and_imputation(obj,nw_pd_file):
    global pd_file
    print("**===========================================================================================================================================**")
    print("Step 2: Handling features and performing imputation...")
    # Fetching "feature_handling" values from jSON file.
    feature_one=obj['design_state_data']['feature_handling']

    # Iterating the Features that want to handle.
    for feature in feature_one:
            print(feature)
            logger.info(f"Processing feature: {feature}")

            # select only feature from the json so that we can access the inside elements from that particular feature 
            select_feature=obj['design_state_data']['feature_handling'][feature]

            # Fetch type of the data type of the column from Json file.
            a1=obj['design_state_data']['feature_handling'][feature]['feature_variable_type']
            #b1=obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_with']
            #b1_with=obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_value']
            
            # if it is Numerical, impute, and feature is selected  then apply imputation technique.
            if a1=='numerical' and select_feature['is_selected']==True and select_feature['feature_details']['missing_values'].lower()=='impute':
                    
                    # if Imputation technique is custom then it missing_values will get filled with custom value
                    if obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_with']=='custom':
                            print('Custom: fill with',obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_value'])
                            logger.info(f"Custom: Filling missing values in {feature} with {obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_value']}")
                            nw_pd_file[feature].fillna(obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_value'],inplace=True)
                            print(feature,'\n')
                
                    # if imputation technique is selected as a "Average of mean" then it will be filled with column_mean value 
                    elif obj['design_state_data']['feature_handling'][feature]['feature_details']['impute_with']=='Average of values':
                            nw_pd_file[feature].fillna(nw_pd_file[feature].mean(),inplace=True)
                            print('Average: \n',feature,'\n',nw_pd_file[feature].mean())
                            logger.info(f"Filling missing values in {feature} with the average value: {nw_pd_file[feature].mean()}")

            # if it is Textual then imputation techniques
            if a1=='text':
                    print('textual column',feature)
                    logger.info(f"Encoding textual feature: {feature}")
                    encoder=LabelEncoder()
                    nw_pd_file[feature]=encoder.fit_transform(nw_pd_file[feature])
                    print(nw_pd_file[feature])
                    logger.debug(f"Encoded values for {feature}: {nw_pd_file[feature].head()}")
    pd_file=nw_pd_file


    print("**===========================================================================================================================================**")


    return nw_pd_file

In [41]:
feature_handle_and_imputation(obj,nw_pd_file)

Step 2: Handling features and performing imputation...
sepal_length
Average: 
 sepal_length 
 5.843333333333334
sepal_width
Custom: fill with 0
sepal_width 

petal_length
Average: 
 petal_length 
 3.758666666666666
petal_width
species
textual column species
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int32


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [42]:
# 3) splitting the dataset into train_test_split before applying feature reduction technique to avoid data lekage problem.
def split_dataset_train_test(pd_file,target_col):
    print("**===========================================================================================================================================**")
    print("Step 3: Splitting dataset into train and test...")
    global nw_X_train,nw_X_test,Y_train,Y_test
    # Setting the independent(x) and dependent(y) features.
    x=pd_file.drop(labels=target_col,axis=1)
    y=pd_file[target_col]
    
    # Fetching training ratio from the json file.
    param_of_split=obj['design_state_data']['train']
    nw_X_train, nw_X_test, Y_train, Y_test=train_test_split(x,y,train_size=param_of_split['train_ratio'],random_state=param_of_split['random_seed'])
    print("**===========================================================================================================================================**")

    return nw_X_train,nw_X_test,Y_train,Y_test

In [43]:
split_dataset_train_test(pd_file,target_col)

Step 3: Splitting dataset into train and test...


(     sepal_length  sepal_width  petal_length  petal_width
 81            5.5          2.4           3.7          1.0
 133           6.3          2.8           5.1          1.5
 137           6.4          3.1           5.5          1.8
 75            6.6          3.0           4.4          1.4
 109           7.2          3.6           6.1          2.5
 ..            ...          ...           ...          ...
 71            6.1          2.8           4.0          1.3
 106           4.9          2.5           4.5          1.7
 14            5.8          4.0           1.2          0.2
 92            5.8          2.6           4.0          1.2
 102           7.1          3.0           5.9          2.1
 
 [105 rows x 4 columns],
      sepal_length  sepal_width  petal_length  petal_width
 73            6.1          2.8           4.7          1.2
 18            5.7          3.8           1.7          0.3
 118           7.7          2.6           6.9          2.3
 78            6.0          2

In [44]:
# 4) Feature Reduction Technique
 
def feature_reduction(nw_X_train,nw_X_test):
    print("**===========================================================================================================================================**")
    print("Step 4: Performing feature reduction...")
    global X_train,X_test
    feature_red=obj['design_state_data']['feature_reduction']

    for feature_technique in feature_red:
        #print(feature_technique)
        if feature_technique=='No Reduction' and obj['design_state_data']['feature_reduction'][feature_technique]['is_selected']==True:
            print('No Reduction')
            pass

        elif feature_technique=='Correlation with target'and obj['design_state_data']['feature_reduction'][feature_technique]['is_selected']==True:
            print('Corelation with target Reduction')
            #correaltion with entire dataset
            target_corr=pd_file.corr()
            # correlation of target variable with all features.
            a=target_corr[target_col]
            #converting the above matrix into DataFrame (df).
            df=pd.DataFrame({'features':a.index,'score':abs(a.values)})
            #Dropping the target column because the correlation with itself is 1.
            df=df.drop(df[df['features']==target_col].index)
            print(df)
            # settig threshold as 0.8 this parameter is missing in the json file that's why for demo i am setting it as a 0.8.
            threshold=0.8
            #filtering features based on the threshold.
            op=df[df['score']>threshold]
            # as per the Json file i am keeping only "no_of_feature" who has high corelation parameters.  
            if len(df['features'])>feature_red['Correlation with target']['num_of_features_to_keep']:
                # sorthing in the Descending order 
                sort=df.sort_values('score',ascending=False)
                #keeping no of feature based on the json paarameter. 
                imp_features=sort.head(feature_red['Correlation with target']['num_of_features_to_keep'])
            # extracting only feature column from imp_features dataset.
            column=imp_features['features']
            # Making the list of column names only.
            lst_feature=list(column)
            print(lst_feature)
            # as per the feature reduction i am  Going to keep only those features in the training and testing dataset.            
            X_train =nw_X_train [lst_feature]
            X_test = nw_X_test[lst_feature]
            print(X_train)


        elif feature_technique=='Tree-based'and obj['design_state_data']['feature_reduction'][feature_technique]['is_selected']==True:
            print('Tree-Based Reduction')
            

        elif feature_technique=='Principal Component Analysis'and obj['design_state_data']['feature_reduction'][feature_technique]['is_selected']==True:
            print('Principal Component Analysis')
            X_train =StandardScaler().fit_transform(nw_X_train)
            X_test =StandardScaler().fit_transform(nw_X_test)
            pca=PCA(n_components=feature_red[feature_technique]['num_of_features_to_keep'])
            X_train=pca.fit_transform(nw_X_train)
            X_test=pca.transform(nw_X_test)
            print(X_train)
    print("**===========================================================================================================================================**")

    return X_train,X_test
        
        

In [45]:
feature_reduction(nw_X_train,nw_X_test)

Step 4: Performing feature reduction...
Principal Component Analysis
[[-0.32301053 -0.59992892]
 [ 1.32605926 -0.11071167]
 [ 1.79434397  0.05895252]
 [ 0.79158304  0.38070629]
 [ 2.82465587  0.75726296]
 [ 0.25609611 -0.24129577]
 [ 3.29414131  0.5402165 ]
 [ 0.54073392 -0.31709254]
 [-2.78888112  0.45244159]
 [ 3.3931068   0.45540496]
 [ 0.1127914  -0.26589449]
 [-2.74574017  0.44819639]
 [-2.87596155  0.39522425]
 [-2.30791281  0.54578463]
 [-0.65624967 -1.17817458]
 [ 1.2678643  -0.17289955]
 [-2.46699107  0.08197364]
 [-2.62534981 -0.00905817]
 [-2.41359078  0.22009348]
 [ 0.23078273 -0.45009389]
 [-2.69743004  0.36159799]
 [ 0.21452705 -0.16000407]
 [ 3.40493465  1.14499487]
 [-2.95704566 -0.09444794]
 [ 0.52759678  0.07272129]
 [ 1.21332045 -0.75592755]
 [-2.68168509  1.21661965]
 [ 1.05293291 -0.1403348 ]
 [ 1.28851584 -0.55668133]
 [ 0.1313923  -0.46759906]
 [ 0.46633708 -0.45063102]
 [ 1.79474371  0.10690025]
 [ 0.04633729 -0.76126259]
 [-2.37327719  0.85676511]
 [-0.84509435

(array([[-0.32301053, -0.59992892],
        [ 1.32605926, -0.11071167],
        [ 1.79434397,  0.05895252],
        [ 0.79158304,  0.38070629],
        [ 2.82465587,  0.75726296],
        [ 0.25609611, -0.24129577],
        [ 3.29414131,  0.5402165 ],
        [ 0.54073392, -0.31709254],
        [-2.78888112,  0.45244159],
        [ 3.3931068 ,  0.45540496],
        [ 0.1127914 , -0.26589449],
        [-2.74574017,  0.44819639],
        [-2.87596155,  0.39522425],
        [-2.30791281,  0.54578463],
        [-0.65624967, -1.17817458],
        [ 1.2678643 , -0.17289955],
        [-2.46699107,  0.08197364],
        [-2.62534981, -0.00905817],
        [-2.41359078,  0.22009348],
        [ 0.23078273, -0.45009389],
        [-2.69743004,  0.36159799],
        [ 0.21452705, -0.16000407],
        [ 3.40493465,  1.14499487],
        [-2.95704566, -0.09444794],
        [ 0.52759678,  0.07272129],
        [ 1.21332045, -0.75592755],
        [-2.68168509,  1.21661965],
        [ 1.05293291, -0.140

In [46]:
# 5) GridSearchcv: 
#    1) identify selected model. 
#    2) pass all the parameters in grid search function. 
#    3) fit and check the score

In [60]:
# 5.1 Identify selected model 
def model_gridcv_score(pd_file,X_train,X_test,Y_train,Y_test):
    print("**===========================================================================================================================================**")
    print("Step 5: Hyperparameter tuning using GridSearchCV...")
    # fetching all the listed algorithms listed in json file
    algo_find=obj['design_state_data']['algorithms']
    #Specified prediction type(Regresssion or classification) of each algorithm.
    classification=['RandomForestClassifier','GBTClassifier','LogisticRegression','xg_boost','DecisionTreeClassifier','SVM']
    regression=['RandomForestRegressor','GBTRegressor','LinearRegression','RidgeRegression','LassoRegression','ElasticNetRegression','DecisionTreeRegressor']

    # Iterate all the algorithms 
    for algo in algo_find:
        # Found algorithm If it is Regresion or classification and as per the pred_type it is not correct, and not listed in the regression list then code will get terminated. 
        if predic_type.lower()=='regression' and algo_find[algo]['is_selected']==True and algo not in regression or algo_find[algo]['is_selected']==True and algo not in classification and predic_type.lower()=='classification':
            print('Check your Prediction_Type:"',predic_type,'".\ncheck selected Model: ',algo,'\n\nThis is not a Valid selection.')
            break
        #Found algorithm If it is Regresion and listed in regression list then it will meet to the condition 
        elif algo_find[algo]['is_selected']==True and algo in regression and predic_type.lower()=='regression':
            print(algo)
            # For now testing Purpose i am considering only randomforestregression model but we can replicate this with all algorithms.
            #selected_algorithm is RandomForestRegressor as per the JSON file.
            if algo=='RandomForestRegressor':
                model=RandomForestRegressor()
                # RandomForestRegressor Model Hyperparameters which will pass to the GridSearchCV to find the best paarameters and model score.
                grid_parameters={
                    'n_estimators':[(algo_find[algo]['min_trees']+algo_find[algo]['max_trees'])//2],  # Average of min_trees and max_trees
                    'max_features':[algo_find[algo]['feature_sampling_statergy']],  # Corresponds to the "Default" strategy
                    'max_depth': [algo_find[algo]['max_depth']],  # max_depth
                    'min_samples_leaf':[(algo_find[algo]['min_samples_per_leaf_min_value']+algo_find[algo]['min_samples_per_leaf_max_value'])//2]  # Average of min_samples_per_leaf_min_value and max_value
                    #'n_jobs':None#[algo_find[algo]['parallelism']]  # Parallelism
                }
                
        #Found algorithm If it is classification and listed in classification list then it will meet to the condition.
        elif algo_find[algo]['is_selected']==True and algo in classification and predic_type.lower()=='classification':
            print(algo)
            # For now testing Purpose i am considering only classification model but we can replicate this with all algorithms.
            #selected_algorithm is RandomForestRegressor as per the JSON file.
            if algo=='RandomForestClassifier':
                model=RandomForestClassifier()
                # RandomForestClassifier Model Hyperparameters which will pass to the GridSearchCV to find the best paarameters and model score.
                grid_parameters={
                    'n_estimators':[(algo_find[algo]['min_trees']+algo_find[algo]['max_trees'])//2],
                    'max_features':[algo_find[algo]['feature_sampling_statergy']],
                    'max_depth':[algo_find[algo]['max_depth']],
                    'min_samples_leaf':[(algo_find[algo]['min_samples_per_leaf_min_value']+algo_find[algo]['min_samples_per_leaf_max_value'])//2]
                }
                
        
        
                
                
    print('Model:',model)
    print('grid_parameters:',grid_parameters) 
    # Extracted Gridserachcv hyperparameters.           
    hyperparameter_grid=obj['design_state_data']['hyperparameters']
    # if Choosen Strategy is GridSerachCv then only code will execute the further process.
    if hyperparameter_grid['stratergy']=='Grid Search':
        if hyperparameter_grid['Time-based K-fold(with overlap)']['is_selected']==True and predic_type.lower()=='regression':
            grid=GridSearchCV(model,param_grid=grid_parameters,n_jobs=hyperparameter_grid['parallelism'],cv=hyperparameter_grid['Time-based K-fold(with overlap)']['num_of_folds'])
            grid.fit(X_train,Y_train)
            print('Best Etimator:',grid.best_estimator_,'\n Best Parameter:',grid.best_params_,'\n Score:',grid.best_score_)
        elif hyperparameter_grid['Time-based K-fold(with overlap)']['is_selected']==True and predic_type.lower()=='classification':
            grid=GridSearchCV(model,param_grid=grid_parameters,n_jobs=hyperparameter_grid['parallelism'],cv=hyperparameter_grid['Time-based K-fold(with overlap)']['num_of_folds'])
            grid.fit(X_train,Y_train)
            print('Best Etimator:',grid.best_estimator_,'\n Best Parameter:',grid.best_params_,'\n Score:',grid.best_score_)
            score=obj['design_state_data']['metrics']
            if score['optomize_model_hyperparameters_for'].lower()=='auc' or score['optimize_threshold_for'].lower()==['f1score','f1-score','f_score']:
                estimator=grid.best_estimator_
                y_pred=estimator.predict(X_test)
                # calculate f1-score   
                f1score=f1_score(Y_test,y_pred,average='weighted')
                print('F1-Score',f1score)
                #if it's binary classification then only i will apply confusion matri otherwise auc_roc_curve
                if len(pd_file[target_col].unique())<=2:
                    # Cost Matrix fetching values from the json.
                    tn, fp, fn, tp = confusion_matrix(Y_test, y_pred).ravel()
                    total_gain = (tp * score["gain_for_true_positive"] +
                    tn * score["gain_for_true_negative"] +
                    fp * score["gain_for_false_positive"] +
                    fn * score["gain_for_false_negative"])
                    print('\nConfusion-Matrix: ','\nTrue_positive:',tp,'\nTrue-Negtive:',tn,'\nFalse-Positive:',fp,'\nFalse-Nagative:',fn)
                    print('\nTotal Gain: ',total_gain)

        elif hyperparameter_grid['Time-based K-fold(with overlap)']['is_selected']==False:
            grid=GridSearchCV(model,param_grid=grid_parameters,n_jobs=hyperparameter_grid['parallelism'])
            grid.fit(X_train,Y_train)
            print('Best Etimator:',grid.best_estimator_,'\n Best Parameter:',grid.best_params_,'\n Score:',grid.best_score_)
    print("**===========================================================================================================================================**")



In [61]:
model_gridcv_score(pd_file,X_train,X_test,Y_train,Y_test)

Step 5: Hyperparameter tuning using GridSearchCV...
RandomForestClassifier
Model: RandomForestClassifier()
grid_parameters: {'n_estimators': [20], 'max_features': ['auto'], 'max_depth': [30], 'min_samples_leaf': [27]}


  warn(


Best Etimator: RandomForestClassifier(max_depth=30, max_features='auto', min_samples_leaf=27,
                       n_estimators=20) 
 Best Parameter: {'max_depth': 30, 'max_features': 'auto', 'min_samples_leaf': 27, 'n_estimators': 20} 
 Score: 0.7881263616557734
F1-Score 0.6442706636255023


In [62]:
# 6) Creating Pipline to streamline the workflow.

pipline=Pipeline(steps=[
    ('Target_and_problem_type',read_target_and_prdict_type(obj,nw_pd_file)),
    ('Feature_Handling_and_Imputation',feature_handle_and_imputation(obj,nw_pd_file)),
    ('Splitting_of_dataset',split_dataset_train_test(pd_file,target_col)),
    ('Feature_Reduction',feature_reduction(nw_X_train,nw_X_test)),
    ('Hyperparameter_tunning_using_gridsearchcv',model_gridcv_score(pd_file,X_train,X_test,Y_train,Y_test))
])

Step 1: Reading target and prediction type...
Prediction Type:  classification 
Target Column:  species
Step 2: Handling features and performing imputation...
sepal_length
Average: 
 sepal_length 
 5.843333333333334
sepal_width
Custom: fill with 0
sepal_width 

petal_length
Average: 
 petal_length 
 3.758666666666666
petal_width
species
textual column species
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int64
Step 3: Splitting dataset into train and test...
Step 4: Performing feature reduction...
Principal Component Analysis
[[-0.32301053 -0.59992892]
 [ 1.32605926 -0.11071167]
 [ 1.79434397  0.05895252]
 [ 0.79158304  0.38070629]
 [ 2.82465587  0.75726296]
 [ 0.25609611 -0.24129577]
 [ 3.29414131  0.5402165 ]
 [ 0.54073392 -0.31709254]
 [-2.78888112  0.45244159]
 [ 3.3931068   0.45540496]
 [ 0.1127914  -0.26589449]
 [-2.74574017  0.44819639]
 [-2.87596155  0.39522425]
 [-2.30791281  0.54578463]
 [

  warn(
