In [13]:
def automated_modelling(input_file):
    """Train and evaluate the models described by an RTF-wrapped JSON config.

    The RTF file is converted to plain text and parsed as JSON. Everything is
    read from its ``design_state_data`` entry: the CSV path
    (``session_info.dataset``), the target column and prediction type
    (``target``), per-column selection / imputation / rescaling settings
    (``feature_handling``), the split ratio and seed (``train``) and the
    per-algorithm hyper-parameter ranges (``algorithms``).

    Parameters
    ----------
    input_file : str or path-like
        Path of the ``.rtf`` file whose text content is the JSON config.

    Returns
    -------
    pandas.DataFrame or None
        For ``prediction_type == 'Regression'`` a frame with one row per
        selected regressor and the columns ``algoritham``, ``r2_train``,
        ``r2_test``, ``adj_r2_train``, ``adj_r2_test``, ``rmse_train`` and
        ``rmse_test`` (empty when no regressor is selected). For any other
        prediction type nothing is returned; classification reports are
        printed instead.

    Raises
    ------
    KeyError
        If an expected config entry is missing, or the prediction type is not
        ``'Regression'`` / ``'Classification'``.
    """
    # Imports stay function-local, as in the original cell, so defining the
    # function does not require the optional dependencies to be installed.
    import json

    import pandas as pd
    from sklearn.model_selection import train_test_split as tts
    from sklearn.preprocessing import LabelEncoder
    from striprtf.striprtf import rtf_to_text

    # Extract the text from the RTF file and parse it as JSON.
    with open(input_file, 'r') as f:
        rtf_content = f.read()
    design = json.loads(rtf_to_text(rtf_content))['design_state_data']

    df = pd.read_csv(design['session_info']['dataset'])

    problem_type = design['target']['prediction_type']
    target_col = design['target']['target']
    feature_cfg = design['feature_handling']

    # Drop every column that is not selected. The target is always kept:
    # dropping it would make the x / y split below fail.
    unselected = [col for col in df.columns
                  if col != target_col and not feature_cfg[col]['is_selected']]
    df = df.drop(columns=unselected)

    x = df.drop(columns=target_col)
    y = df[target_col]

    random_state = design['train']['random_seed']
    train_size = design['train']['train_ratio']
    x_train, x_test, y_train, y_test = tts(x, y, train_size=train_size,
                                           random_state=random_state)
    # Work on independent copies so the column assignments done during
    # preprocessing never write into a slice of ``x``.
    x_train, x_test = x_train.copy(), x_test.copy()

    x_train, x_test = _preprocess_features(x_train, x_test, feature_cfg)

    # A text target is label-encoded for classification. The encoder is fitted
    # on the full target column (a pure label -> integer mapping) so a class
    # that only occurs in the test split does not raise on transform.
    if problem_type == 'Classification' and y.dtype == 'O':
        label_enc = LabelEncoder().fit(y)
        y_train = label_enc.transform(y_train)
        y_test = label_enc.transform(y_test)

    algo_cfg = design['algorithms']
    selected = [name for name in _ALGORITHMS[problem_type]
                if algo_cfg[name]['is_selected']]

    records = []
    for name in selected:
        estimator, grid = _build_search_space(name, algo_cfg[name], random_state)
        if problem_type == 'Regression':
            records.append(_evaluate_regressor(name, estimator, grid,
                                               x_train, x_test, y_train, y_test))
        else:
            _report_classifier(name, estimator, grid,
                               x_train, x_test, y_train, y_test)

    # Only regression produces a result table.
    if problem_type == 'Regression':
        return pd.DataFrame(records, columns=_RESULT_COLUMNS)


# Algorithms considered for each prediction type, in evaluation order.
_ALGORITHMS = {
    'Regression': ('RandomForestRegressor', 'DecisionTreeRegressor'),
    'Classification': ('RandomForestClassifier', 'DecisionTreeClassifier'),
}

# Config label of an imputation method -> SimpleImputer strategy.
_IMPUTE_STRATEGIES = {
    'Average of values': 'mean',
    'Median of values': 'median',
    'Mode of values': 'most_frequent',
}

# Column layout of the regression result table ('algoritham' is kept as-is
# because callers may already index the returned frame by that name).
_RESULT_COLUMNS = ['algoritham', 'r2_train', 'r2_test', 'adj_r2_train',
                   'adj_r2_test', 'rmse_train', 'rmse_test']

# Heading printed above each classifier's reports.
_CLASSIFIER_HEADINGS = {
    'RandomForestClassifier': 'Randomforest_classifier \n----------------------------------',
    'DecisionTreeClassifier': 'DecisionTreeClassifier \n----------------------------------',
}


def _preprocess_features(x_train, x_test, feature_cfg):
    """Impute, rescale and one-hot encode the features; return both frames.

    Every transformer is fitted on ``x_train`` only and then applied to
    ``x_test``. Imputation and rescaling are configured per numeric column via
    ``feature_cfg[col]['feature_details']``; object-dtype columns are one-hot
    encoded and replaced by their indicator columns.
    """
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

    num_cols = x_train.select_dtypes(exclude='O').columns
    cat_cols = x_train.select_dtypes(include='O').columns

    impute_cols = {strategy: [] for strategy in _IMPUTE_STRATEGIES.values()}
    std_scaling_cols = []
    min_max_scaling_cols = []

    for col in num_cols:
        details = feature_cfg[col]['feature_details']

        rescaling = details.get('rescaling')
        if rescaling == 'MinMaxScaler':
            min_max_scaling_cols.append(col)
        elif rescaling == 'StandardScaler':
            std_scaling_cols.append(col)

        # Impute only when 'missing_values' is truthy (a missing key counts as
        # enabled). The original guard was a no-op `continue`, so imputation
        # ran even when the flag was false.
        # NOTE(review): assumes a falsy 'missing_values' means "do not impute",
        # as the original inline comment suggested - confirm against the UI.
        impute_with = details.get('impute_with')
        strategy = (_IMPUTE_STRATEGIES.get(impute_with)
                    if isinstance(impute_with, str) else None)
        if strategy is not None and details.get('missing_values', True):
            impute_cols[strategy].append(col)

    # Imputation runs before rescaling, in the same order as before:
    # mean, median, mode, then standard scaling, then min-max scaling.
    steps = [(SimpleImputer(strategy=strategy), cols)
             for strategy, cols in impute_cols.items()]
    steps.append((StandardScaler(), std_scaling_cols))
    steps.append((MinMaxScaler(), min_max_scaling_cols))

    for transformer, cols in steps:
        if cols:
            x_train[cols] = transformer.fit_transform(x_train[cols])
            x_test[cols] = transformer.transform(x_test[cols])

    if len(cat_cols) > 0:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        train_encoded = ohe.fit_transform(x_train[cat_cols])
        test_encoded = ohe.transform(x_test[cat_cols])
        # Feature names only exist once the encoder has been fitted.
        encoded_cols = ohe.get_feature_names_out()

        # Replace the categorical columns by their indicators, appended after
        # the remaining columns.
        x_train = pd.concat(
            [x_train.drop(columns=cat_cols),
             pd.DataFrame(train_encoded, columns=encoded_cols, index=x_train.index)],
            axis=1)
        x_test = pd.concat(
            [x_test.drop(columns=cat_cols),
             pd.DataFrame(test_encoded, columns=encoded_cols, index=x_test.index)],
            axis=1)

    return x_train, x_test


def _int_grid(low, high, steps=4):
    """Return the distinct integers obtained by truncating ``steps`` evenly spaced values in [low, high]."""
    import numpy as np

    # De-duplicating avoids fitting the same candidate several times when the
    # configured range is narrower than the number of steps.
    return sorted({int(value) for value in np.linspace(low, high, steps)})


def _build_search_space(name, params, random_state):
    """Return ``(estimator, param_grid)`` for the algorithm ``name``.

    ``params`` is that algorithm's config entry. The estimator is seeded with
    ``random_state`` so repeated runs give the same models.
    """
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

    estimator_types = {
        'RandomForestRegressor': RandomForestRegressor,
        'RandomForestClassifier': RandomForestClassifier,
        'DecisionTreeRegressor': DecisionTreeRegressor,
        'DecisionTreeClassifier': DecisionTreeClassifier,
    }
    estimator = estimator_types[name](random_state=random_state)

    grid = {'max_depth': _int_grid(params['min_depth'], params['max_depth'])}

    if name.startswith('RandomForest'):
        grid['n_estimators'] = _int_grid(params['min_trees'], params['max_trees'])
        grid['min_samples_leaf'] = _int_grid(params['min_samples_per_leaf_min_value'],
                                             params['min_samples_per_leaf_max_value'])
        return estimator, grid

    # Decision trees: the leaf sizes are listed explicitly in the config; a
    # single scalar is wrapped so it is still a valid grid entry.
    leaf_sizes = params['min_samples_per_leaf']
    grid['min_samples_leaf'] = (list(leaf_sizes)
                                if isinstance(leaf_sizes, (list, tuple))
                                else [leaf_sizes])

    # An option list that ends up empty is left out of the grid, so the
    # estimator default applies instead of GridSearchCV rejecting the grid.
    splitters = [value for flag, value in (('use_best', 'best'), ('use_random', 'random'))
                 if params[flag]]
    if splitters:
        grid['splitter'] = splitters

    if name == 'DecisionTreeClassifier':
        criteria = [value for flag, value in (('use_gini', 'gini'), ('use_entropy', 'entropy'))
                    if params[flag]]
        if criteria:
            grid['criterion'] = criteria

    return estimator, grid


def _grid_search(estimator, grid, scoring, x_train, y_train):
    """Grid-search ``grid`` with ``scoring`` and return the best estimator, refitted on the training data."""
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(estimator, grid, scoring=scoring)
    search.fit(x_train, y_train)
    # GridSearchCV (refit=True by default) has already refitted the best
    # candidate on all of x_train, so no second manual fit is needed.
    return search.best_estimator_


def _adjusted_r2(r2, n_rows, n_features):
    """Return the adjusted R^2, or NaN when ``n_rows - n_features - 1`` is not positive."""
    dof = n_rows - n_features - 1
    if dof <= 0:
        return float('nan')
    return 1 - ((1 - r2) * (n_rows - 1)) / dof


def _evaluate_regressor(name, estimator, grid, x_train, x_test, y_train, y_test):
    """Tune a regressor and return its train/test metrics as one result row (dict)."""
    from sklearn.metrics import r2_score, root_mean_squared_error

    model = _grid_search(estimator, grid, 'neg_root_mean_squared_error', x_train, y_train)

    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)

    r2_train = r2_score(y_train, pred_train)
    r2_test = r2_score(y_test, pred_test)

    # Row / column counts of the preprocessed frames drive the adjusted R^2.
    train_n, train_k = x_train.shape
    test_n, test_k = x_test.shape

    return {'algoritham': name,
            'r2_train': r2_train,
            'r2_test': r2_test,
            'adj_r2_train': _adjusted_r2(r2_train, train_n, train_k),
            'adj_r2_test': _adjusted_r2(r2_test, test_n, test_k),
            'rmse_train': root_mean_squared_error(y_train, pred_train),
            'rmse_test': root_mean_squared_error(y_test, pred_test)}


def _report_classifier(name, estimator, grid, x_train, x_test, y_train, y_test):
    """Tune a classifier and print its train and test classification reports."""
    from sklearn.metrics import classification_report

    model = _grid_search(estimator, grid, 'f1_macro', x_train, y_train)

    print(_CLASSIFIER_HEADINGS[name])
    # Each report is printed on its own line: passing it as a second print()
    # argument prepended a space and misaligned the header row.
    print("Train_report:-- ")
    print(classification_report(y_train, model.predict(x_train)))
    print("")
    print("Test_report:--")
    print(classification_report(y_test, model.predict(x_test)))
    
                         
    
        

In [15]:
# Run the modelling pipeline on the RTF file that holds the JSON configuration.
automated_modelling(input_file='algoparams_from_ui1.json.rtf')

Randomforest_classifier 
----------------------------------
Train_report:-- 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       0.97      0.89      0.93        37
           2       0.91      0.98      0.94        43

    accuracy                           0.96       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.96      0.96      0.96       120


Test_report:-- 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.85      0.92        13
           2       0.78      1.00      0.88         7

    accuracy                           0.93        30
   macro avg       0.93      0.95      0.93        30
weighted avg       0.95      0.93      0.93        30

DecisionTreeClassifier 
----------------------------------
Train_report:-- 
              precision    recall  f1-score   support

           