In [78]:
import json
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import  train_test_split
from sklearn.linear_model import LinearRegression,Lasso,LogisticRegression,SGDRegressor,Ridge
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier,GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
# Blanket filter: every warning category is suppressed for the rest of the
# session, which also hides deprecation notices emitted by the libraries
# imported above. Narrow or remove this when debugging library behaviour.
warnings.filterwarnings("ignore")

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,f_regression
import json

In [83]:
class Models:
  """Config-driven training pipeline for a small tabular dataset.

  The instance loads a CSV into ``self.df`` and a JSON document into
  ``self.model_info``. Everything tunable (imputation values, target column,
  number of features to keep, selected algorithm and its search ranges) is
  read from ``self.model_info['design_state_data']``.

  Column positions and names used by ``feature_handling``,
  ``feature_generation`` and ``trans_cvrt`` are hard-coded for a frame whose
  columns are sepal_length, sepal_width, petal_length, petal_width, species
  (in that order).
  """

  def __init__(self,model_info_path='/content/algoparams_from_ui.json',datapath="/content/iris.csv"):
    """Load the dataset and the JSON configuration.

    Parameters
    ----------
    model_info_path : str
        Path of the JSON configuration file stored in ``self.model_info``.
    datapath : str
        Path of the CSV file read into ``self.df``.
    """
    self.df=pd.read_csv(datapath)
    # Context manager so the config file handle is always closed.
    with open(model_info_path) as config_file:
      self.model_info=json.load(config_file)

  def feature_handling(self):
    """Impute the four numeric columns and one-hot encode column 4.

    Columns 0 and 2 use ``SimpleImputer`` defaults; columns 1 and 3 are filled
    with the constant ``impute_value`` configured for ``sepal_width`` and
    ``petal_width``. Every other column is passed through unchanged.

    Returns
    -------
    The array produced by fitting the two-step pipeline on ``self.df``: the
    one-hot columns first, then the imputed columns, then the passthrough
    columns.
    """
    feature_cfg=self.model_info['design_state_data']["feature_handling"]
    sepal_value=feature_cfg['sepal_width']['feature_details']['impute_value']
    petal_value=feature_cfg['petal_width']['feature_details']['impute_value']

    # imputation transformer
    trf1 = ColumnTransformer([
        ('impute_sepal_length',SimpleImputer(),[0]),
        ('impute_sepal_width',SimpleImputer(strategy ='constant', fill_value=sepal_value),[1]),
        ('impute_petal_length',SimpleImputer(),[2]),
        ('impute_petal_width',SimpleImputer(strategy ='constant', fill_value=petal_value),[3])
    ],remainder='passthrough')

    # one hot encoding
    # Newer scikit-learn releases name the dense-output switch `sparse_output`;
    # older ones only know `sparse`. Try the new keyword first, fall back on
    # the TypeError an unknown keyword raises.
    try:
      encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
    except TypeError:
      encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')
    # After trf1 the imputed columns occupy positions 0-3, so the first
    # passthrough column (species) sits at index 4.
    trf2 = ColumnTransformer([
        ('ohe_species',encoder,[4])
    ],remainder='passthrough')

    pipe = Pipeline([
        ('trf1',trf1),
        ('trf2',trf2),
    ])
    return pipe.fit_transform(self.df)

  def feature_generation(self):
    """Append the engineered ``new_feature_*`` columns to ``self.df`` and return it.

    Columns are created in the order new_feature_3, new_feature_4,
    new_feature_5, new_feature_1, new_feature_2; ``trans_cvrt`` labels the
    pipeline output assuming exactly this order.
    """
    X=pd.DataFrame(zip(self.df['petal_length']/self.df['sepal_width'],self.df['petal_width']/len(self.df['species'].unique())))
    poly = PolynomialFeatures(interaction_only=True,include_bias = False)
    self.df[["new_feature_3","new_feature_4","new_feature_5"]]=poly.fit_transform(X)
    self.df['new_feature_1']=self.df["petal_length"]*self.df["sepal_width"]
    self.df['new_feature_2']=self.df["sepal_width"]/self.df["sepal_length"]
    # NOTE(review): this overwrites the first polynomial column written above;
    # kept as in the original - confirm whether a distinct name was intended.
    self.df['new_feature_3']=self.df["petal_width"]/self.df["sepal_length"]
    return self.df

  def trans_cvrt(self,d_pipe):
    """Label the transformed array, fix dtypes and split off the target.

    Parameters
    ----------
    d_pipe : array-like with 12 columns, in the order produced by
        ``feature_handling`` after ``feature_generation``.

    Returns
    -------
    (x, y) : the frame without the configured target column, and the target
        column itself. The target name is read from
        ``self.model_info['design_state_data']['target']['target']``.
    """
    cnt_int=['species_1','species_2','species_3']
    cnt_float=['sepal_length','sepal_width','petal_length','petal_width','new_feature_3','new_feature_4','new_feature_5','new_feature_1','new_feature_2']
    columns_all=cnt_int+cnt_float
    df=pd.DataFrame(data=d_pipe,columns=columns_all)
    # One astype call instead of a per-column loop; one-hot columns become
    # integers, everything else float.
    df=df.astype({**dict.fromkeys(cnt_int,'int'),**dict.fromkeys(cnt_float,'float')})

    #creating the dependent and independent frames
    # Read the target from the instance config, not from a notebook global.
    target=self.model_info['design_state_data']['target']['target']
    x=df.drop([target],axis=1)
    y=df[target]
    return x,y

  def feature_reduction(self,x,y):
    """Keep the top-k columns of ``x`` ranked by the univariate ``f_regression`` score.

    ``k`` comes from ``feature_reduction.num_of_features_to_keep`` in the
    config and is capped at the number of columns available in ``x``.
    """
    k=int(self.model_info['design_state_data']["feature_reduction"]['num_of_features_to_keep'])
    k=min(k,x.shape[1])
    selector = SelectKBest(f_regression, k=k)
    selector.fit(x, y)
    x_names=list(x.columns[selector.get_support()])
    return x[x_names]

  @staticmethod
  def model_objects():
    """Return fresh default-constructed estimators.

    The tuple order is: Ridge, LogisticRegression, LinearRegression,
    GradientBoostingRegressor, GradientBoostingClassifier,
    RandomForestClassifier, RandomForestRegressor, SGDRegressor,
    DecisionTreeClassifier, DecisionTreeRegressor, Lasso.

    Declared static so it works both as ``Models.model_objects()`` and on an
    instance.
    """
    rr=Ridge()
    logr=LogisticRegression()
    lr=LinearRegression()
    gbr_model=GradientBoostingRegressor()
    gbc_model=GradientBoostingClassifier()
    rfc_model=RandomForestClassifier()
    rfr_model=RandomForestRegressor()
    sgd=SGDRegressor()
    dtc=DecisionTreeClassifier()
    dtr=DecisionTreeRegressor()
    la=Lasso()
    return rr,logr,lr,gbr_model,gbc_model,rfc_model,rfr_model,sgd,dtc,dtr,la

  def splits(self,x,y,random_state=None):
    """Shuffle and split into 80% train / 20% test.

    ``random_state`` is forwarded to ``train_test_split``; the default
    ``None`` keeps the original unseeded behaviour, pass an int for a
    reproducible split.
    """
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,shuffle=True,random_state=random_state)
    return xtrain,xtest,ytrain,ytest

  def execution(self):
    """Run generation -> handling -> labelling -> reduction -> split.

    Returns ``(xtrain, xtest, ytrain, ytest)``.
    """
    self.df=self.feature_generation()
    d_pipe=self.feature_handling()
    x,y=self.trans_cvrt(d_pipe)
    x=self.feature_reduction(x,y)
    return self.splits(x,y)

  @staticmethod
  def _grid_range(algo_cfg,low_key,high_key):
    """Inclusive unit-step range from ``algo_cfg[low_key]`` to ``algo_cfg[high_key]``."""
    return np.arange(algo_cfg[low_key],algo_cfg[high_key]+1)

  @staticmethod
  def _forest_grid(algo_cfg):
    """Search grid shared by both random-forest estimators."""
    return {'n_estimators':Models._grid_range(algo_cfg,"min_trees","max_trees"),
            'max_depth':Models._grid_range(algo_cfg,"min_depth","max_depth"),
            'min_samples_leaf':Models._grid_range(algo_cfg,"min_samples_per_leaf_min_value","min_samples_per_leaf_max_value")}

  @staticmethod
  def _boosting_grid(algo_cfg):
    """Search grid shared by both gradient-boosting estimators."""
    # NOTE(review): scikit-learn documents `subsample` as a fraction in
    # (0, 1]; a unit-step range can produce larger values, which would fail
    # to fit and be scored as NaN - confirm the intended config semantics.
    return {'subsample':Models._grid_range(algo_cfg,"min_subsample","max_subsample"),
            'max_depth':Models._grid_range(algo_cfg,"min_depth","max_depth"),
            'n_iter_no_change':Models._grid_range(algo_cfg,"min_iter","max_iter")}

  @staticmethod
  def _iteration_grid(algo_cfg):
    """Search grid over ``max_iter`` (logistic, ridge and lasso models)."""
    return {'max_iter':Models._grid_range(algo_cfg,"min_iter","max_iter")}

  @staticmethod
  def _depth_grid(algo_cfg):
    """Search grid over ``max_depth`` (decision trees)."""
    return {'max_depth':Models._grid_range(algo_cfg,"min_depth","max_depth")}

  def _grid_search(self,estimator,param_grid):
    """Wrap ``estimator`` in a GridSearchCV configured from the ``hyperparameters`` section."""
    hyper=self.model_info['design_state_data']["hyperparameters"]
    return GridSearchCV(estimator, param_grid, scoring=None, n_jobs=hyper["parallelism"], refit=True,
                        cv=hyper["num_of_folds"],
                        verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False)

  def model_prediction(self):
    """Train the selected algorithm and print its test-set score.

    Algorithms are checked in the fixed order listed in ``candidates``; only
    the first one whose config entry has a truthy ``is_selected`` is trained.
    Regressors report mean squared error, classifiers report accuracy.
    LinearRegression is fitted directly; every other estimator is tuned with
    GridSearchCV. Nothing is printed when no algorithm is selected.

    Returns
    -------
    int
        Always ``1``.
    """
    xtrain,xtest,ytrain,ytest=self.execution()
    rr,logr,lr,gbr_model,gbc_model,rfc_model,rfr_model,sgd,dtc,dtr,la=self.model_objects()
    algorithms=self.model_info['design_state_data']['algorithms']

    def mse(digits):
      # Metric factory: mean squared error rounded to `digits` decimals.
      return lambda actual,predicted: round(mean_squared_error(actual,predicted),digits)

    # (config key, estimator, grid builder or None, message prefix, metric)
    candidates=[
        ('RandomForestRegressor',rfr_model,self._forest_grid,"Random Forest Regressor  mean squared error is",mse(4)),
        ('RandomForestClassifier',rfc_model,self._forest_grid,"Random Forest classifier  accuracy is",accuracy_score),
        ('GBTClassifier',gbc_model,self._boosting_grid,"Gradient Boosted classifier  accuracy is",accuracy_score),
        ('GBTRegressor',gbr_model,self._boosting_grid,"Gradient Boosted Regressor  mean squared error is",mse(2)),
        ('LinearRegression',lr,None,"Linear Regression  mean squared error is",mse(2)),
        ('LogisticRegression',logr,self._iteration_grid,"Logistic Regression  accuracy is",accuracy_score),
        ('RidgeRegression',rr,self._iteration_grid,"Ridge Regressor  mean squared error is",mse(2)),
        ('LassoRegression',la,self._iteration_grid,"Lasso Regressor  mean squared error is",mse(2)),
        ('DecisionTreeRegressor',dtr,self._depth_grid,"Decision Tree Regression  mean square error is",mse(2)),
        ('DecisionTreeClassifier',dtc,self._depth_grid,"Decision Tree classifier  accuracy is",accuracy_score),
    ]

    #finding out which model to run
    for name,estimator,build_grid,label,metric in candidates:
      algo_cfg=algorithms.get(name,{})
      if not algo_cfg.get('is_selected'):
        continue

      if name=='DecisionTreeClassifier':
        # Actually apply the configured split criterion to the estimator.
        estimator.set_params(criterion='gini' if algo_cfg.get("use_gini",True) else 'entropy')

      # The grid is built only for the chosen algorithm, so range keys of
      # unselected algorithms are never required.
      model=estimator if build_grid is None else self._grid_search(estimator,build_grid(algo_cfg))
      model.fit(xtrain,ytrain)
      pred_y=model.predict(xtest)
      print(f"{label} {metric(ytest,pred_y)}")
      break

    return 1

# Driver: build the pipeline with its default config/data paths and train the
# selected algorithm. `model_prediction` prints the score of the trained model.
if __name__ == '__main__':
    model_instance = Models()
    model_instance.model_prediction()
  

Random Forest Regressor  mean squared error is 0.0026
