# Automated Machine Learning

Similar to AutoML and pycaret, the objective of this project is to create an automated machine learning library

In [1]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import *
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import *
from sklearn.datasets import make_classification
from threading import Thread
import math
import inspect
import numpy as np
import pandas as pd

In [3]:
df1 = make_classification(n_samples = 100, n_features = 100)

In [4]:
X = pd.DataFrame(df1[0], columns = range(1,101))
y = pd.DataFrame(df1[1], columns = ["Response"])
df1 = X.join(y)

In [5]:
class AML:
    def __init__(self, df, feature, response, response_type):
        self.df = df
        self.features = self.df.loc[:,feature]
        self.response = self.df.loc[:,response]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.features, self.response)
        self.random_state = 42
        self.model_df = []
        self.response_type = response_type
        if len(pd.Series(self.response).unique()) == 2:
            self.average = "binary"
        else:
            self.average = "micro"
        
    ###--------------------------------------------------------
    ### METRICS CALCULATION ###
    ###--------------------------------------------------------
       
    def regression_metrics(self, model):
        model.fit(self.X_train, self.y_train)
        prediction = model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, prediction)
        mae = mean_absolute_error(self.y_test, prediction)
        mape = mean_absolute_percentage_error(self.y_test, prediction)
        rmse = math.sqrt(mse)
        self.model_df.append([model.__class__.__name__, round(mse,4), round(mae,4), round(mape,4), round(rmse,4)])
    
    def classification_metrics(self, model):
        model.fit(self.X_train, self.y_train)
        prediction = model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, prediction)
        precision = precision_score(self.y_test, prediction, average = self.average)
        recall = recall_score(self.y_test, prediction, average = self.average)
        f1 = f1_score(self.y_test, prediction, average = self.average)
        self.model_df.append([model.__class__.__name__, round(accuracy,4), round(precision,4), round(recall,4), round(f1,4)])
        
    ###--------------------------------------------------------
    ### REGRESSION MODELS ###
    ###--------------------------------------------------------
    def linear_reg(self):
        model = LinearRegression()
        self.regression_metrics(model)
    
    def ridge_reg(self):
        model = Ridge(random_state=self.random_state)
        self.regression_metrics(model)
    
    def lasso_reg(self):
        model = Lasso(random_state=self.random_state)
        self.regression_metrics(model)
        
    def elasticNet_reg(self):
        model = ElasticNet(random_state=self.random_state)
        self.regression_metrics(model)
    
    def RFRegressor(self):
        model = RandomForestRegressor(random_state=self.random_state)
        self.regression_metrics(model)
    
    def DTRegressor(self):
        model = DecisionTreeRegressor(random_state=self.random_state)
        self.regression_metrics(model)
    
    def adaBoostRegressor(self):
        model = AdaBoostRegressor(random_state=self.random_state)
        self.regression_metrics(model)
    
    def gradBoostRegressor(self):
        model = GradientBoostingRegressor(random_state=self.random_state)
        self.regression_metrics(model)
    
    def extraTreeRegressor(self):
        model = ExtraTreesRegressor(random_state=self.random_state)
        self.regression_metrics(model)
    ###--------------------------------------------------------
    ### CLASSIFICATION MODELS
    ###--------------------------------------------------------
    def log_reg(self):
        model = LogisticRegression(random_state=self.random_state)
        self.classification_metrics(model)
    
    def RFClassifier(self):
        model = RandomForestClassifier(random_state=self.random_state)
        self.classification_metrics(model)
    
    def DTClassifier(self):
        model = DecisionTreeClassifier(random_state=self.random_state)
        self.classification_metrics(model)
    
    def adaBoostClassifier(self):
        model = AdaBoostClassifier(random_state=self.random_state)
        self.classification_metrics(model)
    
    def gradBoostClassifier(self):
        model = GradientBoostingClassifier(random_state=self.random_state)
        self.classification_metrics(model)
    
    def extraTreeClassifier(self):
        model = ExtraTreesClassifier(random_state=self.random_state)
        self.classification_metrics(model)
    
    ###--------------------------------------------------------
    ### RUN ALL APPROPRIATE MODELS
    ###--------------------------------------------------------
    def run(self):
        if self.response_type == "continuous":
            linearRegThread = Thread(target = self.linear_reg())
            ridgeRegThread = Thread(target = self.ridge_reg())
            lassoRegThread = Thread(target = self.lasso_reg())
            elasticNetRegThread = Thread(target = self.elasticNet_reg())
            RFRegressorThread = Thread(target = self.RFRegressor())
            DTRegressorThread = Thread(target = self.DTRegressor())
            adaBoostRegressorThread = Thread(target = self.adaBoostRegressor())
            gradBoostRegressorThread = Thread(target = self.gradBoostRegressor())
            extraTreeRegressorThread = Thread(target = self.extraTreeRegressor())
            
            threads = [linearRegThread, ridgeRegThread, lassoRegThread, elasticNetRegThread, 
                    RFRegressorThread, DTRegressorThread, adaBoostRegressorThread, gradBoostRegressorThread, 
                    extraTreeRegressorThread]
            columns = ["Model", "MSE", "MAE", "MAPE", "RMSE"]
            
        elif self.response_type == "categorical":
            log_regThread = Thread(target = self.log_reg())
            RFClassifierThread = Thread(target = self.RFClassifier())
            DTClassifierThread = Thread(target = self.DTClassifier())
            adaBoostClassifierThread = Thread(target = self.adaBoostClassifier())
            gradBoostClassifierThread = Thread(target = self.gradBoostClassifier())
            extraTreeClassifierThread = Thread(target = self.extraTreeClassifier())
            
            threads = [log_regThread, RFClassifierThread, DTClassifierThread, adaBoostClassifierThread, gradBoostClassifierThread, extraTreeClassifierThread]
            columns = ["Model", "MSE", "MAE", "MAPE", "RMSE"]
        for thread in threads:
            thread.start()
            thread.join()
        return pd.DataFrame(self.model_df, columns = columns).sort_values(by = columns[1])


        

In [7]:
autoML1 = AML(df1, range(1,101), "Response", "categorical")

In [8]:
%time
autoML1.run()

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11 µs


Unnamed: 0,Model,MSE,MAE,MAPE,RMSE
0,LogisticRegression,0.84,0.8462,0.8462,0.8462
1,RandomForestClassifier,0.96,0.9286,1.0,0.963
2,DecisionTreeClassifier,0.96,0.9286,1.0,0.963
3,AdaBoostClassifier,0.96,0.9286,1.0,0.963
4,GradientBoostingClassifier,0.96,0.9286,1.0,0.963
5,ExtraTreesClassifier,1.0,1.0,1.0,1.0
