In [59]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [68]:
class AutoML:
    def __init__(self, typeMl, file, target):
        self.type_ml = typeMl
        self.target = target
        self.scaler = StandardScaler()
        self.logreg = LogisticRegression()
        self.best_estimator = None
        self.best_score = 0
        self.data = pd.read_csv(file)
    
    def _del_unused_cols(self):
        for col in  self.data.columns:
            if len(self.data[col].unique()) == len(self.data):
                self.data.drop(col, axis = 1)
    
    def _del_nan(self):
        #si les nan sont > 20%, on del
        for col in self.data.columns:
            if (self.data[col].isna().sum() * 100)/len(self.data) > 20:
                self.data = self.X.drop(col, axis = 1)
    
    def _replace_nan(self):
        for col in self.data.columns:
            if col in self.data.select_dtypes(include = [np.number]):
                mean = np.mean(self.data[col])
                self.data[col].fillna(mean, inplace = True)
            else:
                self.data[col].fillna(self.data[col].mode, inplace = True)
    
    def _set_X_y(self):
        self.y = self.data[self.target]
        self.X = self.data.drop(self.target, axis=1)
    
    def _set_get_dummies(self):
        for col in self.X.columns:
            if col in self.X.select_dtypes(include = [np.object]):
                self.X = pd.concat([self.X,pd.get_dummies(self.X[col])], axis=1)
                self.X = self.X.drop(col, axis = 1)
                
    def _transform(self):
        self.X_trans = self.scaler.fit_transform(self.X)
            
    def pre_processing(self):
        
        self._del_unused_cols()
        self._del_nan()
        self._replace_nan()
        self._set_X_y()
        self._set_get_dummies()
        self._transform()
        
        return train_test_split(self.X_trans, self.y, test_size=0.20, random_state=42)
    
    def predict(self, X_test):
        if self.best_estimator == None:
            print("fit before predict.. ")
        else:
            return self.best_estimator.predict(X_test)
    
    
    def get_accuracy(self,y_true, y_pred):
        return  accuracy_score(y_true, y_pred)
    
    
    def fit(self, X, y):
        models = self._get_grid_search_cls(self.type_ml)
        for key,classificateur in models.items():
            estimat = classificateur[0]
            parameters = classificateur[1]
            clf = GridSearchCV(estimat, param_grid = parameters, return_train_score=True, cv = 5, n_jobs=-1)
            clf.fit(X, y)
            if clf.best_score_ > self.best_score:
                self.best_score = clf.best_score_
                self.best_estimator = clf.best_estimator_
                print(key,":",self.best_score,"%")
                    
   
    def _get_grid_search_cls(self, gs_type):
        if gs_type == "classification":
            return  {"logreg":[LogisticRegression(),[
                {'penalty': ['l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr', 'multinomial'],'class_weight':['balanced', None],
                  'solver':['lbfgs','sag','newton-cg'],'max_iter':[1000],"random_state": [0]
                },
                {'penalty': ['l1','l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr'],'class_weight':['balanced', None],
                  'solver':['liblinear'],'max_iter':[1000],"random_state": [0]
                },]],
              "SVM":[SVC(),[
                {'C': [0.1,0.6,1,2],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['linear'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['rbf'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['poly'], 'degree': [2,3,4,5,6,7],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2,],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['sigmoid'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                }]],
              "RanFor":[RandomForestClassifier(),{
                   'n_estimators': [10,20],
                   "criterion": ["gini", "entropy"],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "class_weight": ["balanced", "balanced_subsample", None],
                   "random_state": [0]
                   }]}
        else:
            return  {"linreg":[LinearRegression(),[
                {'penalty': ['l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr', 'multinomial'],'class_weight':['balanced', None],
                  'solver':['lbfgs','sag','newton-cg'],'max_iter':[1000],"random_state": [0]
                },
                {'penalty': ['l1','l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr'],'class_weight':['balanced', None],
                  'solver':['liblinear'],'max_iter':[1000],"random_state": [0]
                },]],
              "SVM":[SVC(kernel = "linear"),[
                {'C': [0.1,0.6,1,2],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['linear'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['rbf'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['poly'], 'degree': [2,3,4,5,6,7],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2,],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['sigmoid'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                }]],
              "RanFor":[RandomForestRegressor(),{
                   'n_estimators': [100,150],
                   "criterion": ["gini", "entropy"],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "random_state": [0]
                   }]}

In [69]:
autoML = AutoML("classification", "Social_Network_Ads.csv","Purchased")
X_train, X_test, y_train, y_test = autoML.pre_processing()
autoML.fit(X_train,y_train)

logreg : 0.834375 %
SVM : 0.90625 %
RanFor : 0.909375 %


In [63]:
y_pred = autoML.predict(X_test)

In [64]:
autoML.get_accuracy(y_pred, y_test)

0.9

In [65]:
autoML.best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)