In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost.sklearn import XGBRegressor

In [18]:
class AutoML_Kaggle:
    def __init__(self, typeMl, file_train,file_test, target):
        self.type_ml = typeMl
        self.target = target
        self.scaler = StandardScaler()
        self.logreg = LogisticRegression()
        self.best_estimator = None
        self.best_score = 0
        self.data_train = pd.read_csv(file_train)
        self.data_test = pd.read_csv(file_test)
        
    def _del_nan(self, X):
        #si les nan sont > 20%, on del
        for col in X.columns:
            if (X[col].isna().sum() * 100)/len(X) > 20:
                X = X.drop(col, axis = 1)
        return X
    
    def _replace_nan(self,X):
        for col in X.columns:
            if col in X.select_dtypes(include = [np.number]):
                mean = np.mean(X[col])
                X[col].fillna(mean, inplace = True)
            else:
                X[col].fillna(X[col].mode()[0], inplace = True)
        return X
    
    def _set_get_dummies(self, X):
        X = pd.get_dummies(X,drop_first= True)
        return X
    
    def _del_unused_cols(self, X):
        for col in  X.columns:
            if len(X[col].unique()) == len(X):
                X.drop(col, axis = 1)
        return X
    
    def _transform(self, X):
        return self.scaler.fit_transform(X)
    
    def _get_grid_search_cls(self):
        if self.type_ml == "classification":
            return  {"logreg":[LogisticRegression(),[
                {'penalty': ['l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr', 'multinomial'],'class_weight':['balanced', None],
                  'solver':['lbfgs','sag','newton-cg'],'max_iter':[1000],"random_state": [0]
                },
                {'penalty': ['l1','l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr'],'class_weight':['balanced', None],
                  'solver':['liblinear'],'max_iter':[1000],"random_state": [0]
                },]],
              "SVM":[SVC(),[
                {'C': [0.1,0.6,1,2],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['linear'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['rbf'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['poly'], 'degree': [2,3,4,5,6,7],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2,],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['sigmoid'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                }]],
              "RanFor":[RandomForestClassifier(),{
                   'n_estimators': [10,20],
                   "criterion": ["gini", "entropy"],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "class_weight": ["balanced", "balanced_subsample", None],
                   "random_state": [0]
                   }]}
        else:
            return  {"linreg":[LinearRegression(),[{'fit_intercept':[True, False]}]],
              "SVM":[SVR(kernel = "linear"),[
                {'C': [0.1,0.6,1,2]}]],
              "RanFor":[RandomForestRegressor(),{
                   'n_estimators': [20,30],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "random_state": [0]
                   }],
                    "XGBRegressor":[XGBRegressor(),{'nthread':[4], #when use hyperthread, xgboost may become slower
                  'objective':['reg:linear'],
                  'learning_rate': [.03,0.04, 0.05, .07], #so called `eta` value
                  'max_depth': [3,5, 6, 7],
                  'min_child_weight': [3,4,5],
                  'silent': [1],
                  'subsample': [0.7],
                  'colsample_bytree': [0.7],
                  'n_estimators': [500]}]} 
    
    def local_predict(self):
        if self.best_estimator == None:
            print("fit before predict.. ")
        else:
            return self.best_estimator.predict(self.data_test_trans)
        
    def fit(self):
        self.y = self.data_train[self.target]
        self.data_train.drop(self.target, axis = 1)
        self.full_data = self.data_train.append(self.data_test).reset_index(drop=True)
        self.full_data = self._del_unused_cols(self.full_data)
        self.full_data = self._del_nan(self.full_data)
        self.full_data = self._replace_nan(self.full_data)
        self.full_data = self._set_get_dummies(self.full_data)
        
        self.data_train = self.full_data[self.full_data.Id.isin(self.data_train["Id"].tolist())]
        self.data_test = self.full_data[self.full_data.Id.isin(self.data_test["Id"].tolist())]
        
        self.data_train_trans = self._transform(self.data_train)
        
        self.data_test_trans = self._transform(self.data_test)
        
        X_train, X_test, y_train, y_test = train_test_split(self.data_train_trans, self.y, test_size=0.20, random_state=42)
        
        models = self._get_grid_search_cls()
        for key,classificateur in models.items():
            estimat = classificateur[0]
            parameters = classificateur[1]
            clf = GridSearchCV(estimat, param_grid = parameters, return_train_score=True, cv = 5, n_jobs=-1)
            clf.fit(X_train, y_train)
            if clf.best_score_ > self.best_score:
                self.best_score = clf.best_score_
                self.best_estimator = clf.best_estimator_
                print(key,":",self.best_score,"%")

In [19]:
autoML = AutoML_Kaggle("regression", "house_pricestrain.csv","house prices_test.csv","SalePrice")
autoML.fit()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


SVM : 0.29581716825704424 %
RanFor : 0.8373963442215155 %


  if getattr(data, 'base', None) is not None and \


XGBRegressor : 0.8664586505372349 %


In [20]:
SalePrices = autoML.local_predict()

In [21]:
autoML.best_estimator

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.07, max_delta_step=0,
             max_depth=3, min_child_weight=5, missing=None, n_estimators=500,
             n_jobs=1, nthread=4, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
             subsample=0.7, verbosity=1)

In [15]:
len(autoML.data_test["Id"])

1459

In [22]:
data = {'SalePrice':SalePrices} 
submit = pd.DataFrame(data, index = autoML.data_test["Id"])
submit.to_csv("submission.csv")