In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [27]:
class AutoML:
    def __init__(self, typeMl, file, target):
        self.type_ml = typeMl
        self.target = target
        self.scaler = StandardScaler()
        self.logreg = LogisticRegression()
        self.best_estimator = None
        self.best_score = 0
        self.data = pd.read_csv(file)
        self.data_cache = None
    
    def _set_X_y(self):
        y = self.data[self.target]
        X = self.data.drop(self.target, axis=1)
        return X, y
        
    def _del_unused_cols(self, X):
        for col in  X.columns:
            if len(X[col].unique()) == len(X):
                X.drop(col, axis = 1)
        return X
    
    def _del_nan(self, X):
        #si les nan sont > 20%, on del
        for col in X.columns:
            if (X[col].isna().sum() * 100)/len(X) > 20:
                X = X.drop(col, axis = 1)
        return X
    
    def _replace_nan(self,X):
        for col in X.columns:
            if col in X.select_dtypes(include = [np.number]):
                mean = np.mean(X[col])
                X[col].fillna(mean, inplace = True)
            else:
                X[col].fillna(X[col].mode()[0], inplace = True)
        return X
    
    
    def _set_get_dummies(self, X):
        X = pd.get_dummies(X,drop_first= True)
        return X
                
    def _transform(self, X):
        #self.X_trans = self.scaler.fit_transform(X)
        return self.scaler.fit_transform(X)
    
    def preprocess_X(self, X):
        X = self._del_unused_cols(X)
        X = self._del_nan(X)
        X = self._replace_nan(X)
        X = self._set_get_dummies(X)
        X = self._transform(X)
        return X
    
    
    def preprocess(self):
        X, y = self._set_X_y()
        X = self.preprocess_X(X)
        return X, y 
    
    def split(self):
        X, y = self.preprocess()
        return train_test_split(X, y, test_size=0.20, random_state=42)
    
        
    def local_predict(self, X_test):
        if self.best_estimator == None:
            print("fit before predict.. ")
        else:
            return self.best_estimator.predict(X_test)
    
    def external_predict(self,X_test):
        if self.best_estimator == None:
            print("fit before predict.. ")
        else:
            X_test = self.preprocess_X(X_test)
            return self.best_estimator.predict(X_test)
        
    def get_accuracy(self,y_true, y_pred):
        return  accuracy_score(y_true, y_pred)
    
    def get_rmse(self,y_true, y_pred):
        return mean_squared_error(y_true, y_pred)
                
    
    def fit(self, X, y):
        models = self._get_grid_search_cls()
        for key,classificateur in models.items():
            estimat = classificateur[0]
            parameters = classificateur[1]
            clf = GridSearchCV(estimat, param_grid = parameters, return_train_score=True, cv = 5, n_jobs=-1)
            clf.fit(X, y)
            if clf.best_score_ > self.best_score:
                self.best_score = clf.best_score_
                self.best_estimator = clf.best_estimator_
                print(key,":",self.best_score,"%")
                    
   
    def _get_grid_search_cls(self):
        if self.type_ml == "classification":
            return  {"logreg":[LogisticRegression(),[
                {'penalty': ['l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr', 'multinomial'],'class_weight':['balanced', None],
                  'solver':['lbfgs','sag','newton-cg'],'max_iter':[1000],"random_state": [0]
                },
                {'penalty': ['l1','l2'],'C':[0.1,0.6,1],
                  'multi_class':['ovr'],'class_weight':['balanced', None],
                  'solver':['liblinear'],'max_iter':[1000],"random_state": [0]
                },]],
              "SVM":[SVC(),[
                {'C': [0.1,0.6,1,2],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['linear'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['rbf'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2], 'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['poly'], 'degree': [2,3,4,5,6,7],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                },
                {'C': [0.1,0.6,1,2,],'class_weight':['balanced', None],
                 'gamma': ['scale','auto'], 'kernel': ['sigmoid'],
                 'decision_function_shape': ['ovo', 'ovr'],"random_state": [0]
                }]],
              "RanFor":[RandomForestClassifier(),{
                   'n_estimators': [10,20],
                   "criterion": ["gini", "entropy"],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "class_weight": ["balanced", "balanced_subsample", None],
                   "random_state": [0]
                   }]}
        else:
            return  {"linreg":[LinearRegression(),[{'fit_intercept':[True, False]}]],
              "SVM":[SVR(kernel = "linear"),[
                {'C': [0.1,0.6,1,2]}]],
              "RanFor":[RandomForestRegressor(),{
                   'n_estimators': [20,30],
                   "max_depth": [8, 10, 12, None],
                   "min_samples_split": [2, 5],
                   "max_features": ["sqrt", "log2", None],
                   "bootstrap": [True, False],
                   "random_state": [0]
                   }]} 

In [19]:
autoML = AutoML("classification", "Social_Network_Ads.csv","Purchased")
X_train, X_test, y_train, y_test = autoML.split()
autoML.fit(X_train,y_train)

logreg : 0.834375 %
SVM : 0.909375 %


In [20]:
y_pred = autoML.local_predict(X_test)

In [21]:
autoML.get_accuracy(y_pred, y_test)

0.9125

In [22]:
autoML.best_estimator

SVC(C=2, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [28]:
#house_pricestrain.csv
autoML = AutoML("regression", "house_pricestrain.csv","SalePrice")
X_train, X_test, y_train, y_test = autoML.split()
autoML.fit(X_train,y_train)



SVM : 0.29580543007784277 %
RanFor : 0.8447123548835395 %


In [30]:
y_pred = autoML.local_predict(X_test)
autoML.get_rmse(y_pred, y_test)

1041984840.016786

In [38]:
test_data = pd.read_csv("house prices_test.csv")
test_data_y_pred = autoML.external_predict(test_data)
print(test_data_y_pred)
#test_data.shape
#autoML2 = AutoML("regression", "house prices_test.csv","SalePrice")
#autoML2.preprocess_without_split()
#autoML2.data_cache.head()

ValueError: Number of features of the model must match the input. Model n_features is 233 and input n_features is 217 

In [19]:
result = autoML.best_estimator.predict(autoML2.data_cache)

ValueError: Number of features of the model must match the input. Model n_features is 219 and input n_features is 155 

In [6]:
xxx = pd.read_csv("house_pricestrain.csv")
xxx["MasVnrType"]

0       BrkFace
1          None
2       BrkFace
3          None
4       BrkFace
         ...   
1455       None
1456      Stone
1457       None
1458       None
1459       None
Name: MasVnrType, Length: 1460, dtype: object

In [42]:
xxx1 = pd.read_csv("house prices_test.csv")
xxx1

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
