In [11]:
# import libraries for class
import time

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# sklearn models
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, TweedieRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor 
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from six import StringIO  
from IPython.display import Image, display
from sklearn.tree import export_graphviz
import pydotplus

In [12]:
class full_regression:
    """Fit, score and grid-search a collection of scikit-learn regression models.

    Note: running a new model overwrites the previous one. The current model is
    available as ``.model`` and its description as ``.model_des``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors (``.index`` and ``.columns`` are used, so a DataFrame is required).
    y : array-like
        Regression target.
    run_all : default True. If False, no model is run automatically.
    standardize : default True. Fit a StandardScaler on the training data and apply
        it to the test data if a test set exists. The scaler is kept as ``.scaler``.
    test_size : default 0.15. Size of the hold-out test set; set to 0 for no test set.
    folds : default 6. Number of cross-validation folds (integer > 1).
    shuffle : default True. Shuffle the data for the test split and cross-validation.
    stratify : default None. Variable to stratify the train/test split by.
        NOTE(review): when this is set, cross-validation uses StratifiedKFold, which
        stratifies on ``y`` itself and therefore expects a discrete target - confirm
        this is what you want for your data.
    print_info : default True. Print the results every time a model is run.
    save_it : default False. Append every model result to a DataFrame stored in the
        global variable ``model_tracker``.
    comment : default None. Free-text comment stored in ``model_tracker``.

    Go to readme for further information:
    https://github.com/LukeBetham/machine-learning-classes/blob/master/README.md
    Created by LukeBetham
    """

    # Suffix added to ``model_des`` once a grid search has been run on the model.
    _GRID_SUFFIX = " Grid Search:"

    def __init__(self, X, y, run_all=True, standardize=True,
                 test_size=0.15, folds=6, shuffle=True, stratify=None,
                 print_info=True, save_it=False, comment=None):
        # Save settings to object
        self.folds = folds
        self.shuffle = shuffle
        self.stratify = stratify
        self.comment = comment
        self.save_it = save_it
        self.print_info = print_info
        # scikit-learn rejects a random_state when shuffle is False, so only seed
        # the splitter when it will actually shuffle.
        seed = 66 if self.shuffle else None
        splitter = KFold if self.stratify is None else StratifiedKFold
        self.kfold = splitter(self.folds, shuffle=self.shuffle, random_state=seed)
        # Option for bolding print text
        self.BOLD = '\033[1m'
        self.END = '\033[0m'
        # Create train-test split if selected
        self.X = X
        self.y = y
        self.test = test_size
        if self.test != 0:
            (self.X, self.X_test, self.y, self.y_test,
             self.index_train, self.index_test) = train_test_split(
                self.X, self.y, self.X.index, test_size=self.test, shuffle=self.shuffle,
                stratify=self.stratify, random_state=66)
        # Standardise the data if selected. The original index is kept so that
        # X stays aligned with y after a shuffled split.
        self.scaler = None
        if standardize:
            self.scaler = StandardScaler()
            self.X = pd.DataFrame(self.scaler.fit_transform(self.X),
                                  columns=self.X.columns, index=self.X.index)
            if self.test != 0:
                self.X_test = pd.DataFrame(self.scaler.transform(self.X_test),
                                           columns=self.X.columns, index=self.X_test.index)
        # Run all models
        if run_all:
            self.Linear()
            self.Linear_elastic_net()
            self.Linear_generalised()
            self.SVM_regressor()
            self.KNN_regressor()
            self.Decision_tree_model()
            self.Random_forest_model()
            self.GradientBoosting()
            self.VotingRegressor()
            self.MLP_Neural_Net()

    def _run_model(self, model, description, grid_multiple=1):
        """Register ``model`` as the current model and fit/score it via model_calc."""
        self.model = model
        self.model_des = description
        self.grid_multiple = grid_multiple
        self.model_calc()

    # Every model method defaults to ``model=None`` and builds a fresh estimator per
    # call, so a fitted estimator is never shared between calls or instances.

    def Linear(self, model=None):
        """Run an ordinary least squares model (default: LinearRegression())."""
        self._run_model(LinearRegression() if model is None else model,
                        "Linear Regression Model")

    def Linear_elastic_net(self, model=None):
        """Run an elastic-net regularised linear model (default: ElasticNet())."""
        self._run_model(ElasticNet() if model is None else model,
                        "Linear Regression Model (with Elastic Net)")

    def Linear_generalised(self, model=None):
        """Run a generalised linear model (default: TweedieRegressor()).

        power=0: Normal, power=1: Poisson, power=2: Gamma, power=3: Inverse Gaussian.
        """
        self._run_model(TweedieRegressor() if model is None else model,
                        "Generalised Linear Regression Model")

    def SVM_regressor(self, model=None):
        """Run a support vector regressor (default: SVR())."""
        self._run_model(SVR() if model is None else model, "SVM Regression Model")

    def KNN_regressor(self, model=None):
        """Run a k-nearest-neighbours regressor (default: KNeighborsRegressor())."""
        self._run_model(KNeighborsRegressor() if model is None else model,
                        "KNN Regression Model")

    def Decision_tree_model(self, print_tree=False, print_depth=5, model=None):
        """Run a decision tree (default: DecisionTreeRegressor(random_state=66)).

        If ``print_tree`` is True the fitted tree is rendered with graphviz down to
        ``print_depth`` levels and displayed as a PNG.
        """
        self._run_model(DecisionTreeRegressor(random_state=66) if model is None else model,
                        "Decision Tree Model")
        if print_tree:
            dot_data = StringIO()
            export_graphviz(self.model, out_file=dot_data, filled=True, rounded=True,
                            special_characters=True, feature_names=list(self.X.columns),
                            max_depth=print_depth)
            graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
            display(Image(graph.create_png()))

    def Random_forest_model(self, model=None):
        """Run a random forest (default: RandomForestRegressor(random_state=66))."""
        self._run_model(RandomForestRegressor(random_state=66) if model is None else model,
                        "Random Forest Model")

    def GradientBoosting(self, grad_model=None):
        """Run gradient boosting (default: GradientBoostingRegressor(random_state=66))."""
        self._run_model(
            GradientBoostingRegressor(random_state=66) if grad_model is None else grad_model,
            "Gradient Boosting Model", grid_multiple=32.4)

    def VotingRegressor(self, model=None, vote_model1=None, vote_model2=None, vote_model3=None):
        """Run a voting ensemble over three regressors.

        ``model`` may be an ensemble class (called with ``estimators=...``), an already
        built estimator instance (used as given), or None for sklearn's VotingRegressor.
        The vote models default to LinearRegression, RandomForestRegressor(random_state=66)
        and GradientBoostingRegressor(random_state=66).
        """
        if vote_model1 is None:
            vote_model1 = LinearRegression()
        if vote_model2 is None:
            vote_model2 = RandomForestRegressor(random_state=66)
        if vote_model3 is None:
            vote_model3 = GradientBoostingRegressor(random_state=66)
        estimators = [('gb', vote_model3), ('rf', vote_model2), ('lr', vote_model1)]
        if model is None:
            # Class-level names are not visible inside a method body, so this bare
            # name is the imported sklearn class, not this method.
            model = VotingRegressor
        ensemble = model(estimators=estimators) if isinstance(model, type) else model
        self._run_model(ensemble, "Voting Regressor Model (default Linear & RF & GB)")

    def MLP_Neural_Net(self, model=None):
        """Run a multi-layer perceptron regressor (default: three hidden layers of 10)."""
        if model is None:
            model = MLPRegressor(solver='adam', alpha=10**(0),
                                 hidden_layer_sizes=(10, 10, 10), activation='relu',
                                 random_state=66, batch_size=50, max_iter=500)
        self._run_model(model, 'MLP Regressor Neural Net')

    def model_calc(self):
        """Fit the current model and store train, cross-validation and test scores.

        Sets ``sc`` (train score), ``cvs`` (mean CV score), ``rmse`` (mean CV RMSE),
        ``sct``/``sctp`` (test score and its printable form), ``elaspsed`` (seconds,
        also exposed as ``elapsed``) and ``coef`` (the model's ``coef_`` or None).
        """
        start = time.perf_counter()
        # fit model
        self.model.fit(self.X, self.y)
        self.sc = self.model.score(self.X, self.y)
        self.cvs = cross_val_score(self.model, self.X, self.y, cv=self.kfold).mean()
        self.rmse = -cross_val_score(self.model, self.X, self.y, cv=self.kfold,
                                     scoring="neg_root_mean_squared_error").mean()
        # Get test score
        if self.test != 0:
            self.sct = self.model.score(self.X_test, self.y_test)
            self.sctp = str(round(self.sct, 4))
        else:
            self.sct = 'No Test Set'
            self.sctp = 'No Test Set'
        # time the running of the model; the misspelt name is kept for existing callers
        self.elaspsed = time.perf_counter() - start
        self.elapsed = self.elaspsed
        # show the results from the regression model
        if self.print_info:
            print("\n",self.BOLD + self.model_des, 'Test\nTrain Score:' + self.END, round(self.sc, 4),
                  self.BOLD + '\nCV Fold Score (r^2):' +
                  self.END, round(self.cvs, 4),
                  self.BOLD + '\nCV Fold Score (RMSE):' +
                  self.END, round(self.rmse, 4),
                  self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
            print("Time Elapsed = ", round(self.elaspsed, 2), 'secs - grid will take ~',
                  round(self.elaspsed*self.grid_multiple, 2), 'minutes to run.')
        # Not every model exposes coefficients; reset to None so a value from a
        # previously run model is never left behind.
        self.coef = getattr(self.model, "coef_", None)
        if self.save_it:
            self.tracking()
            print("Saved model to global var  = model_tracker")

    def tracking(self):
        """Append the current model's results as one row of the global ``model_tracker``."""
        global model_tracker
        row = pd.DataFrame({'model_type': self.model_des, 'model_train_score': self.sc,
                            'cv_score_r2': self.cvs, 'cv_score_rmse': self.rmse,
                            'test_score': self.sct, 'predictors': str(','.join(self.X.columns)),
                            'model_params': str(self.model), 'observations': len(self.X),
                            'time': self.elaspsed, 'comment': self.comment}, index=[0])
        existing = globals().get('model_tracker')
        if isinstance(existing, pd.DataFrame):
            # ignore_index gives every run its own row label
            model_tracker = pd.concat([existing, row], ignore_index=True)
        else:
            # first tracked run, or the global holds something that is not a tracker
            model_tracker = row

    @staticmethod
    def _default_grid_params(model_des):
        """Return the default search grid for a model description.

        Raises ValueError when no default grid exists for ``model_des``.
        """
        grids = {
            "Linear Regression Model": {'fit_intercept': [True, False]},
            "Linear Regression Model (with Elastic Net)": {
                'alpha': np.logspace(-4, 2, 7), 'l1_ratio': np.linspace(0.1, 1, 5)},
            # Combinations whose fit fails score NaN under GridSearchCV's default
            # error_score, so unsuitable powers drop out of the search.
            "Generalised Linear Regression Model": {
                'power': [0, 1, 2, 3], 'alpha': np.logspace(-3, 1, 5)},
            "SVM Regression Model": {
                'C': np.logspace(-2, 2, 5), 'epsilon': [0.01, 0.1, 1],
                'kernel': ['rbf', 'linear'], 'gamma': ['scale', 'auto']},
            "KNN Regression Model": {
                'n_neighbors': list(range(1, 20)), 'weights': ['uniform', 'distance'],
                'p': [1, 2]},
            "Decision Tree Model": {
                'max_depth': [None, 5, 6, 7, 8], 'max_features': [None, 'sqrt', 'log2'],
                'splitter': ['best', 'random'], 'min_samples_split': [2, 3, 4, 5],
                'ccp_alpha': [0.0, 0.0001, 0.001, .01, .1, 1, 10, 100]},
            "Random Forest Model": {
                'n_estimators': [100, 200, 500], 'max_depth': [None],
                'min_samples_split': [2, 6], 'max_features': [None, 'sqrt', 'log2'],
                'ccp_alpha': [0.0, 0.5, 1]},
            "Gradient Boosting Model": {
                'learning_rate': [0.01, 0.5, 1], 'max_features': [None, 'log2', 'sqrt'],
                'n_estimators': [100, 200], 'ccp_alpha': [0.0, 0.5, 0.9],
                'max_depth': [1, 3, 5], 'subsample': [1.0, 0.75, 0.5]},
            "Voting Regressor Model (default Linear & RF & GB)": {
                'weights': [None, [2, 1, 1], [1, 2, 1], [1, 1, 2]]},
            "MLP Regressor Neural Net": {
                'solver': ['adam', 'sgd'], 'alpha': np.linspace(0.00001, 1, 4),
                'hidden_layer_sizes': [(10, 10, 10, 10), (20, 20, 20), (50, 50), (100,)],
                'learning_rate': ['constant', 'invscaling', 'adaptive'],
                'activation': ['identity', 'logistic', 'tanh', 'relu']},
        }
        try:
            return grids[model_des]
        except KeyError:
            raise ValueError(
                "No default grid for model %r - pass your own dict via params. "
                "Defaults exist for: %s" % (model_des, ', '.join(grids))) from None

    def gridsearch(self, params='default'):
        """Run a grid search on the current model and keep the best estimator.

        Each model type has a default grid; pass a dict via ``params`` to use your own.
        Nothing is returned: the fitted search is stored as ``.grid``, the best
        estimator replaces ``.model``, and ``.gsc`` / ``.best`` hold the best mean CV
        score and parameters. The search uses the same folds as the CV scores
        (``.kfold``). Raises ValueError if ``params`` is 'default' and the current
        model has no default grid.
        """
        # A model that was already grid searched carries the suffix; strip it so the
        # default lookup still works and the suffix is not appended twice.
        base_des = self.model_des
        if base_des.endswith(self._GRID_SUFFIX):
            base_des = base_des[:-len(self._GRID_SUFFIX)]
        if isinstance(params, str) and params == 'default':
            self.params = self._default_grid_params(base_des)
        else:
            self.params = params
        # setup the gridsearch
        self.grid = GridSearchCV(self.model, self.params, verbose=1, cv=self.kfold)
        self.grid.fit(self.X, self.y)
        self.gsc = self.grid.best_score_
        self.best = self.grid.best_params_
        self.model = self.grid.best_estimator_
        self.model_des = base_des + self._GRID_SUFFIX
        self.coef = getattr(self.model, "coef_", None)
        # Check test score for grid
        if self.test != 0:
            self.sct = self.model.score(self.X_test, self.y_test)
            self.sctp = str(round(self.sct, 4))
        else:
            self.sct = 'No Test Set'
            self.sctp = 'No Test Set'
        # Print Grid results
        if self.print_info:
            print(self.BOLD + self.model_des + self.END)
            print(self.BOLD + "Best Mean CV Model Score:" + self.END, round(self.gsc, 4),
                  self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
            print(self.BOLD + 'Grid Best Parameters:\n' + self.END, self.best)
            print(self.BOLD + '\nSearch Parameters:\n' + self.END, self.params)




In [13]:
# Load the training data. `pop` removes the target column "y" from df, so the
# statement order matters: X is sliced from the frame that no longer holds "y".
df = pd.read_csv("train_amended.csv")
y = df.pop("y")
# Keep every remaining column from the third one onwards as predictors.
X = df.iloc[:,2:]

In [14]:
# Display the predictor frame to check which columns were kept.
X

Unnamed: 0,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank",Population using at least basic drinking-water services (%),...,Current health expenditure (% of GGP),"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII)
0,0.689316,65.638091,15088.072865,0.886753,15.136626,12.777039,0.751931,0.848223,157.189161,110.010529,...,11.294665,0.812198,0.752921,0.996141,114.110048,156.344873,9.792015,18.591266,30.336116,0.503575
1,0.690273,62.438111,22524.652152,0.520520,12.659825,7.604448,0.608262,0.577814,189.642778,63.094446,...,5.540216,0.505484,0.636787,0.900665,174.950196,166.649398,21.369380,2.258941,20.659666,0.824864
2,1.055824,87.572474,35478.761084,0.889595,18.034259,12.481260,0.951648,0.967120,100.924078,108.922466,...,9.452410,0.975801,0.911907,1.006970,90.403900,55.715254,19.438685,3.488755,28.197710,0.602528
3,1.061596,85.128977,21120.552199,0.679580,15.279956,13.228658,0.779531,0.950648,123.076566,123.088027,...,9.620653,0.834897,0.837501,1.059593,160.459662,145.505845,4.419405,2.969704,15.128145,0.659367
4,0.691074,65.679071,20590.608266,0.545087,9.581990,6.505547,0.580999,0.432198,178.118196,81.211784,...,5.323853,0.531498,0.626837,0.813624,146.877670,161.677601,18.719322,7.723378,40.966939,0.764898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,0.942393,77.787047,14458.508309,0.677131,13.516310,14.937129,0.916229,0.781201,122.783465,92.271993,...,9.602095,0.727922,0.918455,0.898370,134.446206,140.396767,20.172121,5.853958,14.871016,0.568084
330,0.753508,63.168599,16167.944393,0.563057,14.673566,5.835209,0.617273,0.571536,243.115606,58.373749,...,8.308261,0.631838,0.702224,0.942951,168.404814,186.253437,3.013558,14.472500,-3.240981,0.702192
331,1.029704,82.832063,34310.471408,0.855094,18.578586,10.557143,0.906573,0.862826,144.896214,116.585709,...,10.392312,0.943410,0.902237,1.060532,124.564121,143.907576,26.438719,3.023709,29.294865,0.580785
332,0.937869,75.877098,36899.067719,0.929494,16.153857,9.151665,0.865822,0.747577,164.692000,121.672753,...,10.296360,0.915225,0.798083,1.055118,163.664516,184.291155,20.637654,4.470596,31.085400,0.517558


In [15]:
# Run every model on the full data (no hold-out set, 3-fold CV) and log each
# result to the global model_tracker.
reg_model = full_regression(
    X,
    y,
    run_all=True,
    standardize=True,
    test_size=0,
    folds=3,
    save_it=True,
    comment="First run with no nulls",
)

NameError: name 'StandardScaler' is not defined

In [None]:
# Show the results table that full_regression writes to the global
# `model_tracker` when save_it=True.
model_tracker