In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scikitplot as skplt

sns.set(font_scale=1.5)
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from matplotlib.colors import ListedColormap
from sklearn import datasets, metrics
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments and discard them."""
    return None


# Swap the real implementation for the no-op above so that anything calling
# ``warnings.warn(...)`` from here on emits nothing in the notebook output.
warnings.warn = warn

In [3]:
class full_classification:
    """Fit, score and grid-search a handful of scikit-learn classifiers on one dataset.

    On construction the data is optionally split into train/test sets and
    optionally standardised, then a 5-nearest-neighbours model, a decision
    tree and a logistic regression are fitted in that order. Each fit prints
    its training, cross-validated and (if a test split exists) test accuracy
    next to ``baseline``.

    Running any ``*_model`` method overwrites the current model. The current
    estimator is available as ``.model`` and its label as ``.model_des``.

    Created by LukeBetham

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` is used to label coefficients.
    y : array-like
        Target labels.
    baseline : float, default 0
        Baseline accuracy every reported score is compared against.
    standardize : str, default "none"
        Any value other than ``"none"`` standardises the features with a
        ``StandardScaler`` fitted on the training split.
    test_size : float or int, default 0.15
        Passed to ``train_test_split``; ``0`` disables the hold-out split.
    folds : int, default 6
        Number of ``KFold`` splits used for every cross-validated score.
    shuffle : bool, default True
        Whether ``KFold`` shuffles the rows before splitting.

    NOTE(review): ``penalty='none'``, ``base_estimator=`` and
    ``max_features='auto'`` are the spellings used by older scikit-learn
    releases - confirm they match the installed version.
    """

    # Seed shared by the train/test split, the CV splitter and the default estimators.
    _SEED = 66
    # Suffix appended to ``model_des`` once a grid search has been run.
    _GRID_SUFFIX = " Grid Search:"

    def __init__(self, X, y, baseline=0, standardize="none", test_size=0.15, folds=6, shuffle=True):
        # Cross-validation settings
        self.folds = folds
        self.shuffle = shuffle
        # ANSI escape codes used to bold headings in the printed reports
        self.BOLD = '\033[1m'
        self.END = '\033[0m'
        self.X = X
        self.y = y
        self.baseline = baseline
        self.test = test_size
        # No coefficients are available until a model exposing ``coef_`` is fitted
        self.coef = None
        # Hold out a test set unless the caller disabled it with test_size=0
        if self._has_test():
            self.X, self.X_test, self.y, self.y_test = train_test_split(
                self.X, self.y, test_size=self.test, random_state=self._SEED)
        # Standardise using statistics from the training rows only
        if standardize != 'none':
            scaler = StandardScaler()
            # Keep the original row index so X stays aligned with y
            self.X = pd.DataFrame(
                scaler.fit_transform(self.X), columns=X.columns, index=self.X.index)
            if self._has_test():
                self.X_test = pd.DataFrame(
                    scaler.transform(self.X_test), columns=X.columns, index=self.X_test.index)
        # Fit the three standard models; the logistic model is left as current
        self.knn_model(5)
        self.decision_tree_model()
        self.logistic_model()

    # ------------------------------------------------------------------ helpers

    def _has_test(self):
        """Return True when a hold-out test split was requested."""
        return self.test != 0

    def _cv(self):
        """Return the KFold splitter used for every cross-validated score."""
        # A seed only matters when shuffling, and newer scikit-learn raises if
        # random_state is given together with shuffle=False.
        return KFold(self.folds, shuffle=self.shuffle,
                     random_state=self._SEED if self.shuffle else None)

    def _max_k(self):
        """Return the upper bound on k used when sweeping all neighbour counts."""
        return int(len(self.y)*(1-(1/self.folds))-1)

    def _describe_test_score(self, score):
        """Format a test score together with its margin over the baseline."""
        return str(round(score, 4))+" - better than baseline by " + \
            str(round(score-self.baseline, 4))

    def _print_timing(self, grid_factor):
        """Print the last fit time and a rough grid-search duration estimate."""
        print("Time Elapsed = ", round(self.elaspsed, 2), 'secs - grid will take ~',
              round(self.elaspsed*grid_factor, 2), 'minutes to run.\n')

    def _default_grid_params(self, description):
        """Return the default search grid for the model labelled ``description``.

        Raises ValueError when no default grid exists for that label.
        """
        if description == "Logistic Regression Model":
            return {'penalty': ['l1', 'l2', 'elasticnet'], 'solver': ['saga'], 'C': np.logspace(-5, 5, 5), 'l1_ratio': np.linspace(0.0001, 1, 4)}
        if description == "K Neighbors Model":
            return {'n_neighbors': range(1, 20, 1), 'weights': ['uniform', 'distance'], 'p': [1, 2]}
        if description == "Decision Tree Model":
            return {'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, 6, 7, 8], 'max_features': ['auto'], 'splitter': [
                'best', 'random'], 'min_samples_split': [2, 3, 4, 5], 'ccp_alpha': [0.0, 0.0001, 0.001, .01, .1, 1, 10, 100], 'class_weight': [None, 'balanced']}
        if description == "Random Forest Model":
            return {'n_estimators': [100, 200, 500], 'criterion': ['gini', 'entropy'], 'max_depth': [None], 'min_samples_split': [2, 6], "max_features": ["auto", "log2"],
                    'oob_score': [True, False], 'warm_start': [True, False], 'ccp_alpha': [0.0, 0.5, 1]}
        if description == "Boosting Model":
            return {"learning_rate": [0.05, 0.25, 0.5, 0.75, 1],
                    'base_estimator': [DecisionTreeClassifier(max_depth=depth) for depth in (1, 2, 3, 4, 5)],
                    'algorithm': ['SAMME'], "n_estimators": [100, 200, 500, 1000]}
        raise ValueError(
            "No default grid for model %r - pass params explicitly." % (description,))

    # ------------------------------------------------------------------- models

    def logistic_model(self, Logistic=None):
        """Fit and report a logistic regression and make it the current model.

        Logistic : estimator, optional
            Estimator to fit. Defaults to a fresh unpenalised
            ``LogisticRegression(penalty='none', max_iter=1000)``.
        """
        # Build the default per call so one estimator is never shared between
        # calls or instances.
        if Logistic is None:
            Logistic = LogisticRegression(penalty='none', max_iter=1000)
        self.model = Logistic
        self.model_des = "Logistic Regression Model"
        self.model_calc()
        print("Run .coefs() to see coef dataframe\nTime Elapsed = ", round(self.elaspsed, 2),
              'secs - grid will take ~', round(self.elaspsed*30, 2), 'minutes to run.\n')

    def knn_model(self, k='all', weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
        """Fit a K-nearest-neighbours model, or sweep every k and plot the scores.

        With a numeric ``k`` the model is fitted, reported and becomes the
        current model. With ``k='all'`` every k from 1 up to (but excluding)
        ``max_k`` is cross-validated, the mean scores are stored in
        ``self.scores`` and plotted against the baseline, and the best k is
        stored in ``self.knn_best``; the current model is left untouched.
        The remaining arguments are forwarded to ``KNeighborsClassifier``.

        Raises ValueError if the ``'all'`` sweep has no k values to try.
        """
        knn_options = dict(weights=weights, algorithm=algorithm, leaf_size=leaf_size,
                           p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs)
        if k != 'all':
            self.model = KNeighborsClassifier(n_neighbors=k, **knn_options)
            self.model_des = "K Neighbors Model"
            self.model_calc()
            print("Set k='all' to run full set of ks and graph.\nTime Elapsed = ", round(self.elaspsed, 2), 'secs - grid will take ~', round(
                self.elaspsed*7, 2), 'minutes to run - and all ks', round(self._max_k()*self.elaspsed/120, 2), 'mins\n')
            return

        # Sweep all candidate ks and graph the mean CV accuracy of each
        self.max_k = self._max_k()
        candidates = range(1, self.max_k)
        splitter = self._cv()
        self.scores = [
            np.mean(cross_val_score(
                KNeighborsClassifier(n_neighbors=n, **knn_options), self.X, self.y, cv=splitter))
            for n in candidates
        ]
        if not self.scores:
            raise ValueError(
                "Not enough samples to sweep k (max_k=%d)." % self.max_k)
        # Candidates start at 1, so the position of the best score is k - 1
        self.knn_best = int(np.argmax(self.scores)) + 1
        plt.plot(candidates, self.scores, label='Mean CV Scores')
        plt.hlines(self.baseline, 1, self.max_k, label='baseline')
        plt.xlabel('k')
        plt.ylabel('accuracy')
        plt.legend(loc=[1.1, 0])
        print(self.BOLD + "Highest KNN Score:" + self.END, self.knn_best)
        plt.show()

    def decision_tree_model(self, print_tree="y", DecisionTree=None):
        """Fit and report a decision tree and make it the current model.

        print_tree : str
            Accepted for backward compatibility; tree rendering is not
            implemented, so the value is currently ignored.
        DecisionTree : estimator, optional
            Estimator to fit. Defaults to a fresh gini / best-split tree with
            unlimited depth and ``random_state=66``.
        """
        if DecisionTree is None:
            DecisionTree = DecisionTreeClassifier(
                criterion='gini', splitter='best', max_depth=None, random_state=self._SEED)
        self.model = DecisionTree
        self.model_des = "Decision Tree Model"
        self.model_calc()
        self._print_timing(50)

    def random_forest_model(self, forest=None):
        """Fit and report a random forest and make it the current model.

        forest : estimator, optional
            Estimator to fit. Defaults to a fresh 100-tree gini forest with
            unlimited depth, ``min_samples_split=2`` and ``random_state=66``.
        """
        if forest is None:
            forest = RandomForestClassifier(
                n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, random_state=self._SEED)
        self.model = forest
        self.model_des = "Random Forest Model"
        self.model_calc()
        self._print_timing(9.5)

    def boosting_model(self, plot_it=True, estimators=100, base_estimator=None):
        """Fit and report an AdaBoost (SAMME) model and make it the current model.

        plot_it : bool
            When equal to ``True``, plot the staged training score and, if a
            test split exists, the staged test score.
        estimators : int
            Number of boosting rounds.
        base_estimator : estimator, optional
            Weak learner. Defaults to a fresh depth-3 decision tree with
            ``random_state=66``.
        """
        if base_estimator is None:
            base_estimator = DecisionTreeClassifier(max_depth=3, random_state=self._SEED)
        self.model = AdaBoostClassifier(
            base_estimator=base_estimator, n_estimators=estimators, algorithm='SAMME', random_state=self._SEED)
        self.model_des = "Boosting Model"
        self.model_calc()
        self._print_timing(2.5)
        # Equality (not truthiness) is kept so string flags such as 'n' still
        # disable the plot, as they always have.
        if plot_it == True:  # noqa: E712
            plt.plot(list(self.model.staged_score(self.X, self.y)),
                     label='training score', lw=2)
            # The test curve needs a hold-out split
            if self._has_test():
                plt.plot(list(self.model.staged_score(
                    self.X_test, self.y_test)), label='test score', lw=2)
            plt.xlabel('iteration')
            plt.ylabel('score')
            plt.legend()
            plt.show()

    # ------------------------------------------------------------------ results

    def coefs(self):
        """Return the current model's coefficients as a DataFrame.

        Columns are the feature names of ``self.X``. The frame is also stored
        in ``self.dfc``.

        Raises AttributeError when the current model exposes no ``coef_``.
        """
        if getattr(self, 'coef', None) is None:
            raise AttributeError(
                "The current model has no coefficients (no coef_ attribute).")
        self.dfc = pd.DataFrame(self.coef, columns=self.X.columns)
        return self.dfc

    def model_calc(self):
        """Fit the current model, score it and print the report.

        Sets ``self.sc`` (training accuracy), ``self.cvs`` (mean CV accuracy),
        ``self.sct`` / ``self.sctp`` (test accuracy and its formatted text,
        ``sctp`` is None without a test split), ``self.elaspsed`` (seconds
        spent fitting and scoring) and ``self.coef`` (``coef_`` of the model,
        or None when the model has none).
        """
        started = time.time()
        self.model.fit(self.X, self.y)
        self.sc = self.model.score(self.X, self.y)
        self.cvs = cross_val_score(self.model, self.X, self.y, cv=self._cv()).mean()
        if self._has_test():
            self.sct = self.model.score(self.X_test, self.y_test)
            self.sctp = self._describe_test_score(self.sct)
        else:
            self.sctp = None
        self.elaspsed = time.time() - started
        # Show the results from the classification model
        print(self.BOLD + self.model_des, 'Test\nModel Score:' + self.END, round(self.sc, 4), "- better than baseline by", round(self.sc-self.baseline, 4),
              self.BOLD + '\nCV Fold Score:' +
              self.END, round(
                  self.cvs, 4), "- better than baseline by", round(self.cvs-self.baseline, 4),
              self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
        # Reset to None for models without coef_ so stale coefficients from a
        # previous model are never reported for this one.
        self.coef = getattr(self.model, 'coef_', None)
        print("Use .gridsearch() to run full regularisation tests using all default for current model.",
              "\nUse .knn_model() or .logistic_model() or .decision_tree_model() to change model and specify paramters.")

    def gridsearch(self, params='default'):
        """Run a cross-validated grid search on the current model and report it.

        params : dict, list of dict or 'default'
            Grid passed to ``GridSearchCV``. With ``'default'`` a built-in grid
            is chosen from the current model's label (logistic regression,
            K neighbours, decision tree, random forest or boosting).

        Nothing is returned. The fitted search is stored in ``self.grid``, the
        best estimator becomes ``self.model``, and ``self.gsc``, ``self.best``
        and ``self.params`` hold the best mean CV score, the best parameters
        and the grid that was searched.

        Raises ValueError when ``params='default'`` and the current model has
        no built-in grid.
        """
        # Strip any earlier suffix so repeated searches still find their
        # default grid and the label is not extended on every call.
        base_description = self.model_des.replace(self._GRID_SUFFIX, "")
        if isinstance(params, str) and params == 'default':
            self.params = self._default_grid_params(base_description)
        else:
            self.params = params

        # Set up and run the grid search
        self.grid = GridSearchCV(self.model, self.params, verbose=1, cv=self._cv())
        self.grid.fit(self.X, self.y)
        self.gsc = self.grid.best_score_
        self.best = self.grid.best_params_
        self.model = self.grid.best_estimator_
        self.model_des = base_description + self._GRID_SUFFIX
        self.coef = getattr(self.model, 'coef_', None)
        # Check the test score of the best estimator
        if self._has_test():
            self.sct = self.model.score(self.X_test, self.y_test)
            self.sctp = self._describe_test_score(self.sct)
        else:
            self.sctp = None
        # Print grid results
        print(self.BOLD + self.model_des + self.END)
        print(self.BOLD + "Best Mean CV Model Score:" + self.END, round(self.gsc, 4), "- which is better than baseline by",
              round(self.gsc-self.baseline, 4), self.BOLD + "\nModel Test Score:" + self.END, self.sctp)
        print(self.BOLD + 'Grid Best Parameters:\n' + self.END, self.best)
        print(self.BOLD + '\nSearch Parameters:\n' + self.END, self.params)
        # Refresh the coefficient frame only for models that have coefficients
        if self.coef is not None:
            self.coefs()

    def matrix_n_graphs(self):
        """Plot the confusion matrix, ROC and precision-recall curves on the test set.

        Stores the test predictions in ``self.y_pred``.

        Raises AttributeError when the instance was built with ``test_size=0``.
        """
        if not self._has_test():
            raise AttributeError(
                "matrix_n_graphs needs a test split; this instance was created with test_size=0.")
        print(self.BOLD + self.model_des, "on X_test" + self.END)
        # Use the classes the model was trained on rather than a fixed label list
        class_labels = list(self.model.classes_)
        self.y_pred = self.model.predict(self.X_test)
        # Probabilities feed both curve plots, so compute them once
        test_probabilities = self.model.predict_proba(self.X_test)
        skplt.metrics.plot_confusion_matrix(
            self.y_test, self.y_pred, figsize=(8, 8), labels=class_labels, normalize=True)
        plt.ylim([-0.5, len(class_labels)-0.5])
        plt.show()
        # One palette colour per class so no two curves share a colour
        cmap = ListedColormap(sns.color_palette("husl", len(class_labels)))
        skplt.metrics.plot_roc(self.y_test, test_probabilities, plot_micro=False,
                               plot_macro=False, title_fontsize=20, text_fontsize=16, figsize=(8, 8), cmap=cmap)
        plt.show()
        fig, ax = plt.subplots(figsize=(8, 8))
        skplt.metrics.plot_precision_recall(
            self.y_test, test_probabilities, plot_micro=False, title_fontsize=20, text_fontsize=16, cmap=cmap, ax=ax)
        ax.legend(loc=[1.1, 0])
        plt.show()
