In [None]:
# compare algorithms
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as mpl
import numpy as np
from pandas_profiling import ProfileReport

import os, timeit
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder, normalize
from sklearn.impute import SimpleImputer 
from tpot import TPOTRegressor, TPOTClassifier

from sklearn.model_selection import RepeatedStratifiedKFold


# Describe independent and dependent variables

def insert_section(n=2):
    """Print ``n`` newline characters (plus print's own line ending), then a dashed rule.

    Args:
        n: How many newline characters to emit ahead of the separator line.
    """
    divider = '-----------------------------------------------'
    # Joining both pieces with '\n' yields the same bytes as two consecutive prints.
    print('\n' * n, divider, sep='\n')

def samplesize(dataset, n=1000):
    """Return how many rows to sample from ``dataset``: ``n``, capped at its row count.

    Args:
        dataset: Any object exposing ``shape`` whose first entry is the row count.
        n: Requested sample size.

    Returns:
        ``n`` when the dataset has more than ``n`` rows, otherwise the row count.
    """
    row_count = dataset.shape[0]
    return n if row_count > n else row_count

def get_label_info(dataset, varlist):
    """Print the cardinality of each listed column, and its levels when there are fewer than 10.

    Args:
        dataset: Frame-like object indexable by column name.
        varlist: Iterable of column names to report on.
    """
    for var in varlist:
        print('\n\n')
        levels = dataset[var].unique()  # distinct values, looked up once per column
        print("Number of levels in category '{0}': \b {1:2.2f} ".format(var, levels.size))
        if levels.size >= 10:
            # High-cardinality column: report the count only.
            continue
        print("Levels for catgeory '{0}': {1}".format(var, levels))


def encode_decode_frame(data):
    """Label-encode every column of ``data`` and then decode the result back.

    One ``LabelEncoder`` is kept per column, keyed by the column's name.

    Args:
        data: DataFrame whose columns are encoded independently.

    Returns:
        tuple: ``(encoded_data, labeled_data)`` - the encoded frame, and the frame
        produced by passing those codes back through each column's encoder.
    """
    from collections import defaultdict
    from sklearn.preprocessing import LabelEncoder

    # A new encoder is created the first time a column name is looked up.
    encoder_dict = defaultdict(LabelEncoder)

    def _encode(column):
        return encoder_dict[column.name].fit_transform(column)

    def _decode(column):
        return encoder_dict[column.name].inverse_transform(column)

    encoded_data = data.apply(_encode)
    labeled_data = encoded_data.apply(_decode)

    return encoded_data, labeled_data


class MultiColumnLabelEncoder:
    """Apply one ``LabelEncoder`` per DataFrame column.

    ``columns`` names the columns to encode; ``None`` means every column of the
    frame handed to the method being called.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode, or None for all columns

    def _selected_columns(self, X):
        """Return the configured columns, falling back to all columns of ``X``."""
        if self.columns is None:
            return X.columns
        return self.columns

    def fit(self, X, y=None):
        """Fit a fresh encoder on each selected column of ``X``; returns ``self``."""
        self.encoders = {}
        for name in self._selected_columns(X):
            encoder = LabelEncoder()
            self.encoders[name] = encoder.fit(X[name])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each selected column replaced by its codes."""
        encoded = X.copy()
        for name in self._selected_columns(X):
            encoded[name] = self.encoders[name].transform(X[name])
        return encoded

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy."""
        fitted = self.fit(X, y)
        return fitted.transform(X)

    def inverse_transform(self, X):
        """Return a copy of ``X`` with each selected column mapped back through its encoder."""
        decoded = X.copy()
        for name in self._selected_columns(X):
            decoded[name] = self.encoders[name].inverse_transform(X[name])
        return decoded


def num_str_cols(df):
    """Split the columns of ``df`` into numeric and string-like column names.

    Dtypes are classified with ``pandas.api.types`` rather than by comparing against
    specific NumPy types: ``np.object`` no longer exists in current NumPy releases,
    and a fixed int64/int32/float64 list misses other numeric widths.

    Args:
        df: DataFrame to inspect.

    Returns:
        tuple: ``(numeric_cols, string_cols)`` - two lists of column names in frame
        order. Boolean columns, and any other dtype that is neither numeric nor
        object/string, appear in neither list.
    """
    numeric_cols = []  # could still have ordinal data
    string_cols = []   # could have ordinal or nominal data
    for col, dtype in df.dtypes.items():
        if pd.api.types.is_bool_dtype(dtype):
            # pandas counts bool as numeric; keep it out, as the original check did.
            continue
        if pd.api.types.is_numeric_dtype(dtype):
            numeric_cols.append(col)      # true integer or float columns
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            string_cols.append(col)       # nominal and ordinal columns
    return numeric_cols, string_cols

In [None]:
# Load dataset (read relative to the current working directory, before any chdir)
dataset = pd.read_csv("mushrooms.csv")

# The project directory below is specific to one machine. Only switch to it when it
# exists, so the notebook does not fail with FileNotFoundError elsewhere.
project_dir = "C:\\Users\\manka\\Documents\\GitHub\\Machine-learning-quickbook"
if os.path.isdir(project_dir):
    os.chdir(project_dir)

# Independent (feature) columns and the dependent (target) column
ind = ['cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment',
       'gill-spacing','gill-size','gill-color','stalk-shape','stalk-root',
       'stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring',
       'stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type',
       'spore-print-color','population','habitat']
dep = ['class']

# EDA
# print() and insert_section() return None; wrapping them in display() only added a
# stray "None" to the output, so they are called directly.
print('Display datatypes of the data')
display(dataset.dtypes)

print('Null profile of the data below')
display(dataset.isnull().sum())

insert_section()

get_label_info(dataset, ind)

# Sample size used by the (currently disabled) profiling report below
n = samplesize(dataset,1000)
print(n)

# report
# profile = ProfileReport(dataset.sample(n))
# profile.to_file("report.html")

In [None]:
# select X matrix as all but one as outcome variable
# X = dataset.reindex(columns=[x for x in dataset.columns.values if x != 'class'])        # separate out X
X = dataset.reindex(columns=ind)        # separate out X
y = dataset.reindex(columns=dep)        # separate out y

# Classify the columns once; `numeric_cols` is needed below to rebuild the imputed frame
# (it was previously referenced without ever being defined).
numeric_cols, string_cols = num_str_cols(X)

# Fixing X matrix
# String variables: simplest form of imputation, missing values become their own label
X_string = X[string_cols]
X_string = X_string.fillna("missing")

# Numeric variables: impute with the most frequent value because some of these
# numeric columns are ordinal. SimpleImputer marks missing entries with np.nan;
# the string 'NaN' belonged to the removed `Imputer` API.
n_imputer = SimpleImputer(missing_values=np.nan, copy=True, strategy='most_frequent')
X_numeric = X[numeric_cols]
if X_numeric.shape[1] > 0:
    # Keep X's index so the concat below aligns rows instead of matching on a fresh RangeIndex.
    X_numeric = pd.DataFrame(n_imputer.fit_transform(X_numeric),
                             columns=numeric_cols, index=X.index)

# Label-encode each string column independently, then reassemble the feature matrix
X_string = X_string.apply(LabelEncoder().fit_transform)
X = pd.concat([X_numeric, X_string], axis=1)

In [None]:
# Encoding the data for further use
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; display(X.info()) only appended a stray "None".
X.info()
y.info()

# Finalizing encoded X and Y matrix
Y = y
yle = LabelEncoder()
# LabelEncoder works on a 1-D target, so flatten the single-column frame first.
y_flat = Y.values.ravel()
yle.fit(y_flat)
y_encoded = yle.transform(y_flat)

# get the original labels back
# display(yle.classes_)
# display(yle.inverse_transform(y_encoded))

xle = MultiColumnLabelEncoder()
xle.fit(X)
X_encoded = xle.transform(X)

# View encoded data
display(X_encoded)
display(y_encoded)
display(X_encoded.shape)
display(y_encoded.shape)

In [None]:
# Train/test split on the encoded X and y: a quarter of the rows is held out,
# and the fixed random_state keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.25,
    random_state=55,
)

# Show the training partition (features first, then targets).
display(X_train, y_train)


In [None]:
# instantiate tpot (regression search)
# 'f1' is a classification metric and cannot score a regressor's continuous
# predictions, so a regression scorer is used here instead.
tpot = TPOTRegressor(verbosity=2,
                    random_state=1,
                    scoring='neg_mean_squared_error',
                    periodic_checkpoint_folder="intermediate_results",
                    n_jobs=5,
                    warm_start = False,
                    generations=100,
                    population_size=250,
                    early_stop=5)
times = []
scores = []
winning_pipes = []

# run a single search (raise the range to repeat it)
for x in range(1):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    # one exported script per run, prefixed with the run index
    tpot.export(str(x)+'_tpot_reg.py')

# output results (elapsed seconds converted to minutes)
times = [time/60 for time in times]
print('Times:', times)
print('Scores:', scores)
print('Winning pipelines:', winning_pipes)

In [None]:
# Configure the TPOT classification search
tpot = TPOTClassifier(
    generations=100,
    population_size=250,
    early_stop=5,
    scoring='f1',
    n_jobs=5,
    random_state=1,
    warm_start=False,
    verbosity=2,
    periodic_checkpoint_folder="intermediate_results",
)
times, scores, winning_pipes = [], [], []

# One search run; each run records its duration, best pipeline and test score
for x in range(1):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time

    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))

    # Export the winning pipeline to a script named after the run index
    tpot.export(str(x) + '_tpot_clf.py')

# Report results, with durations converted from seconds to minutes
times = [seconds / 60 for seconds in times]
print('Times:', times)
print('Scores:', scores)
print('Winning pipelines:', winning_pipes)

In [None]:
# Average CV score on the training set was: 1.0
from sklearn.ensemble import RandomForestClassifier

# Estimator with the hyper-parameters of the exported pipeline
exported_pipeline = RandomForestClassifier(
    n_estimators=100,
    criterion="entropy",
    bootstrap=False,
    max_features=0.3,
    min_samples_leaf=13,
    min_samples_split=19,
)
# Pin the random state when the estimator exposes one
if hasattr(exported_pipeline, 'random_state'):
    exported_pipeline.random_state = 1

# Final step of the pipeline fitted by the TPOT search, and its feature importances
exctracted_best_model = tpot.fitted_pipeline_.steps[-1][1]
importances = exctracted_best_model.feature_importances_

# Bar chart of importance per feature column, with rotated tick labels
c = sns.color_palette("muted", 3)[2]
sns.barplot(x=X.columns.values, y=importances, color=c)
mpl.xticks(rotation=90)
mpl.tight_layout()





In [None]:
# Get all sorts of classification score matrices
from sklearn.metrics import accuracy_score, balanced_accuracy_score, cohen_kappa_score, \
    confusion_matrix, classification_report, hamming_loss, average_precision_score, \
    f1_score, fbeta_score, precision_recall_fscore_support,recall_score, precision_score, \
    precision_recall_curve, roc_auc_score, roc_curve, auc

exported_pipeline.fit(X_train, y_train)

# Predict once and reuse the result (the test set was previously predicted twice).
results = exported_pipeline.predict(X_test)
prediction = results

# Positive-class scores for the ranking metrics. They are taken from the same
# estimator as the hard predictions, so all reported metrics describe one model.
try:
    probas_ = exported_pipeline.predict_proba(X_test)[:, 1]
except AttributeError:
    # Estimators without predict_proba: fall back to the decision function.
    probas_ = exported_pipeline.decision_function(X_test)

print("Accuracy Score : " , accuracy_score(y_test, prediction))
print("Balanced Accuracy Score : " , balanced_accuracy_score(y_test, prediction))
print("Cohen's Kappa Score : " , cohen_kappa_score(y_test, prediction))
print("Confusion Matrix Score : " , confusion_matrix(y_test, prediction, normalize='all'))
print("Classification report Score : " , classification_report(y_test, prediction, target_names=yle.classes_))
print("Hammings Loss Score : " , hamming_loss(y_test, prediction))
# Average precision summarises the precision-recall curve, so it is given scores rather than hard labels.
print("Average Precision Score : " , average_precision_score(y_test, probas_))
print("F1 Score : " , f1_score(y_test, prediction))
# beta < 1 weights precision more, beta > 1 weights recall more; beta = 0 counts precision only
print("F-beta Score : " , fbeta_score(y_test, prediction, beta = 0))
print("Precision Recall Fscore Support Score : " , precision_recall_fscore_support(y_test, prediction, beta = 0, average = 'micro')) # average [None (default), ‘binary’, ‘micro’, ‘macro’, ‘samples’, ‘weighted’]
print("Recall Score : " , recall_score(y_test, prediction))
print("Precision Score : " , precision_score(y_test, prediction))

# Score-based metrics (this line previously reused the "Precision Score" label)
print("ROC AUC Score : ", roc_auc_score(y_test, probas_))
print("Precision Recall Curve : ", precision_recall_curve(y_test, probas_))



In [None]:
# Compute the ROC curve and its area for the single (positive) class.
# y_test is a 1-D encoded target and probas_ holds the positive-class scores, so
# both are passed whole; there is no per-class column to slice.
n_classes = 1
fpr = dict()
tpr = dict()
roc_auc = dict()
fpr[0], tpr[0], _ = roc_curve(y_test, probas_)
roc_auc[0] = auc(fpr[0], tpr[0])

# Micro-average ROC curve and area (same inputs as key 0 here, since there is one target)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), probas_.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# matplotlib.pyplot is imported as `mpl` in this notebook (there is no `plt`)
mpl.figure()
lw = 2
mpl.plot(fpr[0], tpr[0], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[0])
# Diagonal reference line from (0, 0) to (1, 1)
mpl.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
mpl.xlim([0.0, 1.0])
mpl.ylim([0.0, 1.05])
mpl.xlabel('False Positive Rate')
mpl.ylabel('True Positive Rate')
mpl.title('Receiver operating characteristic example')
mpl.legend(loc="lower right")
mpl.show()

In [None]:
# Print the text representation of the exported pipeline estimator.
print(exported_pipeline)

In [None]:
# Inspect the shape of the training target.
y_train.shape

In [None]:
# Inspect the (numeric_cols, string_cols) name lists that num_str_cols returns for X.
num_str_cols(X)

In [None]:
# Inspect the columns of X that num_str_cols classifies as string columns.
X[num_str_cols(X)[1]]


In [None]:
# Inspect the label-encoded target values.
y_encoded