In [1]:
import sys
import os
import re
import warnings
import pickle
import datetime as dt
from time import time
from pprint import pprint

import pandas
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import make_moons, make_circles, make_classification, load_iris

if not sys.warnoptions:
    warnings.simplefilter("ignore")



In [2]:
# Load iris data to pandas dataframe

iris = load_iris()
temp = iris.target.tolist()
names = [iris.target_names[i] for i in temp]

iris_df = pandas.DataFrame(data=iris['data'],columns= iris['feature_names'])
iris_df['Plant_name'] = names

iris_df.head()

In [3]:
# Check the target (label) categories:

iris_df['Plant_name'].unique()

<tr>
<td> <img src="setosa.jpg" alt="Drawing" style="width: 250px;"/> </td>
<td> <img src="versicolor.jpg" alt="Drawing" style="width: 250px;"/> </td>
<td> <img src="virginica.jpg" alt="Drawing" style="width: 250px;"/> </td>
</tr>

In [4]:
# feature data types
iris_df.dtypes

In [5]:
# Vectorize the dataframe by converting object(string) columns to integer
def vectorize_dataframe(df, column_index_str):
    
    for column_name, column in df.transpose().iterrows():
        indexed_col_name = '{}{}'.format(column_index_str, column_name)
        if df[column_name].dtype == 'object':
            df[indexed_col_name] = df[column_name].astype('category')
            df[indexed_col_name] = df[indexed_col_name].cat.codes                
        elif is_numeric_dtype(df[column_name]):
            df[indexed_col_name] = df[column_name]
            
    new_df = df.filter(regex='{}*'.format(column_index_str))
    
    return new_df


vect_df = vectorize_dataframe(iris_df, 'indexed_')

In [9]:
print vect_df.dtypes
vect_df.head()

In [10]:
# train, test and predict

def printTable(myDict, colList=None):
    if not colList: 
        colList = list(myDict[0].keys() if myDict else [])
    myList = [colList] # 1st row = header
    for item in myDict: 
        myList.append([str(item[col] or '') for col in colList])
    colSize = [max(map(len,col)) for col in zip(*myList)]
    formatStr = ' | '.join(["{{:<{}}}".format(i) for i in colSize])
    myList.insert(1, ['-' * i for i in colSize]) # Seperating line
    for item in myList: 
        print(formatStr.format(*item))    
    
def test_classifiers_on_dataset(classifiers, X_train, y_train, X_test, y_test):

    # iterate over classifiers
    results=[]
    predictions = {}
    for clf in classifiers:
        
        name = str(clf).split('(')[0]
        
        t0 = time()

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        pred = clf.predict(X_test)
        
        try:
            results.append({'Classifier':name,'Accuracy':round(score, 4),' Time':'{}s'.format(round(time() - t0, 3))})
            predictions[name] = pred
        except:
            pass
        
    printTable(results)
    return predictions


In [11]:
# Split the dataset to features & labels(targets) and after that into training and test sets

labels = vect_df['indexed_Plant_name']
features = vect_df.drop(['indexed_Plant_name'], axis=1)

features_train,features_test,labels_train,labels_test = train_test_split(features, labels, test_size=0.3)

In [13]:
classifiers = [
        KNeighborsClassifier(3),
        SVC(gamma=10, C=10000, kernel="rbf"),
        DecisionTreeClassifier(max_depth=5, min_samples_split=3),
        RandomForestClassifier(max_depth=4, n_estimators=10, max_features = 4), 
        AdaBoostClassifier(algorithm='SAMME', random_state=10, learning_rate=0.3, n_estimators=10),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(),
        MLPClassifier(alpha=0.2),
        ]

predictions = test_classifiers_on_dataset(classifiers, features_train, labels_train, features_test, labels_test)

Classifier                    |  Time  | Accuracy
----------------------------- | ------ | --------
KNeighborsClassifier          | 0.003s | 0.9556  
SVC                           | 0.003s | 0.9556  
DecisionTreeClassifier        | 0.001s | 0.9778  
RandomForestClassifier        | 0.025s | 0.9778  
AdaBoostClassifier            | 0.022s | 0.9778  
GaussianNB                    | 0.001s | 0.9778  
QuadraticDiscriminantAnalysis | 0.001s | 1.0     
MLPClassifier                 | 0.102s | 1.0     


In [18]:
#Prediction review:

clasifier = 'AdaBoostClassifier'

try:
    del review_df
except:
    pass

review_df = features_test
review_df['Plant_name'] = labels_test
review_df['Predictions'] = predictions[clasifier].tolist()
# review_df

In [19]:
# map the numbers back to plant names ...

def map_values(row, values_dict):
    return values_dict[row]

values_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'};

review_df['Plant_name'] = review_df['Plant_name'].apply(map_values, args = (values_dict,))
review_df['Predictions'] = review_df['Predictions'].apply(map_values, args = (values_dict,))
review_df

In [145]:
# Data visualization #1


# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.RdYlBu, edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right', borderpad=0, handletextpad=0)
plt.axis("tight")
plt.show()

In [143]:
# Data visualization #2   (randomly created 2-dimensional dataset)


def test_classifiers_on_3_datasets_with_plots():
    h = .02  # step size in the mesh

    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
             "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
             "Naive Bayes", "QDA"]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()]

    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable
                ]

    figure = plt.figure(figsize=(27, 9))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=.4, random_state=42)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        if ds_cnt == 0:
            ax.set_title("Input data")
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
                   edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_train)
            print 'Dataset: {}, Classifier: {}, Accuracy: {}'.format(ds_cnt, name, score)

            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot also the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                       edgecolors='k')
            # and testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       edgecolors='k', alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if ds_cnt == 0:
                ax.set_title(name)
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1

    plt.tight_layout()
    plt.show()

test_classifiers_on_3_datasets_with_plots()


