    # Automatic Jupyter Notebook for OpenML dataset 1: anneal

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import openml as oml
import numpy as np
import pandas as pd
from sklearn import dummy
from sklearn.model_selection import train_test_split
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter

plt.rcParams['figure.dpi']= 120
plt.rcParams['xtick.labelsize'] = 8
plt.rcParams['ytick.labelsize'] = 8 

from preamble import *
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pymongo import MongoClient

The percentage of unique values for the default target attribute in this data set is 0.0056.
Because this is lower than or equal to 5% of the dataset, we assume that this is a **classification** problem. 

Calculate baseline accuracy for classification problems using scikit-learn DummyClassifier. 

In [None]:
def baseline(data):
    """Score every DummyClassifier strategy on a held-out split of the dataset.

    Parameters
    ----------
    data : OpenML dataset object
        Must expose ``get_data`` and ``default_target_attribute``; the default
        target attribute is used as the label column.

    Returns
    -------
    dict
        Maps each strategy name ('stratified', 'most_frequent', 'prior',
        'uniform') to the value of ``DummyClassifier.score`` on the test split.
    """
    X, y, _ = data.get_data(target=data.default_target_attribute,
                            return_attribute_names=True)
    # Fixed seed: every strategy is evaluated on the same train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def _score(strategy):
        # Fit one dummy model and report its score on the held-out data.
        model = dummy.DummyClassifier(strategy=strategy, random_state=0)
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

    return {name: _score(name)
            for name in ('stratified', 'most_frequent', 'prior', 'uniform')}

Generates a plot of the classification baseline accuracy of the various baseline strategies using scikit-learn DummyClassifier.


In [None]:
def plot_baseline(scores, target_name=None):
    """Draw a bar chart of the baseline scores and return the best one.

    Parameters
    ----------
    scores : dict
        Maps baseline strategy name to its score, as returned by ``baseline``.
    target_name : str, optional
        Name of the predicted feature shown in the plot title. When omitted,
        the notebook-level ``data.default_target_attribute`` is used.

    Returns
    -------
    float
        The highest value in ``scores``; it is also drawn as a dashed red
        horizontal line and added as an extra (red) y tick.

    Raises
    ------
    ValueError
        If ``scores`` is empty.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    if not scores:
        raise ValueError("scores must contain at least one baseline strategy")
    if target_name is None:
        # Fall back to the dataset object living in the notebook's global scope.
        target_name = data.default_target_attribute

    names = list(scores.keys())
    # Materialise the dict view: matplotlib expects a sequence of bar heights.
    values = list(scores.values())
    maxBaseline = max(values)
    positions = list(range(len(names)))

    fig, ax = plt.subplots()
    ax.bar(positions, values, align='center')
    ax.set_xticks(positions)
    ax.set_xticklabels(names)
    # Regular ticks every 0.2 plus one extra tick exactly at the best baseline.
    ax.set_yticks(list(np.arange(0, 1.1, step=0.2)) + [maxBaseline])

    # Positional limits work across matplotlib versions (ymin=/ymax= do not).
    ax.set_ylim(0, 1)
    ax.set_xlabel('Baseline Strategy')
    ax.set_ylabel('Accuracy')
    ax.set_title('Baseline Performance Predicting Feature: ' + target_name)
    ax.axhline(y=maxBaseline, color='r', linestyle='--', label=maxBaseline)
    # The best-baseline tick was appended last, so address it from the end
    # instead of relying on a fixed position.
    ax.get_yticklabels()[-1].set_color('red')
    fig.tight_layout()
    plt.show()
    return maxBaseline

Generates a plot of the accuracy of the machine learning algorithms against the baseline.


In [None]:
def plot_alg(scores, maxBaseline, target_name=None):
    """Draw a bar chart of model scores against the best baseline score.

    Bars whose height exceeds ``maxBaseline`` are coloured green.

    Parameters
    ----------
    scores : dict
        Maps a strategy or algorithm name to its score.
    maxBaseline : float
        Best baseline score; drawn as a dashed red horizontal line and added
        as an extra (red) y tick.
    target_name : str, optional
        Name of the predicted feature shown in the plot title. When omitted,
        the notebook-level ``data.default_target_attribute`` is used.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    if target_name is None:
        # Fall back to the dataset object living in the notebook's global scope.
        target_name = data.default_target_attribute

    names = list(scores.keys())
    # Materialise the dict view: matplotlib expects a sequence of bar heights.
    values = list(scores.values())
    positions = list(range(len(names)))

    fig, ax = plt.subplots()
    barlist = ax.bar(positions, values, align='center')
    ax.set_xticks(positions)
    ax.set_xticklabels(names)
    # Regular ticks every 0.2 plus one extra tick exactly at the best baseline.
    ax.set_yticks(list(np.arange(0, 1.1, step=0.2)) + [maxBaseline])

    # Positional limits work across matplotlib versions (ymin=/ymax= do not).
    ax.set_ylim(0, 1)
    ax.set_xlabel('Machine Learning Algorithm')
    ax.set_ylabel('Accuracy')
    ax.set_title('Algorithm Performance Predicting Feature: ' + target_name)
    ax.axhline(y=maxBaseline, color='r', linestyle='--', label=maxBaseline)
    # The best-baseline tick was appended last, so address it from the end
    # instead of relying on a fixed position.
    ax.get_yticklabels()[-1].set_color('red')
    for bar in barlist:
        # Highlight everything that beats the best baseline.
        if bar.get_height() > maxBaseline:
            bar.set_facecolor('g')
    fig.tight_layout()
    plt.show()

Build Random Forest model from the dataset and compute important features. 

In [None]:
def build_forest(data):
    """Fit a random forest on the dataset and rank its features by importance.

    Missing values are replaced by the column mean before the forest is fit.

    Parameters
    ----------
    data : OpenML dataset object
        Must expose ``get_data``, ``default_target_attribute`` and ``name``.

    Returns
    -------
    tuple
        ``(data.name, features, importances, indices)`` where ``importances``
        is the fitted forest's ``feature_importances_`` and ``indices`` sorts
        it from most to least important.
    """
    X, y, features = data.get_data(target=data.default_target_attribute,
                                   return_attribute_names=True)
    try:
        # preprocessing.Imputer no longer exists in recent scikit-learn
        # releases; SimpleImputer is its replacement.
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    except ImportError:
        # Older scikit-learn releases only ship the legacy Imputer.
        imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)

    forest = Pipeline([('Imputer', imputer),
                       ('classifiers', RandomForestClassifier(n_estimators=100, random_state=0))])
    forest.fit(X, y)

    # Look the classifier up by step name rather than by position.
    importances = forest.named_steps['classifiers'].feature_importances_
    # argsort is ascending; reverse it to get most-important-first.
    indices = np.argsort(importances)[::-1]
    return data.name, features, importances, indices

Plot Top-20 important features for the dataset. 

In [None]:
def plot_feature_importances(features, importances, indices, max_features=20, title=None):
    """Plot the most important features as a horizontal bar chart.

    Parameters
    ----------
    features : sequence of str
        Feature names; indexed with the same positions as ``importances``.
    importances : numpy.ndarray
        Importance score per feature.
    indices : numpy.ndarray
        Positions into ``importances`` ordered from most to least important.
    max_features : int, optional
        Upper bound on the number of bars drawn (default 20).
    title : str, optional
        Figure title. When omitted, the notebook-level ``dataset_name`` is used.
    """
    a = 0.8  # alpha shared by bars, title and labels
    if title is None:
        # Fall back to the name set in the notebook's global scope.
        title = dataset_name

    # Keep only the leading (most important) entries of the ranking.
    n_shown = max(0, min(len(features), max_features))
    top = np.asarray(indices)[:n_shown]

    # Create a figure of given size
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(111)

    # Reversed so that the most important feature is the last row plotted.
    df = pd.DataFrame(importances[top][::-1])
    df.plot(kind='barh', ax=ax, alpha=a, legend=False, edgecolor='w',
            title=title, color = [plt.cm.viridis(np.arange(len(df))*10)])

    # Remove grid lines and plot frame
    ax.grid(False)
    ax.set_frame_on(False)

    # Customize title
    ax.set_title(ax.get_title(), fontsize=14, alpha=a, ha='left', x=0, y=1.0)
    plt.subplots_adjust(top=0.9)

    # Customize x tick labels
    ax.xaxis.set_major_formatter(FormatStrFormatter('%.3f'))
    ax.locator_params(axis='x', tight=True, nbins=5)

    # Customize y tick labels (same order as the plotted rows)
    yticks = np.array(features)[top][::-1]
    ax.set_yticklabels(yticks, fontsize=8, alpha=a)
    ax.yaxis.set_tick_params(pad=2)
    ax.yaxis.set_ticks_position('none')
    ax.set_ylim(ax.get_ylim()[0]-0.5, ax.get_ylim()[1]+0.5)

    # Set x axis text
    ax.set_xlabel('Feature importance', fontsize=10, alpha=a)
    ax.xaxis.set_label_coords(0.5, -0.1)

    # Set y axis text
    ax.set_ylabel('Feature', fontsize=10, alpha=a)
    plt.show()

Choose the desired dataset and generate the feature importance plot. 

In [None]:
# OpenML dataset ID to analyse (1 is "anneal", per the notebook title).
dataset = 1

In [None]:
# Fetch the OpenML dataset selected by `dataset`; `data` is read by later cells.
data = oml.datasets.get_dataset(dataset)
# Fit the forest and rank the features; `dataset_name` is read as a global
# by plot_feature_importances for the figure title.
dataset_name, features, importances, indices = build_forest(data)
plot_feature_importances(features, importances, indices)

Plot of the classification baseline accuracy of the various baseline strategies using scikit-learn DummyClassifier.

The target feature is: **class**

The following baseline strategies are used: stratified, most_frequent, prior, uniform.

The strategies work as follows, according to the scikit-learn API:

- **stratified**: Generates predictions by respecting the training set’s class distribution.

- **most_frequent**: Always predicts the most frequent label in the training set. Also known as ZeroR.

- **prior**: Always predicts the class that maximizes the class prior. 

- **uniform**: Generates predictions uniformly at random.

The horizontal red dashed line denotes the baseline value for this dataset, which is equal to the score of the best-performing baseline strategy.

[More information.](http://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)


In [None]:
# Score the dummy baselines, plot them, and keep the best score; it is the
# reference line in the final algorithm comparison plot.
maxBaseline = plot_baseline(baseline(data))  

Runs the decision tree algorithm on the dataset WIP


In [None]:
# Runs the decision tree algorithm on the dataset
from sklearn import tree

# Running default values, it is recommended to experiment with the values of the
# parameters below. Try min_samples_leaf=5.
# random_state is pinned (like the train/test split) so reruns give the same tree.
clf = tree.DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, max_features=None,
                                  max_leaf_nodes=None, random_state=0)
X, y, features = data.get_data(target=data.default_target_attribute, return_attribute_names=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Start the results table with the baseline scores unconditionally: the later
# algorithm cells and the final plot all use `strats`, even when this cell
# skips its own fit.
strats = baseline(data)

# NOTE(review): unlike build_forest, no imputation is applied here; confirm the
# estimator accepts this dataset's missing values.
p = len(features)
n = len(X_train)
# computational complexity O(n^2 * p)
complexity = n**2 * p

if complexity <= 50000000000000:
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    strats['Decision Tree'] = acc
else:
    print("computation complexity too high, please run manually if desired.")

Runs the multinomial naive bayes algorithm on the dataset WIP 

In [None]:
# Multinomial Naive Bayes, evaluated on the same seeded train/test split.
from sklearn.naive_bayes import MultinomialNB

X, y, features = data.get_data(target=data.default_target_attribute, return_attribute_names=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# All hyper-parameters are left at their defaults; experimenting with them is recommended.
# NOTE(review): MultinomialNB presumably needs non-negative, non-missing
# features - confirm that the dataset satisfies this.
clf = MultinomialNB()

# Rough cost estimate, O(n * p): n training rows, p attribute names.
n = len(X_train)
p = len(features)
complexity = n * p

if complexity > 50000000000000:
    print("computation complexity too high, please run manually if desired.")
else:
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    # Appends to the results dict created by the decision tree cell.
    strats['naive bayes'] = acc

Runs the random forest algorithm on the dataset WIP 

In [None]:
# Runs the Random Forest algorithm on the dataset
from sklearn.ensemble import RandomForestClassifier
import math

# Running default values, it is recommended to experiment with the values of the parameters below.
# random_state is pinned (like the train/test split) so reruns give the same accuracy.
clf = RandomForestClassifier(random_state=0)
X, y, features = data.get_data(target=data.default_target_attribute, return_attribute_names=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

p = len(features)
n = len(X_train)

# Take the tree count from the estimator instead of hard-coding it: the default
# is not the same in every scikit-learn release.
n_trees = clf.n_estimators
if not isinstance(n_trees, int):
    # Some releases expose a non-numeric placeholder for the default; fall back
    # to the value this estimate originally assumed.
    n_trees = 10

# computational complexity O(n^2 * sqrt(p * n_trees))
complexity = n**2 * math.sqrt(p * n_trees)

if complexity <= 50000000000000:
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    # Appends to the results dict created by the decision tree cell.
    strats['random forest'] = acc
else:
    print("computation complexity too high, please run manually if desired.")

Runs the classification support vector algorithm on the dataset WIP


In [None]:
# Support Vector Classifier, evaluated on the same seeded train/test split.
from sklearn import svm

X, y, features = data.get_data(target=data.default_target_attribute, return_attribute_names=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# All hyper-parameters are left at their defaults; experimenting with them is recommended.
clf = svm.SVC()

# Rough cost estimate, O(n^2 * p + n^3): n training rows, p attribute names.
n = len(X_train)
p = len(features)
complexity = n**2 * (p + n)

if complexity > 50000000000000:
    print("computation complexity too high, please run manually if desired.")
else:
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    # Appends to the results dict created by the decision tree cell.
    strats['support vector machine'] = acc

Plot the accuracy of various machine learning algorithms against the baseline. 

In [None]:
# Compare every score collected in `strats` (baselines plus the algorithms that
# were run) against the best baseline score.
plot_alg(strats, maxBaseline) 