In [1]:
#traditional stuff
%matplotlib inline 
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# DataFrame display preferences, set through the attribute-style options API:
# a 500-character-wide text repr, up to 100 columns shown, HTML tables in the notebook.
pd.options.display.width = 500
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True
import seaborn as sns
# Global seaborn appearance for every figure drawn after this cell:
# the "whitegrid" style and the "poster" plotting context.
sns.set_style("whitegrid")
sns.set_context("poster")

Functions that we generally need for cross-validation and plotting...

In [4]:
try:
    # scikit-learn >= 0.18 exposes the splitting and search utilities here;
    # the legacy modules below were removed in 0.20.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Fallback for old scikit-learn installs that predate model_selection.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
def cv_optimise(clf, Xtrain, Ytrain, params, n_folds = 5):
    """Grid-search ``params`` for ``clf`` and return the best estimator found.

    A ``GridSearchCV`` is built with ``param_grid=params`` and ``cv=n_folds``,
    fitted on ``(Xtrain, Ytrain)``, and the winning parameter combination is
    printed before the corresponding estimator is returned.
    """
    search = GridSearchCV(clf, param_grid=params, cv=n_folds)
    search.fit(Xtrain, Ytrain)
    print ("BEST PARAMS", search.best_params_)
    return search.best_estimator_
def do_classify(clf, params, df, features, target_name, target_value, mask = None, standardise = False,
                train_size = 0.8, random_state = None):
    """Tune ``clf`` by grid search on a training split and report train/test accuracy.

    Parameters
    ----------
    clf : estimator handed to ``cv_optimise`` together with ``params``.
    params : parameter grid forwarded to ``cv_optimise``.
    df : DataFrame containing the feature columns and the target column.
    features : list of column names of ``df`` used to build the design matrix.
    target_name : column of ``df`` that holds the raw labels.
    target_value : label value encoded as class 1; every other value becomes 0.
    mask : optional boolean array-like with one entry per row of ``df``.
        Rows where it is True form the training set and the remaining rows the
        test set. When None (default) the rows are split at random.
    standardise : if True, every feature column has its mean subtracted and is
        divided by its standard deviation before fitting.
    train_size : forwarded to ``train_test_split``; unused when ``mask`` is given.
    random_state : forwarded to ``train_test_split`` so a random split can be
        reproduced; unused when ``mask`` is given.

    Returns
    -------
    tuple
        ``(clf, Xtrain, ytrain, Xtest, ytest)`` where ``clf`` is the tuned,
        fitted estimator.
    """
    feature_df = df[features]
    # Standardisation and normalisation are two different ways to bring data to
    # a common scale; only standardisation is offered here.
    if standardise:
        feature_df = (feature_df - feature_df.mean()) / feature_df.std()
    X = feature_df.values
    # Read labels from the original frame: the target does not have to be one
    # of `features`, and a standardised column would never equal target_value.
    y = (df[target_name].values == target_value) * 1

    if mask is not None:
        train_rows = np.asarray(mask, dtype=bool)
        Xtrain, Xtest = X[train_rows], X[~train_rows]
        ytrain, ytest = y[train_rows], y[~train_rows]
    else:
        Xtrain, Xtest, ytrain, ytest = train_test_split(
            X, y, train_size=train_size, random_state=random_state)

    clf = cv_optimise(clf, Xtrain, ytrain, params)
    # Refit the selected estimator on the full training split before scoring.
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    testing_accuracy = clf.score(Xtest, ytest)
    print ("Accuracy on training data: %0.2f" % (training_accuracy))
    print ("Accuracy on test data:     %0.2f" % (testing_accuracy))
    return clf, Xtrain, ytrain, Xtest, ytest

In [5]:
from matplotlib.colors import ListedColormap
# Pastel red / green / blue map; default background colormap of points_plot.
cmap_light = ListedColormap([
    '#FFAAAA',
    '#AAFFAA',
    '#AAAAFF',
])
# Saturated red / green / blue map; default point colormap of points_plot.
cmap_bold = ListedColormap([
    '#FF0000',
    '#00FF00',
    '#0000FF',
])
# NOTE(review): this rebinds `cm`, which the first cell imported as the
# matplotlib.cm module; from here on `cm` is the RdBu colormap.
cm = plt.cm.RdBu
# Two-colour (red, blue) map.
cm_bright = ListedColormap([
    '#FF0000',
    '#0000FF',
])

def points_plot(ax, Xtrain, Xtest, Ytrain, Ytest, mesh = True, Truecolorscale=cmap_light,
                cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False, clf=None):
    """Shade a classifier's output over a 2-D feature grid and scatter the training points.

    Parameters
    ----------
    ax : matplotlib Axes that receives the mesh and the scatter.
    Xtrain, Xtest : 2-D arrays; columns 0 and 1 are the plotted features and
        together determine the extent of the grid.
    Ytrain, Ytest : labels used to colour the points when ``predicted`` is False.
    mesh : if True, draw the classifier output on the grid as a background mesh.
    Truecolorscale : colormap for the background mesh.
    cdiscrete : colormap for the scattered points.
    alpha : opacity applied to both the mesh and the points.
    psize : marker size of the scattered points.
    zfunc : optional callable ``zfunc(p0, p1)``; when given, the background is
        its value on the two ``predict_proba`` columns instead of ``predict``.
    predicted : if True, colour points by the classifier's predictions rather
        than by the supplied labels.
    clf : fitted classifier. When omitted, a notebook-level variable named
        ``clf`` is used, which is how earlier versions obtained the classifier.
    """
    if clf is None:
        clf = globals().get("clf")
    if clf is None:
        raise NameError("points_plot needs a fitted classifier: pass clf=... "
                        "or define a global named 'clf'")

    # Stack both splits so the grid spans every point (concatenate takes a
    # sequence of arrays; a second positional argument would be the axis).
    X = np.concatenate((Xtrain, Xtest))
    # Extent of the first and second feature, padded by 0.5 on each side.
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    # 100 x 100 grid of points at which the classifier is evaluated.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    if zfunc:
        proba = clf.predict_proba(grid_points)
        z = zfunc(proba[:, 0], proba[:, 1])
    else:
        z = clf.predict(grid_points)
    ZZ = z.reshape(xx.shape)
    if mesh:
        # Draw on the requested axes rather than on whichever axes is current.
        ax.pcolormesh(xx, yy, ZZ, cmap = Truecolorscale, alpha = alpha)
    if predicted:
        # Show the classifier's predictions on the map.
        showtr = clf.predict(Xtrain)
        showte = clf.predict(Xtest)
    else:
        # Show the true labels.
        showtr = Ytrain
        showte = Ytest
    ax.scatter(Xtrain[:, 0], Xtrain[:, 1], c=showtr-1, cmap=cdiscrete, s=psize, alpha=alpha,edgecolor="k")