In [73]:
#traditional stuff
%matplotlib inline 
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Allow up to 500 characters per line before pandas wraps a printed frame.
pd.set_option('display.width', 500)
# Show up to 100 columns before pandas truncates the display.
pd.set_option('display.max_columns', 100)
# Render DataFrames as HTML tables when displayed in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Global plot styling: "whitegrid" style with the "poster" sizing context.
sns.set_style("whitegrid")
sns.set_context("poster")

Functions that we generally need for cross-validation and plotting...

In [112]:
try:
    # scikit-learn >= 0.18 exposes both helpers from model_selection; the
    # cross_validation and grid_search modules were removed in 0.20.
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:
    # Fall back to the original module locations on older releases.
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV
def cv_optimise(clf, Xtrain, Ytrain, params, n_folds=5):
    """Grid-search ``params`` for ``clf`` and return the winning estimator.

    Parameters
    ----------
    clf : estimator passed to ``GridSearchCV`` as the base model.
    Xtrain, Ytrain : training features and labels the search is fitted on.
    params : parameter grid, forwarded as ``param_grid``.
    n_folds : forwarded as ``cv`` (default 5).

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best parameter
    combination is printed as a side effect.
    """
    search = GridSearchCV(clf, param_grid=params, cv=n_folds)
    search.fit(Xtrain, Ytrain)
    print("BEST PARAMS", search.best_params_)
    return search.best_estimator_
def do_classify(clf, params, df, features, target_name, target_value, mask=None, standardise=False,
                train_size=0.8):
    """Tune ``clf`` by grid search on a train split and report train/test accuracy.

    Parameters
    ----------
    clf : estimator handed to ``cv_optimise``.
    params : parameter grid handed to ``cv_optimise``.
    df : DataFrame holding the feature columns and the target column.
    features : column names used to build the design matrix ``X``.
    target_name : column of ``df`` from which the binary label is derived.
    target_value : rows where ``df[target_name] == target_value`` get label 1,
        every other row gets label 0.
    mask : accepted for interface compatibility; currently not used.
    standardise : when true, every feature column is rescaled to
        ``(x - mean) / std`` before the split.
    train_size : forwarded to ``train_test_split``.

    Returns
    -------
    tuple ``(clf, Xtrain, ytrain, Xtest, ytest)`` where ``clf`` is the tuned
    estimator fitted on the training split.
    """
    subdf = df[features]
    # Standardisation and normalisation are two different ways to bring the data to scale.
    if standardise:
        # NOTE(review): mean/std are computed over all rows, so the held-out rows
        # influence the scaling applied to the training rows.
        subdf = (subdf - subdf.mean()) / subdf.std()
    X = subdf.values

    # Read the label from the untouched frame: the target column need not be one
    # of `features`, and after standardisation its values would no longer equal
    # `target_value`.
    y = (df[target_name].values == target_value) * 1

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)
    clf = cv_optimise(clf, Xtrain, ytrain, params)
    # Fit the selected estimator on the training split only.
    clf = clf.fit(Xtrain, ytrain)

    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data:     %0.2f" % (test_accuracy))
    return clf, Xtrain, ytrain, Xtest, ytest

In [113]:
from matplotlib.colors import ListedColormap
# Pale red/green/blue map; default `colorscale` of the plotting helpers below.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# Saturated red/green/blue map; default `cdiscrete` of the plotting helpers below.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# NOTE: this rebinds `cm`, shadowing the `import matplotlib.cm as cm` module alias
# from the first cell; from here on `cm` is the RdBu colormap (default `ccolor`
# of points_plot_proba), not the module.
cm = plt.cm.RdBu
# Two-colour red/blue map; not referenced by the helpers visible in this notebook section.
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

def points_plot(ax, Xtrain, Xtest, Ytrain, Ytest, clf, mesh=True, colorscale=cmap_light,
                cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
    """Scatter train/test points on ``ax``, optionally over the classifier's decision regions.

    Only the first two columns of ``Xtrain``/``Xtest`` are plotted, and the
    evaluation grid has exactly two columns, so ``clf`` must accept
    two-feature input.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    Xtrain, Xtest : 2-D arrays of points; column 0 is the x axis, column 1 the y axis.
    Ytrain, Ytest : label arrays used to colour the points when ``predicted`` is false.
    clf : fitted classifier providing ``predict`` (and ``predict_proba`` when
        ``zfunc`` is given).
    mesh : when true, shade the grid with the classifier output.
    colorscale : colormap for the shaded mesh.
    cdiscrete : colormap for the scattered points.
    alpha : transparency applied to both the mesh and the points.
    psize : marker area for training points; test points use ``psize + 10``.
    zfunc : optional callable ``zfunc(p0, p1)`` mapping the two class-probability
        columns to the value shaded on the mesh; when falsy, ``clf.predict`` is used.
    predicted : when true, colour points by ``clf.predict`` instead of the given labels.

    Returns
    -------
    tuple ``(ax, xx, yy)`` where ``xx``/``yy`` are the 100x100 meshgrid coordinates.
    """
    # Stack both splits so the plot limits cover every point.
    X = np.concatenate((Xtrain, Xtest))
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    # Evaluation grid spanning the padded data range.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))

    if mesh:
        # Grid predictions are only needed for the shaded background.
        grid = np.c_[xx.ravel(), yy.ravel()]
        if zfunc:
            proba = clf.predict_proba(grid)
            z = zfunc(proba[:, 0], proba[:, 1])
        else:
            z = clf.predict(grid)
        ZZ = z.reshape(xx.shape)
        # Draw on the supplied Axes (not the pyplot "current" one) with the caller's colormap.
        ax.pcolormesh(xx, yy, ZZ, cmap=colorscale, alpha=alpha)

    if predicted:
        # Colour points by what the classifier predicts for them.
        showtr = clf.predict(Xtrain)
        showte = clf.predict(Xtest)
    else:
        # Colour points by their true labels.
        showtr = Ytrain
        showte = Ytest

    # `s` is the marker area, `c` the per-point colour values; test points are drawn as squares.
    ax.scatter(Xtrain[:, 0], Xtrain[:, 1], c=showtr - 1, cmap=cdiscrete, s=psize, alpha=alpha,
               edgecolor="k")
    ax.scatter(Xtest[:, 0], Xtest[:, 1], c=showte - 1, cmap=cdiscrete, alpha=alpha, marker="s",
               s=psize + 10)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax, xx, yy

def points_plot_proba(ax, Xtrain, Xtest, Ytrain, Ytest, clf, colorscale=cmap_light,
                      cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
    """Scatter the points via ``points_plot`` and overlay class-probability contours.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    Xtrain, Xtest, Ytrain, Ytest : forwarded unchanged to ``points_plot``.
    clf : fitted classifier providing ``predict_proba``.
    colorscale, cdiscrete, psize, alpha : forwarded to ``points_plot``.
    ccolor : colormap used for the filled and line contours.

    Returns
    -------
    The Axes that was drawn on.
    """
    # Scatter only (mesh=False) and reuse the grid coordinates for the contours.
    ax, xx, yy = points_plot(ax, Xtrain, Xtest, Ytrain, Ytest, clf, mesh=False, colorscale=colorscale,
                             cdiscrete=cdiscrete, alpha=alpha, psize=psize)
    # predict_proba yields one column per class; contour the second column so
    # the result has one value per grid point and can take the grid's shape.
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt='%2.1f', colors='k', fontsize=14)
    return ax

### Bayes Theorem 

Bayes' theorem can be read simply as:

**The posterior probability is directly proportional to the product of the likelihood and the prior.** The posterior is the probability once we have the data, and the prior is the probability we estimate beforehand (using distributions such as the Beta or Gamma distribution).

For more info - "Think Bayes" is a very nice book!

In [146]:
data = [
    (27450, 27450, 29420, "10/10/2016"),
    (29420, 36142, 29420, "10/10/2016"),
    (11, 11, 27450, "10/10/2016"),
]

# Create DataFrame base
df = pd.DataFrame(data, columns=("User_id", "Actor1", "Actor2", "Time"))

# Col1 is 1 when the row's User_id occurs anywhere in the Actor1 column, else 0.
# NOTE(review): this is a whole-column membership test, not a same-row
# comparison of User_id with Actor1 -- confirm that is the intent.
user_in_actor1 = df["User_id"].isin(df["Actor1"])
df["Col1"] = user_in_actor1.astype(int)

# Col2 takes Actor2 where Col1 is 1 and Actor1 otherwise. Vectorised to avoid
# a Python-level loop with one df.iloc row lookup per record.
df["Col2"] = df["Actor2"].where(user_in_actor1, df["Actor1"])

In [148]:
# Show the frame, including the derived Col1 and Col2 columns.
print (df)

   User_id  Actor1  Actor2        Time  Col1   Col2
0    27450   27450   29420  10/10/2016     1  29420
1    29420   36142   29420  10/10/2016     0  36142
2       11      11   27450  10/10/2016     1  27450
