In [None]:
%pylab inline
%precision 6
%load_ext line_profiler
%load_ext autoreload
%autoreload 1

In [None]:
import pandas as pd
import sklearn as skl
import sklearn

In [None]:
from common.feature_transformations import get_one_hot_encoding, get_probability_encoding
from common.visualize.method import print_decision_tree
from common.classes.Struct import Struct
from common.functions import all_nums, vec, row
from pdb import set_trace as bp

In [None]:
# Notebook-wide display settings: wide pandas tables, wide numpy rows, default figure size.
for option_name, option_value in [
    ('display.max_colwidth', 100),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
]:
    pd.set_option(option_name, option_value)
np.set_printoptions(linewidth=140, edgeitems=10)
rcParams['figure.figsize'] = (8.0, 5.0)

# Pandas demo

In [None]:
# Small labelled 2x2 frame used to demonstrate pandas indexing in the next cells.
demo_values = array([[1, 2], [3, 4]])
A = pd.DataFrame(
    demo_values,
    index=['First row', 'Second row'],
    columns=['feature1', 'feature2'],
)

In [None]:
# Row labels of A.
A.index

In [None]:
# Column labels of A.
A.columns

In [None]:
# Select a single row by its index label.
A.loc['Second row']

In [None]:
# Preview the first rows of A.
A.head()

In [None]:
# Attribute-style access to the column named 'feature2' (same as A['feature2']).
A.feature2

## Global workflow:

1. load data
1. explore data
1. clean data
    * remove constant features
    * remove features with low variation
    * remove irrelevant features
    * remove irrelevant objects (e.g. observations that are too old or follow a different model)
1. deal with missing values
    * remove?
    * fill with mean/median/mode?
    * predict using other features?  
1. filter outliers 
1. transform features
    * discrete features to one-hot-encoding or probability encoding
    * scaling 
    * non-linear transform (e.g. log)
    * make features as functions of other features
1. for each model class find optimal model parameters on cross-validation
1. compare models (each with its best parameters)
1. select best model and refit it using all available data
1. apply best model to new data

# Feature description

In [None]:
# Column names for the credit-approval file: fourteen inputs A1..A14 followed by the target y.
columns = ['A%d' % k for k in range(1, 15)] + ['y']
# The file is read with a single-space separator; the names above are supplied as the header.
Z = pd.read_csv(
    r'credit_approval.dat',
    names=columns,
    skipinitialspace=True,
    sep=' ',
)
Z.head()

In [None]:
# Distinct target values in y together with the number of rows carrying each value.
unique(Z.y,return_counts=1)

In [None]:
# Total number of missing cells in the whole table (summed over columns, then over the column totals).
Z.isnull().sum().sum()

In [None]:
# all_nums is a project helper from common.functions; presumably it reports whether
# every column of Z is numeric — TODO confirm against its definition.
all_nums(Z)

## Data preparation

### Dealing with discrete features

In [None]:
# F is a namespace that collects lists of feature names as the notebook derives them.
F = Struct()
# Every column of the loaded table except the target is an original input feature.
original_names = Z.columns.tolist()
original_names.remove('y')
F.original = original_names

In [None]:
# Display the original input feature names (all columns of Z except 'y').
F.original

In [None]:
# Manual split of the original inputs into continuous and discrete groups.
# (The attribute spelling "continious" is kept because later cells refer to it.)
F.continious = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14']
F.discrete = ['A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12']

In [None]:
# NOTE(review): this cell repeats the "Pandas demo" frame built at the top of the notebook;
# it rebinds A to an identical 2x2 frame and does not touch Z or F.
demo_values = array([[1, 2], [3, 4]])
A = pd.DataFrame(
    demo_values,
    index=['First row', 'Second row'],
    columns=['feature1', 'feature2'],
)

In [None]:
# Row labels of the demo frame A.
A.index

In [None]:
# Column labels of the demo frame A.
A.columns

In [None]:
# Label-based selection of one row of the demo frame A.
A.loc['Second row']

In [None]:
# Build one-hot columns for every discrete feature, append them to Z and remember
# their names in F.discrete_one_hot.
# get_one_hot_encoding is a project helper; it is used here as returning a DataFrame
# whose columns are the new indicator features.
F.discrete_one_hot = []
encoded_frames = []
for col in F.discrete:
    print('Making one-hot-encoding of %s'%col)
    feature_one_hot = get_one_hot_encoding(Z[col])
    encoded_frames.append(feature_one_hot)
    F.discrete_one_hot += list(feature_one_hot.columns)
# Concatenate once after the loop instead of re-copying the growing frame on every
# iteration; the resulting column order is the same as with per-iteration concat.
Z = pd.concat([Z] + encoded_frames, axis=1)

In [None]:
# Peek at the first ten generated one-hot column names.
F.discrete_one_hot[:10]

In [None]:
# Build probability-encoded columns for every discrete feature, append them to Z and
# remember their names in F.discrete_prob.
# NOTE(review): the encoding is computed from Z.y over ALL rows, before the train/test
# split further down. If these columns are later used as model inputs, test labels
# would leak into the features — confirm against get_probability_encoding and consider
# fitting the encoding on the training rows only.
F.discrete_prob = []
encoded_frames = []
for col in F.discrete:
    print('Making probability-encoding of %s'%col)
    feature_prob = get_probability_encoding(Z[col], Z.y)
    encoded_frames.append(feature_prob)
    F.discrete_prob += list(feature_prob.columns)
# Concatenate once after the loop instead of re-copying the growing frame on every
# iteration; the resulting column order is the same as with per-iteration concat.
Z = pd.concat([Z] + encoded_frames, axis=1)

In [None]:
# Peek at the first ten generated probability-encoding column names.
F.discrete_prob[:10]

In [None]:
# Standardize each continuous feature (subtract its mean, divide by its standard deviation)
# into a new '<name>_norm' column and record the new column names.
F.continious_norm = []
for col in F.continious:
    norm_name = col + '_norm'
    values = Z[col]
    Z[norm_name] = (values - mean(values)) / std(values)
    F.continious_norm.append(norm_name)
    print('Normalized %s' % col)

In [None]:
# NOTE(review): this cell is an exact repeat of the previous one. It resets
# F.continious_norm and overwrites the same '<name>_norm' columns with the same values,
# so running it is redundant but harmless.
F.continious_norm = []
for col in F.continious:
    # Standardize: subtract the column mean and divide by the column standard deviation.
    Z[col+'_norm'] = (Z[col]-mean(Z[col]))/std(Z[col])
    F.continious_norm.append(col+'_norm')
    print('Normalized %s'%col)

In [None]:
# Min-max scale each continuous feature into a new '<name>_norm_range' column and
# record the new column names in F.continious_norm_range.
# Fix: the names were previously appended to F.continious_norm (as '<name>_norm'),
# which left F.continious_norm_range empty and duplicated entries in F.continious_norm.
F.continious_norm_range = []
for col in F.continious:
    col_min, col_max = min(Z[col]), max(Z[col])
    # A constant column would make the denominator zero; the workflow above assumes
    # constant features were already removed.
    Z[col+'_norm_range'] = (Z[col]-col_min)/(col_max-col_min)
    F.continious_norm_range.append(col+'_norm_range')
    print('Range-normalized %s'%col)

## Converting types to more convenient ones

In [None]:
# Cast every non-target column to float32 and the target to int.
features = Z.columns.tolist()
features.remove('y')

Z[features] = Z[features].astype(float32)
Z['y'] = Z['y'].astype(int)

## Splitting into train/test sets

In [None]:
# Number of rows (observations) in the prepared table.
len(Z)

In [None]:
# Reproducible random 60/40 split of row positions into train and test parts.
N = len(Z)
# Re-label rows 0..N-1 so the shuffled positions below can be used with Z.loc.
Z.index = arange(N)

# Shuffle the row positions in place with a fixed seed.
np.random.seed(0)
inds = arange(N)
np.random.shuffle(inds)

# The first 60% of the shuffled positions form the training set, the rest the test set.
N1 = int(0.6*N)
train_inds, test_inds = inds[:N1], inds[N1:]

In [None]:
# Boolean-mask form of the same split: one flag per row, True where the row belongs
# to the respective part.
train_sels = zeros(N, dtype=bool)
train_sels[train_inds] = True

test_sels = zeros(N, dtype=bool)
test_sels[test_inds] = True

### Show decision tree

In [None]:
# Use only the raw continuous features as model inputs for the demonstrations below.
features = F.continious

# Materialize the train and test rows once, then take inputs and target as numpy arrays.
train_rows = Z.loc[train_inds]
test_rows = Z.loc[test_inds]

X_train, Y_train = train_rows[features].values, train_rows['y'].values
X_test, Y_test = test_rows[features].values, test_rows['y'].values

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-1 decision tree: a single split on one feature.
clf = DecisionTreeClassifier(max_depth=1)

In [None]:
# Fit the tree on the training split.
clf.fit(X_train,Y_train)

In [None]:
# Predicted class labels for the first 15 test rows.
clf.predict(X_test[:15])

In [None]:
# Predicted class probabilities for the same 15 rows; columns follow the order of clf.classes_.
clf.predict_proba(X_test[:15])

In [None]:
# Class labels seen during fit, in the order used by the predict_proba columns.
clf.classes_

In [None]:
# Importance score of each input feature in the fitted tree (one entry per column of X_train).
clf.feature_importances_ 

In [None]:
# Name of the feature with the highest importance score.
features[argmax(clf.feature_importances_ )]

### Plotting decision trees

In [None]:
# Render the fitted tree with the project helper print_decision_tree.
# NOTE(review): class_names assumes the classes are ordered reject, approve (i.e. y=0 is
# reject and y=1 is approve) — confirm against clf.classes_.
print_decision_tree(clf, feature_names=F.continious, class_names=['reject','approve'])

In [None]:
# Same demonstration with a deeper tree (up to three levels of splits).
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train,Y_train)

# Render the deeper tree; feature and class names are passed as in the depth-1 example.
print_decision_tree(clf, feature_names=F.continious, class_names=['reject','approve'])

In [None]:
# Write the fitted tree to "decision_tree.dot" in GraphViz format.
# StringIO is not used by this cell; it is imported from the standard library because
# sklearn.externals.six is no longer shipped with current scikit-learn releases.
from io import StringIO
# Imported explicitly so the cell does not depend on `skl.tree` having been loaded
# as a side effect of an earlier import.
from sklearn.tree import export_graphviz

with open("decision_tree.dot", 'w') as dot_file:
    # The call writes into dot_file; its return value is not needed, so the file handle
    # is no longer rebound inside the with-block.
    export_graphviz(clf, out_file=dot_file, feature_names=F.continious,
                    class_names=['reject','approve'],
                    label='none',
                    filled=True,
                    impurity=False,
                    rounded=True)

In [None]:
# Change to the directory where "decision_tree.dot" was saved, then run
# "dot -Tpdf decision_tree.dot -o decision_tree.pdf" (requires the GraphViz `dot` tool to be installed).

In [None]:
# Refit a depth-3 tree and predict labels for the whole test split.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train,Y_train)
Y_hat = clf.predict(X_test)

## K-nearest neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier with the library's default settings.
clf = KNeighborsClassifier()

In [None]:
# Fit the K-NN classifier on the training split.
clf.fit(X_train,Y_train)

In [None]:
# Predicted class labels for the first 10 test rows.
clf.predict(X_test[:10])

In [None]:
# Predicted class probabilities for the first 5 test rows; columns follow clf.classes_.
clf.predict_proba(X_test[:5])

In [None]:
# Class labels known to the fitted K-NN classifier.
clf.classes_

### Specific methods

In [None]:
# Indices of the nearest training points for the first training row
# (return_distance=0 asks for indices only, without distances).
# row() is a project helper; presumably it reshapes the vector into a 1 x d array — TODO confirm.
clf.kneighbors(row(X_train[0]), return_distance=0)

In [None]:
# Same query for the first two training rows at once: one row of neighbour indices per query row.
clf.kneighbors(X_train[:2], return_distance=0)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression; C is the inverse regularization strength (smaller C = stronger regularization).
clf = LogisticRegression(C=1)

In [None]:
# Fit the logistic regression on the training split.
clf.fit(X_train,Y_train)

In [None]:
# Predicted class labels for the first 5 test rows.
clf.predict(X_test[:5])

In [None]:
# Predicted class probabilities for the first 5 test rows; columns follow clf.classes_.
clf.predict_proba(X_test[:5])

In [None]:
# Class labels known to the fitted logistic regression.
clf.classes_

### Specific methods

In [None]:
# Fitted bias term and per-feature weights of the linear decision function.
clf.intercept_, clf.coef_, 

## Quality evaluation

In [None]:
# Candidate model 1 for the evaluation below: a decision tree of depth at most 5.
# This and the next two cells each rebind clf, so only the last one executed is evaluated.
clf = DecisionTreeClassifier(max_depth=5)

In [None]:
# Candidate model 2: K-NN with its settings spelled out explicitly
# (5 neighbours, uniform vote weights, Minkowski distance with p=2, i.e. Euclidean).
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2)

In [None]:
# Candidate model 3: logistic regression. When all cells are run in order, this is the
# model evaluated below, because it is the last of three consecutive cells that rebind clf.
clf = LogisticRegression(C=1)

In [None]:
# Train the currently selected model, then score the test split two ways:
# hard labels (Y_hat) and the probability of the second class in clf.classes_ (P_hat).
Y_hat = clf.fit(X_train, Y_train).predict(X_test)
test_probabilities = clf.predict_proba(X_test)
P_hat = test_probabilities[:, 1]

In [None]:
# Fraction of test rows whose predicted label equals the true label.
skl.metrics.accuracy_score(Y_test, Y_hat)

In [None]:
# Confusion matrix of true vs. predicted test labels.
sklearn.metrics.confusion_matrix(Y_test, Y_hat)
# By definition, entry C[i, j] of a confusion matrix C is the number of observations known to be in group i
# but predicted to be in group j.

#### Log-likelihood score

In [None]:
# Log-likelihood of the true test labels under the predicted probabilities.
# P_hat is the probability of clf.classes_[1], so rows whose true label is the other
# class contribute log(1 - P_hat). The previous sum(log(P_hat)) ignored the true labels.
# where() picks the probability assigned to each row's true class, so a confident
# correct prediction (probability exactly 1) contributes 0 instead of producing NaN.
sum(log(where(Y_test == clf.classes_[1], P_hat, 1 - P_hat)))

#### Brier loss

In [None]:
# Brier score: mean squared difference between the predicted probability and the 0/1 outcome
# (lower is better).
skl.metrics.brier_score_loss(Y_test, P_hat)

#### ROC curve

* binary classification
* confusion matrix
* Bayes minimum cost decision rule
* ROC

In [None]:
# ROC curve: false/true positive rates obtained by sweeping the decision threshold over P_hat.
(fpr, tpr, thresh) = skl.metrics.roc_curve(Y_test, P_hat)
# Plot TPR against FPR and label the figure.
plot(fpr,tpr)
xlabel('false positive rate')
ylabel('true positive rate')
title('ROC curve')

In [None]:
# Area under the ROC curve, integrated from the (fpr, tpr) points computed above.
sklearn.metrics.auc(fpr,tpr)

In [None]:
# The same area computed directly from the true labels and predicted scores.
sklearn.metrics.roc_auc_score(Y_test, P_hat)

## Optimizing parameters

In [None]:
from common.visualize.method import show_param_dependency

In [None]:
# Sweep the maximum tree depth on the training split.
# show_param_dependency is a project helper; presumably it evaluates clf for each value
# in param_vals and plots the score against the parameter — TODO confirm.
depth_values = [1,3,5,8,10,15,20,50]
clf = DecisionTreeClassifier(max_depth=5)
show_param_dependency(clf, X_train, Y_train, param_name='max_depth',
                      param_vals=depth_values, x_label='max depth of tree')

In [None]:
# Sweep the number of neighbours for K-NN on the training split.
# show_param_dependency is a project helper; presumably it evaluates clf for each value
# in param_vals and plots the score against the parameter — TODO confirm.
neighbor_values = [1,3,5,8,10,15,20,50]
clf = KNeighborsClassifier(n_neighbors=5)
show_param_dependency(clf, X_train, Y_train, param_name='n_neighbors',
                      param_vals=neighbor_values, x_label='n_neighbors for K-NN')

In [None]:
# Sweep the logistic-regression parameter C over eleven powers of ten, 1e-5 .. 1e5.
# show_param_dependency is a project helper; presumably it evaluates clf for each value
# in param_vals and plots the score against the parameter — TODO confirm.
C_values = 10**linspace(-5,5,11)
clf = LogisticRegression()
show_param_dependency(clf, X_train, Y_train, param_name='C',
                      param_vals=C_values, x_label='C for logistic regression')

## Log scale example

In [None]:
# Nine evenly spaced exponents -4, -3, ..., 4 and the corresponding powers of ten.
seq = linspace(-4,4,9)
10**seq

In [None]:
# On a linear x-axis the powers of ten crowd near zero, so most points are indistinguishable.
plot(10**seq,seq)

In [None]:
# Same data with a logarithmic x-axis: the points become evenly spaced.
plot(10**seq,seq)
xscale('log')

In [None]:
# Repeat the sweep over C, this time displayed on a logarithmic x-axis.
clf = LogisticRegression()
show_param_dependency(clf, X_train, Y_train, param_name='C', param_vals=10**linspace(-5,5,11), x_label='C for logistic regression')
# xscale acts on the current axes; this assumes show_param_dependency draws onto the
# current matplotlib axes — TODO confirm.
xscale('log')

## Grid search optimization

In [None]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search module
# was deprecated and later removed. Fall back to it only on old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [None]:
# Cross-validated grid search for K-NN over the neighbour count and the Minkowski power p.
# refit=True retrains the best configuration on all of the data passed to fit.
param_grid = {'n_neighbors':[1,3,5,8,10,15,20,50], 'p':[1,2,4,6]}
base_knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2)
clf = GridSearchCV(base_knn, param_grid, n_jobs=-1, refit=True)

In [None]:
# Run the search on the training split, then show the best mean cross-validated score
# and the parameter combination that achieved it.
clf.fit(X_train,Y_train)
clf.best_score_, clf.best_params_

In [None]:
# Cross-validated grid search for a decision tree over the split criterion and the
# minimum number of samples per leaf.
param_grid = {'criterion':['gini','entropy'], 'min_samples_leaf':[1,3,10,50,100,500]}
base_tree = DecisionTreeClassifier()
clf = GridSearchCV(base_tree, param_grid, n_jobs=-1, refit=True)

In [None]:
# Run the tree search on the training split and show the best score and parameters.
clf.fit(X_train,Y_train)
clf.best_score_, clf.best_params_

In [None]:
# Cross-validated grid search for logistic regression over C = 1e-5 .. 1e5 (powers of ten).
param_grid = {'C':10**linspace(-5,5,11)}
base_logreg = LogisticRegression(C=1)
clf = GridSearchCV(base_logreg, param_grid, n_jobs=-1, refit=True)

In [None]:
# Run the logistic-regression search on the training split and show the best score and parameters.
clf.fit(X_train,Y_train)
clf.best_score_, clf.best_params_

# Not explored here

#### Create new instance with the same parameters

In [None]:
# Build a fresh, unfitted estimator that carries the best parameters found by the search.
# Fix: the previous code always created a DecisionTreeClassifier, but at this point clf is
# the grid search over LogisticRegression, whose best_params_ ({'C': ...}) are not valid
# tree parameters, so set_params raised. Cloning the search's own base estimator keeps
# the estimator type and the parameters consistent whichever search ran last.
from sklearn.base import clone

clf2 = clone(clf.estimator)
clf2.set_params(**clf.best_params_)

* how does adding information about discrete features improve classification?
* is probability encoding, one-hot encoding, or both together better?
* should we use raw continuous features or normalized continuous features?
* which type of normalization is better?
* are there any outliers?
* will our accuracy improve, if we exclude outliers from the training set?
* visualize features
* visualize dependency between output and features
* what ideas of feature transformations can we get from looking at these visualizations?
* can our data be clustered into different groups?
* can cluster information be a useful feature for classification?
* should we reweight observations while training?
* etc.