In [None]:
#Imports of all necessary Python Packages
import sklearn as skl
import numpy as np
import pandas as pd
from time import time

#Decomposition
from sklearn.decomposition import PCA


#Classifiers
from sklearn.svm import SVC
from sklearn.svm import NuSVC

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegressionCV

from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
#Evaluation

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn import cross_validation
from sklearn.cross_validation import train_test_split

In [None]:
# Load the data to classify here.
# Placeholder: 100 samples with 5 random features each and random binary (0/1) labels.
# A dedicated, seeded generator keeps the placeholder data identical on every
# "Restart & Run All", so scores further down are comparable between runs.
rng = np.random.RandomState(42)
data = rng.rand(100, 5)
labels = rng.randint(2, size=100)

In [None]:
# Feature extraction: project the samples onto their first two principal components.
# NOTE(review): the PCA is fitted on the complete data set, i.e. before any
# train/test or cross-validation split -- confirm this is acceptable for the
# evaluation cells that consume data_decomposed.
pca = PCA(n_components=2)
data_decomposed = pca.fit_transform(data)

In [None]:
# Instantiate a few of the most common scikit-learn classifiers.
# Hyper-parameters are spelled out explicitly so they can be tuned in place.

# Support vector machine with an RBF kernel
classfSVM = SVC(
    C=1,
    kernel='rbf',
    gamma='auto',
    tol=1e-3,
)

# SVM variant whose `nu` parameter controls the fraction of support vectors
classfNuSVM = NuSVC(
    nu=0.5,
    kernel='rbf',
    gamma='auto',
    tol=1e-3,
)

# Linear discriminant analysis
classfLDA = LinearDiscriminantAnalysis()

# Quadratic discriminant analysis
classfQDA = QuadraticDiscriminantAnalysis()

# k-nearest neighbours (k = 3); n_jobs=-1 is passed to run the neighbour search in parallel
classfKNN = KNeighborsClassifier(
    n_neighbors=3,
    n_jobs=-1,
)

# Decision tree
classfDecisionTree = DecisionTreeClassifier()

# AdaBoost with 50 boosting rounds
classfAdaBoost = AdaBoostClassifier(
    base_estimator=None,
    learning_rate=1,
    n_estimators=50,
)

# Random forest with 10 trees
classfRandForest = RandomForestClassifier(
    n_estimators=10,
    n_jobs=-1,
)

In [None]:
# Combine the best-performing classifiers into a soft-voting ensemble.
# Each entry is a (name, estimator) pair.
estimators = [
    ('lda', LinearDiscriminantAnalysis()),
    ('logReg', LogisticRegressionCV()),
    ('adaB', AdaBoostClassifier(base_estimator=None, learning_rate=1, n_estimators=50)),
]
classfVC = VotingClassifier(estimators=estimators, voting='soft')

In [None]:
# Quick check of one classifier on a single 80/20 hold-out split.
# DON'T trust these results, there is a high chance of overfitting.
# Use ONLY to get a feel for the time cross-validation will take.

classf = classfVC
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.20, random_state=42)

# The timer covers both fitting and scoring.
t0 = time()
classf.fit(train_data, train_labels)
# Single-argument print(...) produces the same output under Python 2 and
# Python 3, unlike the bare print statement.
print("Score of Classifier: " + str(classf.score(test_data, test_labels)))
print("done in %0.3fs." % (time() - t0))

In [None]:
# Analyze the confusion matrix of a single classifier on the held-out split.
# The classifier is fitted here first: when the notebook is run top to bottom,
# classfSVM has not been trained at this point, and predict() on an unfitted
# estimator raises an error.
# Uses train_data / train_labels / test_data / test_labels produced by the
# train_test_split call in the previous cell; test_data is no longer overwritten
# with the full data set, so the matrix reflects unseen samples only.
classf = classfSVM
classf.fit(train_data, train_labels)
true_labels = test_labels
predicted_labels = classf.predict(test_data)
# Last expression of the cell: displayed as the cell output.
confusion_matrix(true_labels, predicted_labels)

In [None]:
# Select the classifier, the data and the number of folds used for cross-validation.
classf_cv = classfVC
data_cv = data_decomposed
N_CV = 10

# Cross-validation: one score per fold, computed in parallel (n_jobs=-1).
t0 = time()
scores = cross_validation.cross_val_score(classf_cv, data_cv, labels, n_jobs=-1, cv=N_CV)

# Single-argument print(...) is used throughout so the cell prints the same
# text under Python 2 and Python 3.
print("Scores: ")
for i, score in enumerate(scores):
    print('\t' + str(i) + ':\t' + str(score))
# Mean score with a spread of two standard deviations across the folds.
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
print("\nCross val done in %0.3fs." % (time() - t0))

In [None]:
# Test different classifiers on the same hold-out split of the PCA-reduced data.

classf_lst = [classfLDA, classfQDA, classfAdaBoost, classfSVM]

train_data, test_data, train_labels, test_labels = train_test_split(
    data_decomposed, labels, test_size=0.20, random_state=42)

for classf in classf_lst:
    # Print the estimator's repr so each score can be matched to its settings.
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(classf)
    t0 = time()
    classf.fit(train_data, train_labels)
    print("Score of Classifier: " + str(classf.score(test_data, test_labels)))
    print("done in %0.3fs." % (time() - t0))

In [None]:
# Fine-tune with grid search

# Fresh (name, estimator) pairs for the voting ensemble inside the pipeline.
estimators = [
    ('lda', LinearDiscriminantAnalysis()),
    ('logReg', LogisticRegressionCV()),
    ('adaB', AdaBoostClassifier(base_estimator=None, learning_rate=1, n_estimators=50)),
]

# PCA followed by a hard-voting ensemble.
pipeline = Pipeline([
    ('pca', PCA()),
    ('VC', VotingClassifier(estimators=estimators, voting='hard')),
])

# Parameter grid for the exhaustive search. Keys follow the
# <step>__<parameter> convention, nested once more for the ensemble member.
parameters = {
    'pca__n_components': (2, 3),
    'VC__adaB__n_estimators': (40, 50, 80),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)


In [None]:
# Run the grid search on the raw data (the pipeline applies PCA itself) and time it.
t0 = time()
grid_search.fit(data, labels)
print("done in %0.3fs" % (time() - t0))
# print("") emits a blank line on both Python 2 and Python 3; a bare print()
# would output "()" under the Python 2 print statement.
print("")

# Report the best score and the winning value of every tuned parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))