# Machine Learning Engineer Nanodegree

## Project 5: Capstone

## 1. Load the Data

In [1]:
# Import libraries
import numpy as np
import pandas as pd


# Read the training CSV. 'CFA' is kept as the label vector; the feature matrix
# is every remaining column except 'ID', which is dropped together with 'CFA'.
train_data = pd.read_csv("final_train_data.csv")
target_train = train_data['CFA']
feature_train = train_data.drop(labels=['CFA', 'ID'], axis=1)
print("Training data read successfully!")

Training data read successfully!


In [3]:
# Read the validation CSV and split it the same way as the training data:
# 'CFA' is the label, and 'CFA' plus 'ID' are removed from the feature matrix.
validate_data = pd.read_csv("validating_data.csv")
target_validate = validate_data['CFA']
feature_validate = validate_data.drop(labels=['CFA', 'ID'], axis=1)
print("validating data read successfully!")

validating data read successfully!


In [5]:
# Read the held-out test CSV: 'CFA' is the label, and 'CFA' plus 'ID' are
# removed from the feature matrix.
test_data = pd.read_csv("test_data.csv")
target_test = test_data['CFA']
feature_test = test_data.drop(labels=['CFA', 'ID'], axis=1)
print("testing data read successfully!")

testing data read successfully!


**Basic information about the training, validation, and testing datasets.**

In [6]:
# Show the (rows, columns) shape of each feature matrix so the three splits
# can be compared. Each line is built as a single string so the printed text
# is the same under the print statement and the print function.
print('{} {}'.format('Number of rows and columns for training data set:', feature_train.shape))
print('{} {}'.format('Number of rows and columns for validating data set:', feature_validate.shape))
print('{} {}'.format('Number of rows and columns for testing data set:', feature_test.shape))

Number of rows and columns for training data set: (201606, 112)
Number of rows and columns for validating data set: (574, 112)
Number of rows and columns for testing data set: (674, 112)


## 2. Training and Evaluating Models


**The following supervised learning models were selected:**
- Gaussian Naive Bayes (GaussianNB)
- Decision Trees
- Logistic Regression
- Ensemble Methods (AdaBoost)

**Evaluation metric: F1 score.** $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$

In [7]:
# import algorithm from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import AdaBoostClassifier as AB
from sklearn.linear_model import LogisticRegression as LR

from sklearn.metrics import f1_score
from time import time

**2.1 GaussianNB**

In [5]:
# Baseline: Gaussian Naive Bayes with default settings.
# The timer covers fitting, prediction and scoring together.
start = time()
clf = GaussianNB().fit(feature_train, target_train)
pred = clf.predict(feature_validate)
f1 = f1_score(y_true=target_validate, y_pred=pred)
end = time()
# NOTE: the label below says "test set", but the score is computed on the
# validation split (feature_validate / target_validate).
print("F1 score for test set: {:.4f}.".format(f1))
print("Process in {:.4f} seconds.".format(end - start))
# full dataset with the score of 0.8285

F1 score for test set: 0.8213.
Process in 1.8210 seconds.


**2.2 Decision Tree**

In [6]:
# Baseline: decision tree with default settings.
# The timer covers fitting, prediction and scoring together.
start = time()
clf = DT().fit(feature_train, target_train)
pred = clf.predict(feature_validate)
f1 = f1_score(y_true=target_validate, y_pred=pred)
end = time()
# NOTE: the label below says "test set", but the score is computed on the
# validation split (feature_validate / target_validate).
print("F1 score for test set: {:.4f}.".format(f1))
print("Process in {:.4f} seconds.".format(end - start))

F1 score for test set: 0.8402.
Process in 9.9120 seconds.


**2.3 AdaBoost**

In [7]:
# Baseline: AdaBoost with default settings.
# The timer covers fitting, prediction and scoring together.
start = time()
clf = AB().fit(feature_train, target_train)
pred = clf.predict(feature_validate)
f1 = f1_score(y_true=target_validate, y_pred=pred)
end = time()
# NOTE: the label below says "test set", but the score is computed on the
# validation split (feature_validate / target_validate).
print("F1 score for test set: {:.4f}.".format(f1))
print("Process in {:.4f} seconds.".format(end - start))

F1 score for test set: 0.8449.
Process in 32.0410 seconds.


**2.4 Logistic Regression**

In [8]:
# Baseline: logistic regression with default settings.
# The timer covers fitting, prediction and scoring together.
start = time()
clf = LR().fit(feature_train, target_train)
pred = clf.predict(feature_validate)
f1 = f1_score(y_true=target_validate, y_pred=pred)
end = time()
# NOTE: the label below says "test set", but the score is computed on the
# validation split (feature_validate / target_validate).
print("F1 score for test set: {:.4f}.".format(f1))
print("Process in {:.4f} seconds.".format(end - start))

F1 score for test set: 0.8425.
Process in 3.4310 seconds.


## 3. Choosing the Best Model and Model Tuning


### Choosing the Best Model


**3.1 Decision Tree**

In [10]:
from sklearn.metrics import f1_score, make_scorer

# GridSearchCV has lived in sklearn.model_selection since scikit-learn 0.18,
# and the old sklearn.grid_search module was removed in 0.20. Try the current
# location first and fall back so the cell runs on both old and new installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Untuned decision tree used as the template for every grid candidate.
# NOTE: random_state is not set, so candidates with splitter='random' can
# score differently from one run to the next.
clf = DT()

# Hyper-parameter grid to search (2 * 2 * 3 * 3 = 36 combinations)
parameters = {'criterion': ('gini', 'entropy'),
              'splitter': ('best', 'random'),
              'min_samples_split': [2, 10, 20],
              'max_leaf_nodes': [5, 30, 100]}

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(feature_train, target_train)

# Keep the best estimator found by the search
clf = grid_obj.best_estimator_

print(clf)

# Report the F1 score of the tuned model on the validation split.
# NOTE: the printed label says "test set", but feature_validate /
# target_validate are what is being scored here.
pred = clf.predict(feature_validate)
f1 = f1_score(target_validate, pred)
print("F1 score for test set: {:.4f}.".format(f1))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=100, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='random')
F1 score for test set: 0.8514.


**3.2 AdaBoost**

In [12]:
# Start from AdaBoost with library defaults; only the ensemble size is tuned.
clf = AB()

# Candidate numbers of boosting rounds to compare
parameters = {"n_estimators": [1, 50, 100]}

# Wrap f1_score so GridSearchCV can use it to rank candidates
f1_scorer = make_scorer(f1_score)

# Build the search and fit it on the training split in one chained call
# (fit returns the search object itself)
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters,
                        scoring=f1_scorer).fit(feature_train, target_train)

# Keep the best estimator found by the search
clf = grid_obj.best_estimator_

print(clf)

# Score the tuned model on the validation split.
# NOTE: the printed label says "test set", but feature_validate /
# target_validate are what is being scored here.
pred = clf.predict(feature_validate)
f1 = f1_score(y_true=target_validate, y_pred=pred)
print("F1 score for test set: {:.4f}.".format(f1))

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)
F1 score for test set: 0.8478.


**3.3 Logistic Regression**

In [9]:
# Initialize the classifier.
# The grid below includes penalty='l1', which needs a solver that supports it.
# 'liblinear' was the implicit default when this notebook was written (see the
# printed estimator), but newer scikit-learn releases default to 'lbfgs',
# which only handles l2. Pinning the solver keeps every candidate valid.
clf = LR(solver='liblinear')

# Create the parameters list you wish to tune
parameters = {'penalty': ('l1', 'l2'),
              'C': [0.01, 0.1, 1.0, 10, 20]}

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(feature_train, target_train)

# Keep the best estimator found by the search
clf = grid_obj.best_estimator_

print(clf)

# Report the F1 score of the tuned model on the validation split.
# NOTE: the printed label says "test set", but feature_validate /
# target_validate are what is being scored here.
pred = clf.predict(feature_validate)
f1 = f1_score(target_validate, pred)
print("F1 score for test set: {:.4f}.".format(f1))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
F1 score for test set: 0.8420.


### Final F<sub>1</sub> Score and Robustness of the final classifier

**The best validation F1 score (0.8514) is achieved by DecisionTreeClassifier** (class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=100, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='random')

In [11]:
# Rebuild the tuned decision tree from the grid-search result.
# Only the settings that matter are passed: the other keyword arguments in the
# printed estimator are library defaults, and some of them (e.g. presort) no
# longer exist in newer scikit-learn releases.
# splitter='random' draws random split thresholds, so a fixed random_state is
# required for the scores below to be reproducible from run to run.
clf = DT(criterion='gini', splitter='random', min_samples_split=10,
         max_leaf_nodes=100, random_state=42)
clf.fit(feature_train, target_train)

# F1 on the data the model was fitted on
pred_train = clf.predict(feature_train)
f1_train = f1_score(target_train, pred_train)

# F1 on the held-out test split
pred_test = clf.predict(feature_test)
f1_test = f1_score(target_test, pred_test)

print("F1 score for training set: {:.4f}.".format(f1_train))
print("F1 score for testing set: {:.4f}.".format(f1_test))


F1 score for training set: 0.8765.
F1 score for testing set: 0.8630.


### Confusion matrix


In [12]:
from sklearn.metrics import confusion_matrix

# The four models to compare, built with the tuned settings found above.
# Only non-default settings are passed: the remaining keyword arguments in the
# printed estimators are library defaults, and several of them (presort,
# base_estimator, algorithm, multi_class) were later deprecated or removed
# from scikit-learn. A fixed random_state makes the matrices reproducible.
clf_A = GaussianNB()

clf_B = DT(criterion='gini', splitter='random', min_samples_split=10,
           max_leaf_nodes=100, random_state=42)

clf_C = AB(n_estimators=100, random_state=42)

# solver='liblinear' is kept explicit because the l1 penalty requires it.
clf_D = LR(C=0.1, penalty='l1', solver='liblinear', random_state=42)

# Fit each model on the training split and print its confusion matrix on the
# test split, in the order GaussianNB, Decision Tree, AdaBoost, Logistic
# Regression. Rows are true classes and columns are predicted classes.
for clf in [clf_A, clf_B, clf_C, clf_D]:
    clf.fit(feature_train, target_train)
    pred = clf.predict(feature_test)
    matrix = confusion_matrix(target_test, pred)
    print(matrix)
    print('\n')

[[ 41 131]
 [ 40 462]]


[[ 37 135]
 [ 18 484]]


[[ 32 140]
 [  9 493]]


[[ 19 153]
 [ 10 492]]


