In [56]:
import numpy as np
import mltools as ml
import sklearn.model_selection
from sklearn import *
from sklearn.metrics import roc_curve, roc_auc_score
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [36]:
# Number of features every selection / extraction method below should keep.
NUMBER_OF_FEATURES_WANTED = 6

# Load the comma-separated wine table as a 2-D float array.
combined_data = np.genfromtxt("combined_wine.csv", delimiter=",")

# Keep only the rows in which no column is NaN.
combined_data = combined_data[(~np.isnan(combined_data)).all(axis=1)]

# The last column is the target; every column before it is a feature.
X_combined = combined_data[:, :-1]
Y_combined = combined_data[:, -1]

# Shuffle features and targets together.
# NOTE(review): no seed is set before this call, so the row order presumably
# differs between kernel runs — confirm against mltools.shuffleData.
X, Y = ml.shuffleData(X_combined, Y_combined)
print(X.shape)

(6497, 12)


https://machinelearningmastery.com/feature-selection-machine-learning-python/
Read this article

In [39]:
# Univariate Selection

def unvariateSelection(number_of_features, precision = 3):
    """Select the best-scoring columns of the notebook-level ``X``.

    A ``SelectKBest`` selector using the ``f_classif`` score function is
    fitted on the module-level ``X`` and ``Y``. The per-feature scores and the
    shape of the reduced matrix are printed.

    Args:
        number_of_features: value passed to ``SelectKBest`` as ``k``.
        precision: numpy print precision used for the scores. This is set
            through ``set_printoptions`` and therefore stays in effect for
            later prints as well.

    Returns:
        ``X`` transformed by the fitted selector.
    """
    selector = SelectKBest(score_func=f_classif, k=number_of_features)
    selector.fit(X, Y)

    # Show the score of every original feature.
    set_printoptions(precision=precision)
    print(selector.scores_)

    # Reduce X to the selected columns and report the resulting shape.
    selected = selector.transform(X)
    print(selected.shape)
    return selected

# Reduce X with univariate selection, then split it into training / validation parts.
# NOTE(review): test_size=0.8 puts 80% of the rows in the validation part and
# leaves 20% for training — confirm this ratio is intended.
X_unvariate = unvariateSelection(number_of_features=NUMBER_OF_FEATURES_WANTED)
Xtr_unvariate, Xva_unvariate, Ytr_unvariate, Yva_unvariate = sklearn.model_selection.train_test_split(
    X_unvariate,
    Y,
    test_size=0.8,
    random_state=0,
)

[2016.51  4829.317  236.389  899.766 2315.829 1858.136 6252.796 1169.655
  789.05  2021.708    7.068   93.812]
(6497, 6)


In [44]:
# Recursive Feature Elimination

def recursiveFeatureElimination(number_of_features, solver = 'lbfgs', max_iter = 2000):
    """Select features of the notebook-level ``X`` with recursive feature elimination.

    An ``RFE`` selector wrapping a ``LogisticRegression`` estimator is fitted
    on the module-level ``X`` and ``Y``. The number of kept features, the
    support mask, the feature ranking and the shape of the reduced matrix are
    printed.

    Args:
        number_of_features: number of features RFE should keep.
        solver: solver passed to ``LogisticRegression``.
        max_iter: iteration limit passed to ``LogisticRegression``.

    Returns:
        ``X`` transformed by the fitted selector.
    """
    model = LogisticRegression(solver=solver, max_iter=max_iter)
    # Pass n_features_to_select by keyword: it is keyword-only in
    # scikit-learn >= 1.0, where the positional form raises TypeError.
    rfe = RFE(model, n_features_to_select=number_of_features)
    fit = rfe.fit(X, Y)
    features = fit.transform(X)
    print("Num Features: %d" % fit.n_features_)
    print("Selected Features: %s" % fit.support_)
    print("Feature Ranking: %s" % fit.ranking_)
    print(features.shape)
    return features

# Reduce X with recursive feature elimination, then split it into training / validation parts.
X_recursive = recursiveFeatureElimination(number_of_features=NUMBER_OF_FEATURES_WANTED)
Xtr_recursive, Xva_recursive, Ytr_recursive, Yva_recursive = sklearn.model_selection.train_test_split(
    X_recursive,
    Y,
    test_size=0.8,
    random_state=0,
)

Num Features: 6
Selected Features: [ True  True  True False  True False False False  True  True False False]
Feature Ranking: [1 1 1 3 1 6 5 2 1 1 4 7]
(6497, 6)


In [43]:
#Principal Component Analysis
# feature extraction
def principalComponentAnalysis(number_of_features):
    """Project the notebook-level ``X`` onto its leading principal components.

    A ``PCA`` model with ``number_of_features`` components is fitted on the
    module-level ``X``. The explained-variance ratios, the component matrix
    and the shape of the projected data are printed.

    NOTE(review): ``X`` is not rescaled inside this function before PCA is
    fitted — confirm that is intended.

    Args:
        number_of_features: value passed to ``PCA`` as ``n_components``.

    Returns:
        ``X`` transformed by the fitted PCA model.
    """
    pca = PCA(n_components=number_of_features)
    pca.fit(X)
    projected = pca.transform(X)

    # Summarize the fitted components.
    print("Explained Variance: %s" % pca.explained_variance_ratio_)
    print(pca.components_)
    print(projected.shape)
    return projected

# Project X with PCA, then split the projection into training / validation parts.
X_principal = principalComponentAnalysis(number_of_features=NUMBER_OF_FEATURES_WANTED)
Xtr_principal, Xva_principal, Ytr_principal, Yva_principal = sklearn.model_selection.train_test_split(
    X_principal,
    Y,
    test_size=0.8,
    random_state=0,
)

Explained Variance: [9.536e-01 4.062e-02 4.826e-03 4.944e-04 3.467e-04 1.364e-04]
[[-7.408e-03 -1.184e-03  4.869e-04  4.102e-02 -1.682e-04  2.305e-01
   9.722e-01  1.772e-06 -6.555e-04 -7.043e-04 -5.452e-03 -5.327e-04]
 [-5.372e-03 -7.870e-04 -2.472e-04  1.863e-02  6.684e-05  9.726e-01
  -2.314e-01  1.278e-06  6.480e-04  3.465e-04  2.879e-03  9.152e-03]
 [ 2.385e-02  9.047e-04  1.922e-03  9.952e-01  1.766e-04 -2.713e-02
  -3.585e-02  4.608e-04 -6.911e-03 -1.936e-03 -8.260e-02 -8.792e-03]
 [ 7.134e-01  2.400e-02  2.403e-02 -7.050e-02  9.905e-03  1.081e-02
   2.261e-03  1.439e-03 -2.761e-02  2.236e-02 -6.098e-01 -3.341e-01]
 [ 6.939e-01 -1.797e-02  5.098e-02  3.555e-02 -3.428e-03 -6.792e-04
   7.397e-03 -1.705e-04 -3.771e-02  1.672e-02  5.857e-01  4.118e-01]
 [ 5.683e-02  4.207e-02 -3.093e-03  3.496e-02 -2.405e-03  6.097e-03
   3.896e-05 -4.731e-04 -5.698e-03 -1.903e-02  5.270e-01 -8.459e-01]]
(6497, 6)


Now that we have the feature-selected data for all three methods, we can use
random forest, neural network, and decision tree classifiers to measure their performance.

In [3]:
# Baseline split on the full feature matrix (no feature selection applied).
# NOTE(review): test_size=0.8 puts 80% of the rows in the validation part and
# leaves 20% for training — confirm this ratio is intended.
Xtr, Xva, Ytr, Yva = sklearn.model_selection.train_test_split(
    X,
    Y,
    test_size=0.8,
    random_state=0,
)

In [54]:
#RANDOM FOREST

def runRandomForest(Xtr, Ytr,Xva, Yva, number_of_features, number_estimators = 20, leafs = 2, depth = 1, random_state = 0):
    """Fit a random forest and print its accuracy and ROC AUC on both splits.

    Args:
        Xtr, Ytr: training features and targets.
        Xva, Yva: validation features and targets.
        number_of_features: passed to the forest as ``max_features``.
        number_estimators: passed as ``n_estimators``.
        leafs: passed as ``min_samples_leaf``.
        depth: passed as ``max_depth``.
        random_state: seed passed to the forest so the printed metrics are
            repeatable between runs; pass ``None`` for unseeded behaviour.

    Returns:
        None. The results are only printed.

    Tuning notes kept from the original author:
        * a larger ``n_estimators`` (e.g. 50) increases complexity / variance;
        * ``max_depth`` of 2 or 3 increases complexity / variance;
        * lowering ``max_features`` reduces complexity / variance and
          increases bias.
    """
    # oob_score=True was dropped: the out-of-bag score was computed but could
    # never be read, because the fitted forest is not returned or printed.
    random_forest_classifier = sklearn.ensemble.RandomForestClassifier(
        n_estimators=number_estimators,
        max_features=number_of_features,
        min_samples_leaf=leafs,
        max_depth=depth,
        random_state=random_state,
    )
    random_forest_classifier.fit(Xtr, Ytr)

    # Accuracy, printed in this order: training split, then validation split.
    print("Score:", random_forest_classifier.score(Xtr, Ytr))
    print("Score:", random_forest_classifier.score(Xva, Yva))

    # Article about Area Under the Curve (AUC):
    # https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5
    # Column 1 of predict_proba is used as the score, which assumes a
    # two-class target — TODO confirm.
    Ytr_score = random_forest_classifier.predict_proba(Xtr)[:, 1]
    Yva_score = random_forest_classifier.predict_proba(Xva)[:, 1]
    print("Training AUC: {}".format(roc_auc_score(Ytr, Ytr_score)))
    print("Validation AUC: {}".format( roc_auc_score(Yva, Yva_score)))

# Evaluate the random forest on the full feature set and on each reduced set.
# Each entry: (heading to print, (Xtr, Ytr, Xva, Yva), max_features for the forest).
_rf_runs = [
    ("Base Without Feature Selection", (Xtr, Ytr, Xva, Yva), X.shape[1]),
    ("\nWith Unvariate Selection", (Xtr_unvariate, Ytr_unvariate, Xva_unvariate, Yva_unvariate), NUMBER_OF_FEATURES_WANTED),
    ("\nWith Recursive Feature Elimination", (Xtr_recursive, Ytr_recursive, Xva_recursive, Yva_recursive), NUMBER_OF_FEATURES_WANTED),
    ("\nWith Principal Component Analysis", (Xtr_principal, Ytr_principal, Xva_principal, Yva_principal), NUMBER_OF_FEATURES_WANTED),
]
for _heading, _split, _max_features in _rf_runs:
    print(_heading)
    runRandomForest(*_split, _max_features)

Base Without Feature Selection
Score: 0.945342571208622
Score: 0.9249711427472105
Training AUC: 0.9844127691839102
Validation AUC: 0.9746768526557834

With Unvariate Selection
Score: 0.9361046959199384
Score: 0.9324740284724894
Training AUC: 0.9784093846745567
Validation AUC: 0.9794940483636126

With Recursive Feature Elimination
Score: 0.9207082371054658
Score: 0.9201616006156214
Training AUC: 0.9359437953581043
Validation AUC: 0.9279874512786385

With Principal Component Analysis
Score: 0.9260969976905312
Score: 0.9228549442093112
Training AUC: 0.9017553704626133
Validation AUC: 0.8990895672492384


In [52]:
#NEURAL NETWORK

def runNeuralNetwork(Xtr, Ytr,Xva, Yva, hidden_layers = (8, 8, 8), activation_func = 'relu', solver= 'adam', max_iter = 1000, random_state = 0):
    """Fit an ``MLPClassifier`` and print its accuracy and ROC AUC on both splits.

    NOTE(review): the inputs are not rescaled inside this function before
    training — confirm that is intended.

    Args:
        Xtr, Ytr: training features and targets.
        Xva, Yva: validation features and targets.
        hidden_layers: passed as ``hidden_layer_sizes``.
        activation_func: passed as ``activation``.
        solver: passed as ``solver``.
        max_iter: passed as ``max_iter``.
        random_state: seed passed to the classifier so the printed metrics
            are repeatable between runs; pass ``None`` for unseeded behaviour.

    Returns:
        None. The results are only printed.
    """
    mlp_classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        activation=activation_func,
        solver=solver,
        max_iter=max_iter,
        random_state=random_state,
    )
    mlp_classifier.fit(Xtr, Ytr)

    # Accuracy, printed in this order: training split, then validation split.
    print("Score:", mlp_classifier.score(Xtr, Ytr))
    print("Score:", mlp_classifier.score(Xva, Yva))

    # Column 1 of predict_proba is used as the score, which assumes a
    # two-class target — TODO confirm.
    Ytr_score = mlp_classifier.predict_proba(Xtr)[:, 1]
    Yva_score = mlp_classifier.predict_proba(Xva)[:, 1]
    print("Training AUC: {}".format(roc_auc_score(Ytr, Ytr_score)))
    print("Validation AUC: {}".format( roc_auc_score(Yva, Yva_score)))

# Evaluate the neural network on the full feature set and on each reduced set.
# Each entry: (heading to print, (Xtr, Ytr, Xva, Yva)).
_nn_runs = [
    ("Base Without Feature Selection", (Xtr, Ytr, Xva, Yva)),
    ("\nWith Unvariate Selection", (Xtr_unvariate, Ytr_unvariate, Xva_unvariate, Yva_unvariate)),
    ("\nWith Recursive Feature Elimination", (Xtr_recursive, Ytr_recursive, Xva_recursive, Yva_recursive)),
    ("\nWith Principal Component Analysis", (Xtr_principal, Ytr_principal, Xva_principal, Yva_principal)),
]
for _heading, _split in _nn_runs:
    print(_heading)
    runNeuralNetwork(*_split)

Base Without Feature Selection
Score: 0.985373364126251
Score: 0.9774913428241632
Training AUC: 0.9969379621408991
Validation AUC: 0.9907784636815541

With Unvariate Selection
Score: 0.9799846035411856
Score: 0.9715275105809927
Training AUC: 0.9963398255978915
Validation AUC: 0.993097702903518

With Recursive Feature Elimination
Score: 0.970746728252502
Score: 0.9630627164293959
Training AUC: 0.9922698125461068
Validation AUC: 0.9852536512876449

With Principal Component Analysis
Score: 0.9522709776751347
Score: 0.9470950365525203
Training AUC: 0.9895396566182637
Validation AUC: 0.9743306598085669


In [57]:
#Decision Tree

def runDecisionTree(Xtr, Ytr,Xva, Yva, criterion = 'entropy', depth = 10, random_state = 0):
    """Fit a ``DecisionTreeClassifier`` and print its accuracy and ROC AUC on both splits.

    Args:
        Xtr, Ytr: training features and targets.
        Xva, Yva: validation features and targets.
        criterion: split criterion passed to the tree.
        depth: passed as ``max_depth``.
        random_state: seed passed to the tree so the printed metrics are
            repeatable between runs; pass ``None`` for unseeded behaviour.

    Returns:
        None. The results are only printed.
    """
    dt_classifier = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=depth,
        random_state=random_state,
    )
    dt_classifier.fit(Xtr, Ytr)

    # Accuracy, printed in this order: training split, then validation split.
    print("Score:", dt_classifier.score(Xtr, Ytr))
    print("Score:", dt_classifier.score(Xva, Yva))

    # Column 1 of predict_proba is used as the score, which assumes a
    # two-class target — TODO confirm.
    Ytr_score = dt_classifier.predict_proba(Xtr)[:, 1]
    Yva_score = dt_classifier.predict_proba(Xva)[:, 1]
    print("Training AUC: {}".format(roc_auc_score(Ytr, Ytr_score)))
    print("Validation AUC: {}".format( roc_auc_score(Yva, Yva_score)))

# Evaluate the decision tree on the full feature set and on each reduced set.
# Each entry: (heading to print, (Xtr, Ytr, Xva, Yva)).
_dt_runs = [
    ("Base Without Feature Selection", (Xtr, Ytr, Xva, Yva)),
    ("\nWith Unvariate Selection", (Xtr_unvariate, Ytr_unvariate, Xva_unvariate, Yva_unvariate)),
    ("\nWith Recursive Feature Elimination", (Xtr_recursive, Ytr_recursive, Xva_recursive, Yva_recursive)),
    ("\nWith Principal Component Analysis", (Xtr_principal, Ytr_principal, Xva_principal, Yva_principal)),
]
for _heading, _split in _dt_runs:
    print(_heading)
    runDecisionTree(*_split)

Base Without Feature Selection
Score: 1.0
Score: 0.9763370527125818
Training AUC: 1.0
Validation AUC: 0.9691416847153711

With Unvariate Selection
Score: 1.0
Score: 0.9759522893420547
Training AUC: 1.0
Validation AUC: 0.9665610599366561

With Recursive Feature Elimination
Score: 1.0
Score: 0.9653712966525587
Training AUC: 1.0
Validation AUC: 0.9513277728020975

With Principal Component Analysis
Score: 1.0
Score: 0.9392073874567142
Training AUC: 1.0
Validation AUC: 0.9194067877174635
