In [56]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from data_set import encode_target
from sklearn.feature_selection import *

# to download these libraries
# open the terminal and type : pip install +library name

# example : pip install pandas


names = ["length", "width", "size", "conc", "conc1",
         "asym", "m3long", "m3trans", "alpha", "dist", "class"]

features =  ["length", "width", "size", "conc", "conc1",
         "asym", "m3long", "m3trans", "alpha", "dist"]

data = pd.read_csv('all_data.txt', names=names)


data.head()


Unnamed: 0,length,width,size,conc,conc1,asym,m3long,m3trans,alpha,dist,class
0,99.105,43.0125,4.0978,0.0933,0.0481,-49.5231,78.408,27.7475,0.365,290.201,g
1,18.1865,14.1157,2.3829,0.5756,0.3458,7.6399,-16.0789,-5.9486,83.5322,210.051,h
2,18.25,15.333,2.3434,0.5125,0.3016,-7.7452,16.7866,8.702,68.708,207.648,g
3,25.4748,16.7252,2.4829,0.5066,0.3174,-7.8774,-19.3421,-14.3531,47.2952,156.579,h
4,178.482,102.199,3.3582,0.1389,0.0708,96.516,123.055,-86.3306,55.561,270.063,h


In [57]:
# feature selection using The Recursive Feature Elimination

# It uses the model accuracy to identify which attributes (and combination 
# of attributes) contribute the most to predicting the target attribute

# The classification algorithm below uses RFE with the logistic regression
# algorithm to select the top 3 features. The choice of algorithm does not 
# matter too much as long as it is skillful and consistent.

# separate classification from data
from sklearn.linear_model import LogisticRegression
array = data.values
X = array[:,0:10]
Y = array[:,10]
# feature extraction
model = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=100) 
rfe = RFE(model, 4)
fit = rfe.fit(X, Y)
print("Num Features:", fit.n_features_)
print("Selected Features:", fit.support_)
print("Feature Ranking:", fit.ranking_)

# features selected : size, conc, conc1, alpha

Num Features: 4
Selected Features: [ True  True  True False False False False False  True False]
Feature Ranking: [1 1 1 7 4 6 3 5 1 2]


In [58]:
# feature selection using feature importance

# Bagged decision trees like Random Forest and Extra Trees
# can be used to estimate the importance of features.

from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=1000)
model.fit(X, Y)
print(model.feature_importances_)

# features selected : alpha, length, size, width

[0.11362326 0.10113134 0.10155619 0.06458126 0.06606761 0.06456663
 0.08259608 0.05842145 0.27419066 0.07326551]


In [59]:
# making another train-test data based on feature selections

from sklearn.model_selection import train_test_split

data_elim = data.drop(columns=["conc", "asym", "m3long", "m3trans", "dist", "conc1"])
array_e = data_elim.values
Xe = array_e[:,0:4]
Ye = array_e[:,4]

# splitting data into train-test 

Xe_train, Xe_test, ye_train, ye_test = train_test_split( Xe, Ye, test_size = 0.3, random_state = 100)

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

data_elim.head()

Unnamed: 0,length,width,size,alpha,class
0,99.105,43.0125,4.0978,0.365,g
1,18.1865,14.1157,2.3829,83.5322,h
2,18.25,15.333,2.3434,68.708,g
3,25.4748,16.7252,2.4829,47.2952,h
4,178.482,102.199,3.3582,55.561,h


In [60]:
# decision tree classification
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=5) 
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

dot_data = tree.export_graphviz(dt, out_file=None, class_names=True)
graph = graphviz.Source(dot_data)
graph.render("Decision tree ")

print ("evaluation of decision tree without feature selection :")
print ("Accuracy:", accuracy_score(y_test, y_pred)*100)
print ("precision:", precision_score(y_test, y_pred, average="macro")*100)
print ("recall:", recall_score(y_test, y_pred, average="macro")*100)
print ("F-measure:", f1_score(y_test, y_pred, average="macro")*100)
print ("confusion:", confusion_matrix(y_test, y_pred)*100)
print()

dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=5) 
dt.fit(Xe_train, ye_train)

ye_pred = dt.predict(Xe_test)

print ("evaluation of decision tree after feature selection :")
print ("Accuracy:", accuracy_score(ye_test, ye_pred)*100)
print ("precision:", precision_score(ye_test, ye_pred, average="macro")*100)
print ("recall:", recall_score(ye_test, ye_pred, average="macro")*100)
print ("F-measure:", f1_score(ye_test, ye_pred, average="macro")*100)
print ("confusion:", confusion_matrix(ye_test, ye_pred)*100)

dot_data = tree.export_graphviz(dt, out_file=None, class_names=True)
graph = graphviz.Source(dot_data)
graph.render("Decision tree with fs")

evaluation of decision tree without feature selection :
Accuracy: 79.40199335548172
precision: 79.95304376991447
recall: 79.5973265576668
F-measure: 79.36640877441754
confusion: [[50300  8100]
 [16700 45300]]

evaluation of decision tree after feature selection :
Accuracy: 79.40199335548172
precision: 79.46254017694423
recall: 79.46807335395492
F-measure: 79.40193651818555
confusion: [[47700 10700]
 [14100 47900]]


'Decision tree with fs.pdf'