In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from data_set import encode_target
from sklearn.feature_selection import *

# to download these libraries
# open the terminal and type : pip install +library name

# example : pip install pandas


names = ["length", "width", "size", "conc", "conc1",
         "asym", "m3long", "m3trans", "alpha", "dist", "class"]

features =  ["length", "width", "size", "conc", "conc1",
         "asym", "m3long", "m3trans", "alpha", "dist"]

data = pd.read_csv('all_data.txt', names=names)


data.head()


Unnamed: 0,length,width,size,conc,conc1,asym,m3long,m3trans,alpha,dist,class
0,27.4068,9.8829,2.3086,0.4177,0.2088,36.2978,18.8603,-6.7724,10.3529,192.238,g
1,31.034,12.874,2.6474,0.4257,0.2196,9.2704,24.718,-7.5406,6.308,160.135,g
2,61.5901,49.8072,3.6338,0.192,0.1148,-33.1866,-76.602,48.0003,84.7875,285.8583,h
3,20.6089,7.1467,2.1973,0.7365,0.3778,23.2209,11.468,-5.8253,36.0741,129.046,g
4,20.3722,12.903,2.2844,0.4519,0.2312,-16.5144,-5.7444,7.3167,50.6061,175.856,g


In [2]:
# feature selection using The Recursive Feature Elimination

# It uses the model accuracy to identify which attributes (and combination 
# of attributes) contribute the most to predicting the target attribute

# The classification algorithm below uses RFE with the logistic regression
# algorithm to select the top 3 features. The choice of algorithm does not 
# matter too much as long as it is skillful and consistent.

# separate classification from data
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
array = data.values
X = array[:,0:10]
Y = array[:,10]

# feature extraction
# could use any model of classification to calculate ranking
model = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=100) 
rfe = RFE(model, 4)
fit = rfe.fit(X, Y)
print("Num Features:", fit.n_features_)
print("Selected Features:", fit.support_)
print("Feature Ranking:", fit.ranking_)

# features selected : size, conc, conc1, alpha

Num Features: 4
Selected Features: [ True  True  True False False False False False  True False]
Feature Ranking: [1 1 1 7 3 6 4 5 1 2]


In [3]:
# feature selection using feature importance

# Bagged decision trees like Random Forest and Extra Trees
# can be used to estimate the importance of features.

from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=1000)
model.fit(X, Y)
print(model.feature_importances_)

# features selected : alpha, length, size, width

[0.1213271  0.10121334 0.10297018 0.06802143 0.06534078 0.06379981
 0.08178121 0.06078391 0.26596456 0.06879768]


In [4]:
# making another train-test data based on feature selections

from sklearn.model_selection import train_test_split

data_elim = data.drop(columns=["conc", "asym", "m3long", "m3trans", "dist", "conc1"])
array_e = data_elim.values
Xe = array_e[:,0:4]
Ye = array_e[:,4]

# splitting data into train-test 

Xe_train, Xe_test, ye_train, ye_test = train_test_split( Xe, Ye, test_size = 0.3, random_state = 100)

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

data_elim.head()

Unnamed: 0,length,width,size,alpha,class
0,27.4068,9.8829,2.3086,10.3529,g
1,31.034,12.874,2.6474,6.308,g
2,61.5901,49.8072,3.6338,84.7875,h
3,20.6089,7.1467,2.1973,36.0741,g
4,20.3722,12.903,2.2844,50.6061,g


In [5]:
# decision tree classification
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=5) 
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

dot_data = tree.export_graphviz(dt, out_file=None, class_names=True)
graph = graphviz.Source(dot_data)
graph.render("Decision tree")

print ("evaluation of decision tree without feature selection :")
print ("Accuracy:", accuracy_score(y_test, y_pred)*100)
print ("precision:", precision_score(y_test, y_pred, average="macro")*100)
print ("recall:", recall_score(y_test, y_pred, average="macro")*100)
print ("F-measure:", f1_score(y_test, y_pred, average="macro")*100)
print ("confusion:", confusion_matrix(y_test, y_pred)*100)
print()

dt = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=5) 
dt.fit(Xe_train, ye_train)

ye_pred = dt.predict(Xe_test)

print ("evaluation of decision tree after feature selection :")
print ("Accuracy:", accuracy_score(ye_test, ye_pred)*100)
print ("precision:", precision_score(ye_test, ye_pred, average="macro")*100)
print ("recall:", recall_score(ye_test, ye_pred, average="macro")*100)
print ("F-measure:", f1_score(ye_test, ye_pred, average="macro")*100)
print ("confusion:", confusion_matrix(ye_test, ye_pred)*100)

dot_data = tree.export_graphviz(dt, out_file=None, class_names=True)
graph = graphviz.Source(dot_data)
graph.render("Decision tree with fs")

evaluation of decision tree without feature selection :
Accuracy: 79.56810631229236
precision: 79.61647019757596
recall: 79.53800298062593
F-measure: 79.54552988364769
confusion: [[49900 11100]
 [13500 45900]]

evaluation of decision tree after feature selection :
Accuracy: 80.39867109634552
precision: 80.50077639751552
recall: 80.35767511177347
F-measure: 80.36480834984894
confusion: [[50900 10100]
 [13500 45900]]


'Decision tree with fs.pdf'

Note: you may need to restart the kernel to use updated packages.
