In [1]:
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",delimiter=",")

In [5]:
df.columns=["sepal_len","sepal_width","petal_len","petal_width","species"]

In [6]:
df

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa
7,4.4,2.9,1.4,0.2,Iris-setosa
8,4.9,3.1,1.5,0.1,Iris-setosa
9,5.4,3.7,1.5,0.2,Iris-setosa


# DECISION TREE CLASSIFIER

# Decision Tree Using Random Subsampling

In [7]:
data=df.values #convert to 2-D matrix

In [8]:
X=data[:,0:-1]
Y=data[:,-1]

In [9]:
val_size=0.25
seed=3

In [10]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,Y,test_size=val_size,random_state=seed)

In [11]:
dtree = DecisionTreeClassifier()

In [12]:
dtree.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
predictions = dtree.predict(x_test)

In [14]:
print("Accuracy using Random Subsampling : ", accuracy_score(y_test,predictions))

Accuracy using Random Subsampling :  0.947368421053


In [15]:
print("Confusion Matrix  : \n\n", confusion_matrix(y_test,predictions))

Confusion Matrix  : 

 [[15  0  0]
 [ 0 13  1]
 [ 0  1  8]]


In [16]:
print("Classification Report : \n\n", classification_report(y_test,predictions))

Classification Report : 

                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       0.93      0.93      0.93        14
 Iris-virginica       0.89      0.89      0.89         9

    avg / total       0.95      0.95      0.95        38



# Decision Tree Using Cross Validation

In [17]:
eval = 'accuracy'
seed = 4
model=DecisionTreeClassifier()

In [18]:
kfold = model_selection.KFold(n_splits=5, shuffle= True, random_state= seed)

In [19]:
results = model_selection.cross_val_score(model, X, Y, cv= kfold, scoring= eval)

In [20]:
print("Accuracy using Cross Validation : ",results.mean())

Accuracy using Cross Validation :  0.953333333333


# K-NEIGHBORS CLASSIFIER

# Computing K-Neighbours Classifier Using Random Subsampling

In [22]:
classifier = KNeighborsClassifier()

In [23]:
classifier.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [24]:
predictions = classifier.predict(x_test)

In [25]:
print("Accuracy using K-Neighbours classifier : ",accuracy_score(y_test,predictions))

Accuracy using K-Neighbours classifier :  0.973684210526


In [26]:
print("Confusion Matrix : \n\n",confusion_matrix(y_test,predictions))

Confusion Matrix : 

 [[15  0  0]
 [ 0 14  0]
 [ 0  1  8]]


In [27]:
print("Classification Report : \n\n",classification_report(y_test, predictions))

Classification Report : 

                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        15
Iris-versicolor       0.93      1.00      0.97        14
 Iris-virginica       1.00      0.89      0.94         9

    avg / total       0.98      0.97      0.97        38



# Computing K-Neighbours Classifier Using Cross Validation

In [28]:
eval = 'accuracy'
seed = 4
model= KNeighborsClassifier()

In [29]:
kfold = model_selection.KFold(n_splits=5, shuffle= True, random_state= seed)

In [30]:
results = model_selection.cross_val_score(model, X, Y, cv= kfold, scoring= eval)

In [31]:
print("Accuracy using Cross Validation : ",results.mean())

Accuracy using Cross Validation :  0.959770114943


# COMPARISON OF CLASSIFIERS 

In [33]:
seed = 4
models = []

models.append(('KNN',KNeighborsClassifier()))
models.append(('CART',DecisionTreeClassifier()))

scoring='accuracy'

results = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=5, shuffle= True, random_state= seed)
    algo_result = model_selection.cross_val_score(model, X, Y, cv= kfold, scoring= eval)
    results.append(algo_result)
    str = "%s : %f" % (name, algo_result.mean())
    print(str)

KNN : 0.959770
CART : 0.946667
