# <center>Classification of iris flowers</center>

### <div align='right'>Filip Kowalski</right>

In [15]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import matplotlib
%matplotlib notebook

# Load Data

In [16]:
filename='iris.data.csv'
names=['sepal-length','sepal-width','petal-length','petal-width','class']
dataset=read_csv(filename,names=names)

# Looking on data 

In [17]:
dataset.head(10)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [18]:
print(dataset.shape)

(150, 5)


# Descriptions of data

In [19]:
print(dataset.describe())

       sepal-length  sepal-width  petal-length  petal-width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [20]:
print(dataset.groupby('class').size())

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [21]:
dataset.plot(kind='box',subplots=True,layout=(2,2),sharex=False,sharey=False)
plt.show()

<IPython.core.display.Javascript object>

In [9]:
dataset.hist()
plt.show()

<IPython.core.display.Javascript object>

In [22]:
scatter_matrix(dataset)
plt.show()

<IPython.core.display.Javascript object>

# Split-out validation dataset 

In [23]:
array=dataset.values

In [24]:
X=array[:,0:4]
Y=array[:,4]
validation_size=0.20
seed=4
X_train,X_validation,Y_train,Y_validation = train_test_split(X,Y,test_size=validation_size,random_state=seed)

# Spot-check algorithms 

In [25]:
models=[]
models.append(('LR',LogisticRegression()))
models.append(('LDA',LinearDiscriminantAnalysis()))
models.append(('KNC',KNeighborsClassifier()))
models.append(('DTCl',DecisionTreeClassifier()))
models.append(('GNB',GaussianNB()))
models.append(('SVC',SVC()))

# Evaluate models 

In [26]:
results=[]
names=[]
for name,model in models:
    kfold=KFold(n_splits=10,random_state=seed)
    cv_results=cross_val_score(model,X_train,Y_train,cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg="%s: %f (%f)" % (name,cv_results.mean(),cv_results.std())
    print(msg)

LR: 0.966667 (0.040825)
LDA: 0.991667 (0.025000)
KNC: 0.975000 (0.038188)
DTCl: 0.950000 (0.055277)
GNB: 0.950000 (0.055277)
SVC: 0.983333 (0.033333)


# Compare Algorithms

In [27]:
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax=fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

<IPython.core.display.Javascript object>

# Make Predictions

In [28]:
svc=SVC()
svc.fit(X_train,Y_train)
predictions=svc.predict(X_validation)
print(accuracy_score(Y_validation,predictions))
print(confusion_matrix(Y_validation,predictions))
print(classification_report(Y_validation,predictions))



0.966666666667
[[16  0  0]
 [ 0  4  1]
 [ 0  0  9]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        16
Iris-versicolor       1.00      0.80      0.89         5
 Iris-virginica       0.90      1.00      0.95         9

    avg / total       0.97      0.97      0.97        30



# Conclusions

We get realy great classification prediction. But the data set was relativity small it's possible, that another model could be better.