### Importing Iris Dataset 

In [4]:
from pandas import read_csv
#Column labels for the dataset, derived from https://archive.ics.uci.edu/ml/datasets/Iris
names = [
    'SEPAL_LENGTH',
    'SEPAL_WIDTH',
    'PETAL_LENGTH',
    'PETAL_WIDTH',
    'CLASS',
]
#Import the dataset with read_csv; header=None only spells out what read_csv already does
#whenever names is passed explicitly (every row of the file is treated as data)
dataset = read_csv('iris.csv', header=None, names=names)

In [6]:
#Preview the first five rows to check the data was imported correctly
dataset.head(n=5)

Unnamed: 0,SEPAL_LENGTH,SEPAL_WIDTH,PETAL_LENGTH,PETAL_WIDTH,CLASS
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
#Number of rows for each distinct value of the CLASS field
dataset.groupby(by='CLASS').size()

CLASS
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

### Split Dataset Into Training and Testing Partitions

In [9]:
from sklearn.model_selection import train_test_split

#Select the columns from the DataFrame *before* calling .values: calling .values on the whole
#mixed-type frame first upcasts every cell to object, so the measurements would be handed
#to scikit-learn as an object array rather than a numeric one
x_values = dataset.iloc[:, 0:4].values #grab all the data except the last column
y_values = dataset.iloc[:, 4].values #grab the last column of data (CLASS)

#Below split data via an 80/20 split (80% train data, 20% test data) - a commonly used ratio
#train_test_split shuffles the rows by default; random_state=1 seeds that shuffle so the
#same partition is produced on every run
#NOTE(review): the split is not stratified, so class counts can differ between the two
#partitions - consider stratify=y_values if that matters
train_x, test_x, train_y, test_y = train_test_split(x_values, y_values, test_size=0.2, random_state=1)

### Creating Models

In [13]:
#The dataset is labelled, so supervised learning techniques are required
#The following code sets up a range of algorithms so that the best possible choice
#can be determined according to each model's accuracy
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#model using Support Vector Classification algorithm
model_1 = SVC(gamma='auto')
#model using Logistic Regression Classifier algorithm (liblinear solver, one-vs-rest scheme)
#NOTE(review): multi_class is reportedly deprecated in newer scikit-learn releases - confirm against the installed version
model_2 = LogisticRegression(multi_class='ovr', solver='liblinear')
#model using Decision Tree Classifier algorithm
model_3 = DecisionTreeClassifier()

### Testing Models

In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
#Using cross validation (10-fold) technique to evaluate models (estimate their accuracy)
#Cross validation helps to reduce bias

#test_model_accuracy method uses cross validation methods from sklearn
#It works in the following way:
# 1. Shuffle data randomly
# 2. Splits data into 10 groups. 
# 3. For each group: 
#   3.1 Pick one group as test dataset, treat rest as training datasets
#   3.2 Apply model to training sets and evaluate on the test set
# 4. Return mean of evaluation scores
def test_model_accuracy(model):
    k_fold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    result = cross_val_score(model, train_x, train_y, cv=k_fold, scoring='accuracy')
    return result.mean()

#Report the cross-validated accuracy estimate of each candidate model, one per line
for label_prefix, candidate in (
    ("SVC Model: ", model_1),
    ("LR Model: ", model_2),
    ("DTC Model: ", model_3),
):
    print(label_prefix + str(test_model_accuracy(candidate)))
#Results may vary slightly between runs - but the SVC model proves to be the best with ~98% accuracy
#The LR and DTC models are very similar to each other with ~95% accuracy

SVC Model: 0.9833333333333332
LR Model: 0.9559090909090908
DTC Model: 0.951165501165501


### Making Predictions with the SVC Model

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

final_model = model_1 #the SVC model, which scored best under cross validation
final_model.fit(train_x, train_y) #Train on the training partition
predictions = final_model.predict(test_x) #Predicted classes for the test partition

#Compute both metrics first, then print them in the same order as before
test_accuracy = accuracy_score(y_true=test_y, y_pred=predictions)
test_confusion = confusion_matrix(y_true=test_y, y_pred=predictions)
print(test_accuracy) #Fraction of test predictions that were correct
print(test_confusion) #Confusion matrix of true classes against predicted classes
#In the confusion matrix, the counts should sit along the diagonal (top-left to bottom-right);
#any value elsewhere represents an incorrect classification

#Accuracy of ~97% on test data
#Model is not overfitted or underfitted (as high prediction accuracy on test data)

0.9666666666666667
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]


### Additional Prediction Analysis

In [20]:
from sklearn.metrics import classification_report

#Per-class precision, recall and F1-score for the test-set predictions
report_text = classification_report(y_true=test_y, y_pred=predictions)
print(report_text)
#Formulas below derived from https://tinyurl.com/ycp3vncm

#Precision = TruePositives/(TruePositives + FalsePositives)
#Setosa and Versicolor both have a precision of 1 (no false positives)
#Virginica has some false positives

#Recall = TruePositives/(TruePositives + FalseNegatives)
#Iris-Setosa: no false negatives or false positives - perfect prediction
#Iris-Versicolor: 1 false negative (also visible in the confusion matrix)
#Iris-Virginica: no false negatives

#F1-score = 2*(Recall * Precision)/(Recall + Precision)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      0.92      0.96        13
 Iris-virginica       0.86      1.00      0.92         6

      micro avg       0.97      0.97      0.97        30
      macro avg       0.95      0.97      0.96        30
   weighted avg       0.97      0.97      0.97        30

