In [30]:
#import statements
import pandas as pd
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import sklearn.metrics as met
from sklearn.model_selection import train_test_split
import scipy.linalg as scipy

Before we begin, we insert a constant column of ones so that the bias term is absorbed into the weight vector.

In [31]:
# Read the iris table from disk into a data frame.
df = pd.read_csv('data/iris.csv')

# Column labels exactly as read from the CSV, captured before the bias column
# is added, e.g.
# Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'], dtype='object')
feature_names = df.columns

# Prepend a constant column of ones named 'bias' as the first column.
df.insert(loc=0, column='bias', value=1)

In this cell we group the entries of our dataset by label and, within each species, randomly split the data so that 40 of the samples go to the training set and the remaining 10 go to the test set.

In [32]:
# Fixed seed so the split -- and therefore every number reported further down --
# is reproducible under Restart Kernel -> Run All. Change it to draw another split.
SPLIT_SEED = 0
LABEL_COLUMN = 'species'

# One generator shared by all species, so each species gets a different
# (but still reproducible) shuffle.
split_rng = np.random.RandomState(SPLIT_SEED)

s1 = list()  # per-species training frames
s2 = list()  # per-species test frames

# Split each species on its own so that train and test keep the same class
# balance (80% / 20% of every species).
for species in df[LABEL_COLUMN].unique():
    train, test = train_test_split(
        df[df[LABEL_COLUMN] == species],
        test_size=0.2,
        random_state=split_rng,
    )
    s1.append(train)
    s2.append(test)

trainDF = pd.concat(s1)
testDF = pd.concat(s2)

# Every row must land in exactly one of the two sets.
assert len(trainDF) + len(testDF) == len(df)

# Every column except the label is treated as a feature.
# NOTE(review): assumes the CSV holds only the four measurements plus 'species'
# (so with 'bias' there are five feature columns) -- confirm against the data file.
feature_columns = [column for column in df.columns if column != LABEL_COLUMN]

# Take all rows rather than a hard-coded 0:120 / 0:30 slice, so nothing is
# silently dropped if the dataset size changes.
X_train = trainDF[feature_columns].to_numpy()
Y_train = trainDF[LABEL_COLUMN].to_numpy()

X_test = testDF[feature_columns].to_numpy()
Y_test = testDF[LABEL_COLUMN].to_numpy()

In this cell we make label vectors for our training data where the number 1 is assigned to the target label of our binary classifier and -1 is assigned to the other labels. We repeat the process for each label.

In [33]:

# One target vector per species (one-vs-rest): +1 where the training label is
# that species, -1 everywhere else. The list follows the order of
# df['species'].unique().
list_train = []

for species in df['species'].unique():
    is_target = (Y_train == species)
    label_train = np.where(is_target, 1, -1)
    list_train.append(label_train)
    


The functions below take an array of real-valued score vectors (one row per sample, one score per classifier) and map each row to the string label corresponding to its highest score. This makes comparison easier later.

In [34]:
# Define a mapping function to map class indices to species names
def map_to_string(number):
    """Return the species name for a class index.

    Parameters
    ----------
    number : int
        Index of the winning classifier (0, 1 or 2).

    Returns
    -------
    str
        The matching species name, or 'Unknown' for any other index.
    """
    mapping = {
        0: 'Iris-setosa',
        1: 'Iris-versicolor',
        2: 'Iris-virginica',
    }
    return mapping.get(number, 'Unknown')

def get_classification(data, weights):
    """Label each row of ``data`` with the species whose classifier scores highest.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Samples to classify, one per row.
    weights : sequence of 1-D arrays, each of length n_features
        One weight vector per classifier. Position ``i`` in the sequence is
        translated to a name with ``map_to_string(i)``, so the order of the
        weight vectors must match that mapping.

    Returns
    -------
    numpy.ndarray of str, shape (n_samples,)
        Predicted species name per row; empty if ``data`` has no rows.
    """
    # Column i holds the score of classifier i for every sample.
    scores = data @ (np.vstack(weights).T)
    guesses = np.argmax(scores, axis=1)
    # otypes is passed explicitly: without it np.vectorize has to call the
    # function on a first element to infer the output type, and raises
    # ValueError when there are no samples at all.
    string_vector = np.vectorize(map_to_string, otypes=[str])(guesses)
    return string_vector
    

In this cell we obtain the QR factorization of our training data and use it to solve for our ideal weights to minimize loss for each one of our binary classifiers. We then pass the weights for our 3 classifiers to the helper function to get our classifications. 

We then get the confusion matrix and observe results.

In the run shown below, it classified all setosa flowers correctly but made mistakes with versicolor and virginica: 29 of 40 versicolor and 35 of 40 virginica were classified correctly, with 11 versicolor labelled as virginica and 5 virginica labelled as versicolor.

Error rate = 13.3%

This means our model has performed well on the training data.


In [41]:
# QR factorisation of the training matrix: X_train = Q @ R
Q, R = np.linalg.qr(X_train)

# For each one-vs-rest target vector, obtain the weights by solving the
# triangular system R w = Q^T y. (`scipy` is this notebook's alias for
# scipy.linalg.)
weight_list = []
for label in list_train:
    projected_target = Q.T @ label
    weight_list.append(scipy.solve_triangular(R, projected_target))

# Classify the same samples the weights were fitted on and tabulate the result.
training_classifications = get_classification(X_train, weight_list)
df_confusion = pd.crosstab(
    Y_train,
    training_classifications,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion)

# Fraction of training samples whose predicted label differs from the true one.
train_misses = np.sum(Y_train != training_classifications)
print("Error rate is ", train_misses / Y_train.size)
print(R.shape)


Predicted        Iris-setosa  Iris-versicolor  Iris-virginica  All
Actual                                                            
Iris-setosa               40                0               0   40
Iris-versicolor            0               29              11   40
Iris-virginica             0                5              35   40
All                       40               34              46  120
Error rate is  0.13333333333333333
(5, 5)


Again the classifier predicted all setosa correctly. This could mean that setosa flowers are more distinctive in terms of these features than the other flowers. The error rate was 5/30, which is about 16.7%. This is close to our training error rate and is still very good. This indicates that our model is not overfitting and is generalizing to unseen data.

In [29]:

# Apply the weights fitted on the training split to the held-out samples.
tst_classifications = get_classification(X_test, weight_list)

# Actual-vs-predicted counts, with row/column totals.
df_confusion_test = pd.crosstab(
    Y_test,
    tst_classifications,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion_test)
print()

# Fraction of test samples whose predicted label differs from the true one.
test_misses = np.sum(Y_test != tst_classifications)
print("Error rate is ", test_misses / Y_test.size)

Predicted        Iris-setosa  Iris-versicolor  Iris-virginica  All
Actual                                                            
Iris-setosa               10                0               0   10
Iris-versicolor            0                8               2   10
Iris-virginica             0                3               7   10
All                       10               11               9   30

Error rate is  0.16666666666666666
