# Data Preparation

In [None]:
# Load/parse the CSV and add column names, since the file has none

import pandas as pd
import numpy as np

# Column names for the iris data; the file itself carries no header row.
# 'labels' is the type of flower.
column_names = ['sepal length', 'sepal width', 'petal length',
                'petal width', 'labels']

# header=None tells pandas that the first row is data, not column names
# (otherwise it assumes the first row is a header).
df = pd.read_csv('iris.csv', header=None)
df.columns = column_names

# df.info() # displays general information about data frame
df.head() # preview the first rows of the data frame as a table

Unnamed: 0,sepal length,sepal width,petal length,petal width,labels
0,5.7,2.9,4.2,1.3,2
1,4.8,3.0,1.4,0.3,1
2,5.0,3.2,1.2,0.2,1
3,6.5,3.0,5.8,2.2,3
4,7.2,3.6,6.1,2.5,3


In [None]:
# Separate the target column from the attribute columns and convert both to
# numpy arrays: "labels" holds what we want to predict, "data" holds the
# remaining data attributes.

# attribute columns only -- the 'labels' column is dropped because it is
# what we want to predict, so it must not be part of the inputs
feature_frame = df.drop(columns='labels')
data = feature_frame.to_numpy()

# the labels column as a numpy array
labels = df['labels'].to_numpy()

# sanity checks: dimensionality of data (rows x cols), number of labels,
# and the distinct label values
print('shape of data', data.shape)
print('length of labels', len(labels))
print('unique labels', np.unique(labels))

shape of data (150, 4)
length of labels 150
unique labels [1 2 3]


In [None]:
# Prepare data for classification

# Split the data into a train set and a test set.
# The model is first trained with the train data and labels, and then
# evaluated on the held-out test data; the goal is to see how well it
# predicts the test labels.

# TEST_SIZE is the fraction of rows held out for testing:
# 0.25 gives a 75-25 train-test split (adjust it here to change the split).
# RANDOM_STATE gives the same seed for the random split, so the split is
# reproducible from run to run.
# see the train_test_split documentation for more details

from sklearn.model_selection import train_test_split

TEST_SIZE = 0.25
RANDOM_STATE = 42

# X_train: the train data points
# X_test: the test data points
# y_train: the labels for the train points
# y_test: the labels for the test points

# Thomas Trinh

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=TEST_SIZE,
                                                    random_state=RANDOM_STATE)

In [None]:
# Questions
# What train-test split percentage should we use? The example uses 80-20, indicated by test_size=0.2

# Begin Classification

In [None]:
# Thomas Trinh
# kNN classification of the data with scikit-learn, followed by the
# confusion matrix of the predictions
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Build a kNN classifier with k = 3 (can adjust to different values) and fit
# it on the training points (X_train) and their labels (y_train).
# fit() returns the fitted estimator, so construction and fitting are chained.
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted labels for the unseen test set data points (X_test)
y_pred = model.predict(X_test)

# Thomas Trinh
# Confusion matrix: rows are the true test labels, columns the predicted labels
confusion_matrix(y_test, y_pred)

array([[14,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 14]])

In [None]:
# Overall accuracy (fraction of test labels predicted correctly) via the
# built-in scikit-learn metric, for better readability than the raw matrix
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# to see the actual data that is being predicted:

# print the test labels (the labels you are trying to predict)
# print(y_test)

# see what it predicted
# print(y_pred)

In [None]:
# Thomas Trinh
# # 2. Using my original code, add code in the appropriate place in the notebook to display the confusion matrix.
# Screenshot must show only the code and output related to this task, not the entire code (with your name in the code comments).
# See the GUIDE link for help on this, or https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# They have an example there too.
# Also, write 1-2 sentences in the doc under the screenshot about what you observe from the confusion matrix for each class.