# Iris Species Classification


*   Dataset Used - https://www.kaggle.com/datasets/arshid/iris-flower-dataset
*   Python Modules Used - numpy, pandas, matplotlib
*   Machine Learning Algorithm Used - Logistic Regression (one-vs-rest, implemented from scratch with NumPy)
*   Features of Species (columns of the feature matrix X) -
    * Sepal Length (X[:, 0])
    * Sepal Width (X[:, 1])
    * Petal Length (X[:, 2])
    * Petal Width (X[:, 3])

*   Species (Target Variable Y) -
    * Iris setosa (Y = 0)
    * Iris versicolor (Y = 1)
    * Iris virginica (Y = 2)

*   Train Test Split - 80/20
*   Model Training
*   Model Testing
*   Predict the species for user-supplied feature values


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the Iris dataset from 'IRIS.csv' (path is relative to the working directory).
flowers = pd.read_csv('IRIS.csv')
# Last expression of the cell: displays the 'species' label column.
flowers['species']

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: species, Length: 150, dtype: object

In [3]:
# Feature matrix: every column except the 'species' label, one row per sample.
# to_numpy() is used instead of .values because it always yields a NumPy
# ndarray, whereas .values may return a pandas ExtensionArray for some dtypes.
X = flowers.drop(columns=['species']).to_numpy()
# Target vector: the species name of each sample.
y = flowers['species'].to_numpy()

In [4]:
def train_test_split(X, y, test_split=0.2, random_state=None):
  """Randomly split samples and labels into train and test subsets.

  Args:
    X: Array-like indexed by sample along its first axis.
    y: Array-like of labels with one entry per row of ``X``.
    test_split: Fraction of the samples (between 0 and 1) placed in the test
      set. The test size is ``int(n * test_split)``.
    random_state: Optional integer seed. When given, the same split is
      produced on every call. When ``None`` (the default) the indices are
      drawn from NumPy's global random state.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays. Test rows
    are in the order they were drawn; train rows keep their original order.

  Raises:
    ValueError: If ``test_split`` is outside ``[0, 1]`` or ``X`` and ``y``
      do not contain the same number of samples.
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n = X.shape[0]
  if y.shape[0] != n:
    raise ValueError(
        f"X and y must have the same number of samples, got {n} and {y.shape[0]}")
  if not 0 <= test_split <= 1:
    raise ValueError(f"test_split must be between 0 and 1, got {test_split}")

  n_test = int(n * test_split)
  # A private RandomState makes the split reproducible without touching the
  # global random state; without a seed the global state is used.
  rng = np.random if random_state is None else np.random.RandomState(random_state)
  test_indices = rng.choice(n, n_test, replace=False)
  train_indices = np.delete(np.arange(n), test_indices)

  # Integer-array indexing already returns fresh arrays, so no extra copy is needed.
  return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [5]:
# Split with the default test_split=0.2. The test indices are drawn at random
# and no seed is set in the cells shown here, so the split (and any accuracy
# computed from it) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
class LogisticRegression:
  """One-vs-rest logistic regression trained with batch gradient descent.

  One independent sigmoid unit is fitted per class against a one-hot encoded
  target; ``predict`` returns the index of the unit with the highest score.

  Attributes:
    learning_rate: Step size of each gradient-descent update.
    num_iterations: Number of full-batch updates performed by ``fit``.
    weights: Array of shape (features, classes), or None before ``fit``.
    bias: Array of shape (1, classes), or None before ``fit``.
    classes_: Sorted unique labels seen by ``fit``, or None before ``fit``.
      ``classes_[k]`` is the label for index ``k`` returned by ``predict``.
  """

  def __init__(self, learning_rate=0.0001, num_iterations=100):
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.weights = None
    self.bias = None
    self.classes_ = None

  def sigmoid(self, z):
    """Return the element-wise logistic function ``1 / (1 + exp(-z))``."""
    # Clipping keeps exp() inside the float64 range, so large-magnitude inputs
    # saturate towards 0 or 1 without raising overflow warnings.
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

  def grad_dw(self, x, y, w, b, alpha, N):
    """Return the loss gradient with respect to the weights.

    Args:
      x: Feature matrix of shape (samples, features).
      y: One-hot targets of shape (samples, classes).
      w: Weights of shape (features, classes).
      b: Bias, broadcastable against ``x @ w``.
      alpha: Unused; kept so existing callers keep working.
      N: Number of samples used to average the gradient.
    """
    dw = (1 / N) * np.dot(x.T, (self.sigmoid(np.dot(x, w) + b) - y))
    return dw

  def grad_b(self, x, y, w, b, N):
    """Return the loss gradient with respect to the bias (one value per class)."""
    db = (1 / N) * np.sum(self.sigmoid(np.dot(x, w) + b) - y, axis=0)
    return db

  def hot_ones_encoding(self, y):
    """One-hot encode labels.

    Column ``k`` of the result is 1 where ``y`` equals the k-th label in
    sorted order (as returned by ``np.unique``) and 0 elsewhere.
    """
    classes = np.unique(y)
    one_hot = np.zeros((y.shape[0], len(classes)))
    for idx, label in enumerate(classes):
      one_hot[y == label, idx] = 1
    return one_hot

  def fit(self, X, y):
    """Fit weights and bias to ``X`` and ``y`` with full-batch gradient descent.

    Args:
      X: Numeric feature matrix of shape (samples, features).
      y: 1-D array of class labels, one per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    samples, features = X.shape
    self.classes_ = np.unique(y)
    num_classes = self.classes_.shape[0]
    self.weights = np.zeros((features, num_classes))
    self.bias = np.zeros((1, num_classes))
    one_hot_y = self.hot_ones_encoding(y)
    for _ in range(self.num_iterations):
      # Run the forward pass once per iteration and reuse the residual for
      # both gradients instead of recomputing it for each of them.
      error = self.sigmoid(np.dot(X, self.weights) + self.bias) - one_hot_y
      dw = (1 / samples) * np.dot(X.T, error)
      db = (1 / samples) * np.sum(error, axis=0)

      self.weights -= self.learning_rate * dw
      self.bias -= self.learning_rate * db

  def predict(self, X):
    """Return, for each row of ``X``, the index of the highest-scoring class.

    The indices follow the sorted label order stored in ``classes_``.
    """
    z = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
    return np.argmax(self.sigmoid(z), axis=1)


In [20]:
# Build the classifier with an explicit step size and iteration count
# (both override the constructor defaults).
LR_model = LogisticRegression(learning_rate=0.1, num_iterations=10000)

In [21]:
# Class index -> species name. The model numbers classes in the sorted order
# produced by np.unique, which is the alphabetical order listed here
# (assumes the dataset contains exactly these three label strings).
mapping = dict(enumerate(("Iris-setosa", "Iris-versicolor", "Iris-virginica")))

# Train the model on the training split.
LR_model.fit(X_train, y_train)


In [22]:
# Accuracy on the training split. predict() returns class indices, so each
# one is translated to a species name through `mapping` before it is compared
# with the true label.
y_pred = LR_model.predict(X_train)
count = sum(mapping[idx] == label for idx, label in zip(y_pred, y_train))
print("Train Accuracy = ", count * 100/ len(y_train))

# Accuracy on the held-out test split, computed the same way.
y_pred = LR_model.predict(X_test)
count = sum(mapping[idx] == label for idx, label in zip(y_pred, y_test))
print("Test Accuracy = ", count * 100/ len(y_test))

Train Accuracy =  99.16666666666667
Test Accuracy =  93.33333333333333
