In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import warnings

In [2]:
# Column labels applied to the CSV when it is loaded: four measurements plus the class label.
col_names = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
    'Species',
]

In [3]:
# Read the raw Iris file. Every line is treated as data (no header row is
# consumed); the column labels come from `col_names`.
data = pd.read_csv(
    './Iris.csv',
    header=None,
    names=col_names,
)
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3,5.2,2.3,Iris-virginica
146,6.3,,5,1.9,Iris-virginica
147,6.5,3,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
# Distinct class labels present in the target column.
data['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [5]:
# (rows, columns) of the loaded frame.
data.shape

(150, 5)

# Removing/Replacing NA and missing values 

In [6]:
# Number of NaN cells in each column.
data.isna().sum()

SepalLengthCm    3
SepalWidthCm     3
PetalLengthCm    5
PetalWidthCm     0
Species          0
dtype: int64

In [7]:
# Count the '?' placeholders that mark missing readings in each column.
# They are plain strings, so the isna() check above does not see them.
is_placeholder = data == '?'
is_placeholder.sum()

SepalLengthCm    7
SepalWidthCm     3
PetalLengthCm    4
PetalWidthCm     0
Species          0
dtype: int64

In [8]:
# Turn every '?' placeholder into a real NaN so pandas treats it as missing.
data = data.replace(to_replace='?', value=np.nan)

In [9]:
# Total number of missing cells across all columns (NaN plus the former '?' placeholders).
data.isna().sum().sum()

25

In [10]:
# Show the distinct values of every column, with blank lines between columns.
for name in data:
    distinct = data[name].unique()
    print(name, distinct)
    print('\n')

SepalLengthCm ['5.1' '4.9' '4.7' '4.6' '5' '5.4' '4.4' '4.8' '4.3' '5.8' '5.7' nan '5.2'
 '5.5' '4.5' '5.3' '7' '6.4' '6.9' '6.5' '6.3' '6.6' '5.9' '6' '5.6' '6.7'
 '6.2' '6.1' '6.8' '7.1' '7.6' '7.3' '7.2' '7.7' '7.4' '7.9']


SepalWidthCm ['3.5' '3' '3.2' '3.1' '3.6' '3.9' '3.4' '2.9' '3.7' '4' '4.4' '3.8' '3.3'
 nan '4.1' '4.2' '2.3' '2.8' '2.4' '2.7' '2' '2.2' '2.5' '2.6']


PetalLengthCm ['1.4' '1.3' '1.5' '1.7' nan '1.6' '1.9' '4.7' '4.5' '4' '4.6' '3.3' '3.9'
 '3.5' '4.2' '3.6' '4.4' '4.8' '4.9' '4.3' '5' '3.8' '3.7' '5.1' '4.1' '3'
 '6' '5.9' '5.6' '5.8' '6.6' '6.3' '6.1' '5.3' '5.5' '6.7' '6.9' '5.7'
 '6.4' '5.4' '5.2']


PetalWidthCm [0.2 0.4 0.3 0.1 0.5 0.6 1.4 1.5 1.3 1.6 1.  1.1 1.8 1.2 1.7 2.5 1.9 2.1
 2.2 2.  2.4 2.3]


Species ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']




In [11]:
cols_to_fill = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']

for col in cols_to_fill:
    # These columns were read as strings (because of the '?' placeholders).
    # Cast them to floats; errors='coerce' turns any leftover non-numeric
    # token into NaN instead of raising, so it is imputed below as well.
    data[col] = pd.to_numeric(data[col], errors='coerce')
    # Impute the missing readings with the column mean.
    # NOTE: the mean is taken over the full dataset, before the train/test split.
    data[col] = data[col].fillna(data[col].mean())

# Sanity check: every count should now be 0.
data.isna().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [12]:
# Re-inspect the distinct values per column now that the columns are numeric
# and the gaps have been filled.
for name in data:
    distinct = data[name].unique()
    print(name, distinct)
    print('\n')

SepalLengthCm [5.1        4.9        4.7        4.6        5.         5.4
 4.4        4.8        4.3        5.8        5.7        5.85928571
 5.2        5.5        4.5        5.3        7.         6.4
 6.9        6.5        6.3        6.6        5.9        6.
 5.6        6.7        6.2        6.1        6.8        7.1
 7.6        7.3        7.2        7.7        7.4        7.9       ]


SepalWidthCm [3.5        3.         3.2        3.1        3.6        3.9
 3.4        2.9        3.7        4.         4.4        3.8
 3.3        3.05694444 4.1        4.2        2.3        2.8
 2.4        2.7        2.         2.2        2.5        2.6       ]


PetalLengthCm [1.4        1.3        1.5        1.7        3.75602837 1.6
 1.9        4.7        4.5        4.         4.6        3.3
 3.9        3.5        4.2        3.6        4.4        4.8
 4.9        4.3        5.         3.8        3.7        5.1
 4.1        3.         6.         5.9        5.6        5.8
 6.6        6.3        6.1       

# Data Transformation

In [13]:
# LabelEncoder maps each categorical class label to an integer code.
# Fit it on the species column and show the classes it learned.
le = LabelEncoder().fit(data['Species'])
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [14]:
# Overwrite the string labels with their integer codes.
# NOTE: not re-runnable as-is — `le` was fitted on the string labels, so
# transforming the already-encoded column a second time would presumably fail.
data['Species'] = le.transform(data['Species'])
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.500000,1.400000,0.2,0
1,4.9,3.000000,1.400000,0.2,0
2,4.7,3.200000,1.300000,0.2,0
3,4.6,3.100000,1.500000,0.2,0
4,5.0,3.600000,1.400000,0.2,0
...,...,...,...,...,...
145,6.7,3.000000,5.200000,2.3,2
146,6.3,3.056944,5.000000,1.9,2
147,6.5,3.000000,5.200000,2.0,2
148,6.2,3.400000,5.400000,2.3,2


In [15]:
# Alternative to LabelEncoder: map the target labels to integers by hand
# (not needed once the LabelEncoder cell above has run).
# data['Species'] = data['Species'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

# Error Correction (Outlier detection and removal)

In [16]:
# Print the dtype and non-null count of every column before outlier detection.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [17]:
def zscore_remove_outliers(data, threshold=3):
    """Drop rows that contain a Z-score outlier in any numeric column.

    The Z-score (standard score) quantifies how many standard deviations a
    data point lies from the mean of its column: ``z = (x - mean) / std``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. Only numeric columns are scored; any other columns
        are carried through untouched.
    threshold : float, default 3
        A value counts as an outlier when its absolute Z-score is strictly
        greater than this.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` in which no scored value is an outlier, with all
        original columns and the original index labels.
    """
    # Score numeric columns only; mean()/std() are undefined for text columns.
    numeric = data.select_dtypes(include='number')
    # Column-wise |z| for every data point (std() is pandas' default sample std).
    z_score = np.abs((numeric - numeric.mean()) / numeric.std())
    # Boolean mask: True marks a value whose |z| exceeds the threshold.
    out = z_score > threshold
    # Keep the rows that have no outlier in any column. `~` is the boolean NOT
    # for a mask; unary `-` is arithmetic negation and NumPy rejects it for
    # boolean arrays.
    return data[~out.any(axis=1)]

# Remove every row whose absolute Z-score exceeds 3 in any column.
filtered_data = zscore_remove_outliers(data, threshold=3)

In [18]:
# (rows, columns) remaining after outlier removal.
filtered_data.shape

(149, 5)

In [19]:
# Continue with the outlier-free frame. This rebinds `data`, so the unfiltered
# frame is no longer reachable by name after this cell.
data = filtered_data

# Build models using Logistic Regression and Naïve Bayes, and compare their accuracy on Iris species prediction

In [20]:
# Features: the four flower measurements. Target: the encoded species.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
x = data[feature_cols]
y = data['Species']

# An integer test_size is an absolute count: 30 rows are held out for testing.
# random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=30,
    random_state=28,
)

In [21]:
#StandardScaler is a preprocessing step used in machine learning to standardize features by removing the mean and scaling to unit variance. 

#scaler = StandardScaler()
#x_train = scaler.fit_transform(x_train)
#x_test = scaler.transform(x_test)  # reuse the statistics learned from x_train; re-fitting on the test set would leak test information

In [22]:
# Logistic regression on the raw (unscaled) features. max_iter is set to 10000
# explicitly — presumably to give the solver enough iterations to converge.
logreg_model = LogisticRegression(max_iter=10000)
logreg_model.fit(x_train,y_train)

In [23]:
# Predict the class of every held-out test row with the logistic regression model.
y_pred_lr = logreg_model.predict(x_test)

In [24]:
# Accuracy of the logistic regression predictions against the true test labels.
logreg_model_accuracy = accuracy_score(y_test,y_pred_lr)
logreg_model_accuracy

0.9666666666666667

In [25]:
# Gaussian Naive Bayes trained on the same training split as the logistic regression.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(x_train,y_train)

In [26]:
# Predict the class of every held-out test row with the Naive Bayes model.
y_pred_nb = naive_bayes_model.predict(x_test)

In [27]:
# Accuracy of the Naive Bayes predictions against the true test labels.
naive_bayes_model_accuracy = accuracy_score(y_test,y_pred_nb)
naive_bayes_model_accuracy

0.9333333333333333

# Comparison

In [28]:
# Report both test accuracies, one line per model.
for template, score in (
    ('Logistic Regression Accuracy : {}', logreg_model_accuracy),
    ('Gaussian Naive Bayes Accuracy : {}', naive_bayes_model_accuracy),
):
    print(template.format(score))

Logistic Regression Accuracy : 0.9666666666666667
Gaussian Naive Bayes Accuracy : 0.9333333333333333
