In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
iris_df = pd.read_csv("iris.csv")

In [3]:
iris_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder
iris_df["Species"] = iris_df["Species"].map({"Iris-setosa":0, "Iris-versicolor":1, "Iris-virginica":2 }).astype("int64")


In [6]:
iris_df.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2
149,150,5.9,3.0,5.1,1.8,2


In [27]:
# Train Test Split
X = iris_df.drop(["Id", "Species"], axis=1)
y = iris_df["Species"]

X_train, X_test, y_train, y_test = train_test_split(
    X,y,test_size=0.4, random_state=42
)

In [8]:
y_train[y_train==0].count()
#y_train[y_train==1].count()
#y_train[y_train==2].count()


np.int64(21)

In [28]:
# Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
#print("Precision:", precision_score(y, y_pred, average='weighted'))
#print("Recall:", recall_score(y, y_pred, average='weighted'))

Accuracy: 1.0


In [20]:
# Use Naive Byes
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("precision Score: ", precision_score(y_test, y_pred))
print("recall Score: ", recall_score(y_test, y_pred))


Accuracy: 0.9933333333333333
Precision: 0.9934640522875816
Recall: 1.0


In [21]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

y_pred = neigh.predict(X)

print("Accuracy:", accuracy_score(y, y_pred))
print("Precision:", precision_score(y, y_pred, average='weighted'))
print("Recall:", recall_score(y, y_pred, average='weighted'))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [22]:
# Cross Validation for hyperparam tuning using GridSearchCV

from sklearn.model_selection import GridSearchCV

classifier = KNeighborsClassifier()
param_grid = {"n_neighbors":[3,5,7,9]}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
classifierCV = GridSearchCV(
    classifier,
    param_grid,
    cv=5
    
)

classifierCV.fit(X_train, y_train)
y_pred = classifierCV.predict(X)

print("Accuracy:", accuracy_score(y, y_pred))
print("Precision:", precision_score(y, y_pred, average='weighted'))
print("Recall:", recall_score(y, y_pred, average='weighted'))

#result
res = pd.DataFrame(classifierCV.cv_results_)
print(res[["param_n_neighbors", "mean_test_score"]])

print(classifierCV.best_params_)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
   param_n_neighbors  mean_test_score
0                  3         1.000000
1                  5         0.991667
2                  7         0.983333
3                  9         0.983333
{'n_neighbors': 3}


In [13]:
import pandas as pd
df = pd.read_csv("iris.csv")
df.head()

# convert 3 species into 0, 1, 2
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Species'] = le.fit_transform(df['Species'])

from sklearn.model_selection import train_test_split
X = df.drop('Species', axis=1)
y = df['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

models = {'Logistic Regression': lr, 'KNN': knn, 'Naive Bayes': nb}

for name, model in models.items():
    y_pred = model.predict(X)
    print(f"\n{name}")
    print("Accuracy:", accuracy_score(y, y_pred))
    
    print("cm:", confusion_matrix(y, y_pred))

    print("classification report:", classification_report(y, y_pred))



Logistic Regression
Accuracy: 1.0
cm: [[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]
classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150


KNN
Accuracy: 0.9933333333333333
cm: [[50  0  0]
 [ 0 49  1]
 [ 0  0 50]]
classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      0.98      0.99        50
           2       0.98      1.00      0.99        50

    accuracy                           0.99       150
   macro avg       0.99      0.99      0.99       150
weighted avg       0.99      0.99      0.99       150


Naive Bayes
Accuracy: 0.9933333333333333