# **Introduction**

 📆 Today I'm going to work on one of the most famous datasets, the **Iris dataset** 😀. We'll first take a look at the dataset, then do some exploratory data analysis (EDA), and based on the patterns we find we'll do data preprocessing. Finally, we'll build several models and test them. 🚀


 **About Dataset**


 It's a pretty simple dataset with **5 columns 💎** and **150 rows 🔺**:



1.   sepal length (cm) ⚡
2.   sepal width (cm) ⚡
3.   petal length (cm) ⚡
4.   petal width (cm) ⚡
5.   target ⚡








In [None]:
# Importing Libraries
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the built-in Iris dataset and wrap it in a pandas DataFrame:
# one column per feature plus a 'target' column holding the class labels.
iris = load_iris()

df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

df.head()

In [None]:
# Summary statistics for every numeric column (feature columns and 'target')
df.describe()

In [None]:
# Column names, dtypes and non-null counts of the DataFrame
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Pairwise correlation between all columns.
# Note: 'target' is included here even though it is an integer-encoded class label,
# so its correlations are only a rough indication of how well a feature separates classes.
df.corr()

In [None]:
# Number of distinct values in each column
df.nunique()

# **Exploratory Data Analysis (EDA)**

In [None]:
# Sepal length vs. sepal width, coloured by class
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='target',
)

In [None]:
# Petal length vs. petal width, coloured by class
sns.scatterplot(
    data=df,
    x='petal length (cm)',
    y='petal width (cm)',
    hue='target',
)

In [None]:
# Pairwise relationships between all columns, coloured by class
sns.pairplot(data=df, hue='target')

In [None]:
# Number of samples per class
# NOTE(review): newer seaborn releases deprecate `palette` without `hue` -- confirm
# against the installed version (the warning is hidden by warnings.filterwarnings).
sns.countplot(data=df, x='target', palette='Set2')

In [None]:
# Distribution of sepal length within each class
sns.boxplot(data=df, x='target', y='sepal length (cm)', palette='Set1')

In [None]:
# Distribution of sepal width within each class
sns.boxplot(data=df, x='target', y='sepal width (cm)', palette='Set1')

In [None]:
# Distribution of petal length within each class
sns.boxplot(data=df, x='target', y='petal length (cm)', palette='Set2')

In [None]:
# Distribution of petal width within each class
sns.boxplot(data=df, x='target', y='petal width (cm)', palette='Set2')

In [None]:
# Annotated heatmap of the correlation matrix
sns.heatmap(
    df.corr(),
    cmap='coolwarm',
    annot=True,
)

# **Data Preprocessing**

In [None]:
# Separate the label column (y) from the feature columns (X)
y = df['target']
X = df.drop(columns='target')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features.
# The scaler is fitted on the training split only, then applied to both splits,
# so no statistics from the test set leak into preprocessing.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames again to keep the feature names.
# These frames get a fresh 0..n-1 index, so they line up with y_train / y_test
# by position only, not by index label.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
# Preview the scaled training features (first rows only instead of the whole frame)
X_train_scaled_df.head()

# **Model Training**

In [None]:
# Tune Logistic Regression with an exhaustive grid search (5-fold CV, accuracy).
# - random_state: 'liblinear' and 'saga' shuffle the data internally, so a fixed
#   seed is needed for the search to give the same result on every run.
# - max_iter: the default of 100 can be too low for 'saga' to converge, and the
#   resulting ConvergenceWarning is hidden by warnings.filterwarnings('ignore').
# NOTE(review): recent scikit-learn releases deprecate 'liblinear' for multiclass
# targets -- confirm against the installed version.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
param_grid_lr = {
    # Inverse regularisation strength on a log scale (1 is the library default,
    # which the original grid skipped)
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    # Both solvers below support the l1 as well as the l2 penalty
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_scaled_df, y_train)

print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best cross-validation accuracy:", grid_search_lr.best_score_)

In [None]:
# Tune a Decision Tree with an exhaustive grid search (5-fold CV, accuracy).
# random_state is fixed because the tree samples features at random whenever
# max_features is smaller than the number of features.
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    # 'auto' is no longer a valid value in current scikit-learn (for classifiers it
    # was an alias of 'sqrt'); keeping it makes those candidates fail to fit and
    # score NaN, with the warning hidden by warnings.filterwarnings('ignore').
    'max_features': ['sqrt', 'log2', None]
}

grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train_scaled_df, y_train)

print('Best parameters for Decision Tree:', grid_search_dt.best_params_)
print('Best cross-validation accuracy:', grid_search_dt.best_score_)

In [None]:
# Tune a Random Forest with a randomized search (5-fold CV, accuracy).
# Both the forest and the search draw random numbers, so both get a fixed seed;
# otherwise every run samples different candidates and reports different results.
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer a valid value in current scikit-learn (for classifiers it
    # was an alias of 'sqrt'), so it is dropped to avoid silently failing candidates.
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rand_search_rf = RandomizedSearchCV(
    rf, param_grid_rf, cv=5, scoring='accuracy', random_state=42
)
rand_search_rf.fit(X_train_scaled_df, y_train)

print('Best parameters Random Forest:', rand_search_rf.best_params_)
print('Best cross-validation accuracy:', rand_search_rf.best_score_)

In [None]:
# Tune an SVM with an exhaustive grid search (5-fold CV, accuracy).
svm = SVC()
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    # gamma is the kernel coefficient: it must be 'scale', 'auto' or a positive
    # float. The original grid listed kernel names here ('rbf', 'poly', 'sigmoid'),
    # which are invalid, so three quarters of the candidates failed to fit and
    # were scored NaN with the warning suppressed.
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
    # degree is only used by the 'poly' kernel; other kernels ignore it
    'degree': [2, 3, 4]
}

grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train_scaled_df, y_train)

print('Best parameters for SVM:', grid_search_svm.best_params_)
print('Best cross-validation accuracy:', grid_search_svm.best_score_)

# **Model Evaluation**

In [None]:
# Evaluate the tuned models on the held-out test set.

# Best estimator found by each search (already refit on the full training split)
best_log_reg = grid_search_lr.best_estimator_
best_dt = grid_search_dt.best_estimator_
best_rf = rand_search_rf.best_estimator_
best_svm = grid_search_svm.best_estimator_

# Predictions on the scaled test features
y_pred_lr = best_log_reg.predict(X_test_scaled_df)
y_pred_dt = best_dt.predict(X_test_scaled_df)
y_pred_rf = best_rf.predict(X_test_scaled_df)
y_pred_svm = best_svm.predict(X_test_scaled_df)

# Print the same report for every model from one loop, so the output format is
# identical across models (the copy-pasted blocks had drifted apart).
evaluation_runs = [
    ("Logistic Regression", y_pred_lr),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("SVM", y_pred_svm),
]

for model_name, y_pred in evaluation_runs:
    print(f"\n--- {model_name} Evaluation ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# **Conclusion**
So, as we can see, the models perform pretty well and reach full accuracy on the test set, except for SVM, which is slightly worse than the other models.
Thanks for reading my notebook 😼