In [None]:
import os

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the Dataset

In [None]:
# Download the Iris dataset through kagglehub. `path` holds the value returned
# by dataset_download() and is reused by the following cells to locate the files.
path = kagglehub.dataset_download("arshid/iris-flower-dataset")
print("Path to dataset files:", path)


In [None]:
# os.listdir() returns entries in arbitrary order; sort them so the printed
# listing is the same on every run and every machine.
print(sorted(os.listdir(path)))

# Read the Dataset

In [None]:
# Build the CSV location from the directory returned by kagglehub rather than a
# machine-specific absolute path, so the notebook runs on any machine/OS.
csv_path = os.path.join(path, "IRIS.csv")
df = pd.read_csv(csv_path)
df.head()


# Data Info and Clean-Up

In [None]:
# Basic info about the dataset
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# Summary statistics from DataFrame.describe(), printed under a heading
summary_stats = df.describe()
print("\nSummary Statistics:", summary_stats, sep="\n")

In [None]:
# Count the null entries in each column and print the totals
null_counts = df.isna().sum()
print("\nMissing Values:")
print(null_counts)

In [None]:
# Print the DataFrame's column index under a heading
print("\nColumn Names:", df.columns, sep="\n")

# EDA – Visualizations

### Countplot for Species Distribution

In [None]:
# Count of rows per value of the 'species' column
species_ax = sns.countplot(data=df, x='species')
species_ax.set_title('Distribution of Iris Species')
plt.show()


### Pairplot for Feature Relationships

In [None]:
# Pairwise plots of the columns, coloured by the 'species' column
pair_grid = sns.pairplot(data=df, hue='species')
plt.show()

### Correlation Heatmap

In [None]:
# Annotated heatmap of the correlations between the numeric columns
plt.figure(figsize=(8, 6))
corr_matrix = df.corr(numeric_only=True)
corr_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
corr_ax.set_title("Feature Correlation Heatmap")
plt.show()


# Data Preparation

In [None]:
# Separate the feature columns from the target column
target_column = 'species'
X = df.drop(columns=target_column)
y = df[target_column]

# Encode the target labels with a LabelEncoder; the fitted encoder is kept in
# `le` so its classes_ can be looked up by later cells
le = LabelEncoder()
y_encoded = le.fit(y).transform(y)

# Print the classes the encoder learned, for verification
print("Unique Classes:", le.classes_)


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Model Training

### Random Forest Classifier

In [None]:
# Fit a seeded 100-estimator random forest on the training split, then predict
# on the held-out test features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


### Logistic Regression

In [None]:
# Fit logistic regression (max_iter raised to 200) on the training split, then
# predict on the held-out test features
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


# Model Evaluation

### Random Forest Evaluation

In [None]:
# Report accuracy, per-class metrics and the confusion matrix for the random forest.
print("Random Forest Classifier Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
# Label the report rows with the encoder's class names instead of the integer
# codes, matching the tick labels used for the confusion-matrix heatmap.
# `labels` pins the rows to the encoder's codes so names and rows stay aligned.
print(classification_report(
    y_test,
    y_pred_rf,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


### Logistic Regression Evaluation

In [None]:
# Report accuracy, per-class metrics and the confusion matrix for logistic regression.
print("\nLogistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:")
# Label the report rows with the encoder's class names instead of the integer
# codes, matching the tick labels used for the confusion-matrix heatmap.
# `labels` pins the rows to the encoder's codes so names and rows stay aligned.
print(classification_report(
    y_test,
    y_pred_lr,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))


## Model Comparison

In [None]:
# Print both models' test-set accuracy one after the other
print("Model Comparison:")
accuracy_rows = (
    ("Random Forest Accuracy:", y_pred_rf),
    ("Logistic Regression Accuracy:", y_pred_lr),
)
for row_label, predictions in accuracy_rows:
    print(row_label, accuracy_score(y_test, predictions))


# Visualization

In [None]:
# Draw the random forest confusion matrix as an annotated heatmap, using the
# encoder's class names for the tick labels on both axes
cm = confusion_matrix(y_test, y_pred_rf)
class_names = le.classes_
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Random Forest Confusion Matrix')
plt.show()


In [None]:
# Pair each feature column with the forest's importance score, then order the
# table from most to least important
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)
print(feature_importances)

# Horizontal bar chart of the sorted importances
importance_ax = sns.barplot(data=feature_importances, x='importance', y='feature')
importance_ax.set_title("Feature Importances from Random Forest")
plt.show()
