Environment Setup

In [1]:
# Import required libraries
import os

import pandas as pd # type: ignore
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models to try
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Set style for plots: apply seaborn's 'whitegrid' style, then run the IPython
# magic that selects the inline matplotlib backend.
# NOTE(review): the order of these two lines is kept as-is; confirm before
# swapping, since both may touch matplotlib settings.
sns.set_style('whitegrid')
%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

Data Loading and Exploratory Analysis

In [2]:
# Load the dataset.
# The CSV location can be overridden through the IRIS_CSV environment variable
# so the notebook is not tied to a single machine; when the variable is unset,
# the original local path is used as the fallback.
iris_csv_path = os.environ.get('IRIS_CSV', r'C:\Users\kanzi\Downloads\Iris (1).csv')
df = pd.read_csv(iris_csv_path)

# Display first 5 rows
df.head()


NameError: name 'pd' is not defined

In [3]:
# Basic information about the dataset: calls DataFrame.info() on the frame
# loaded from the CSV to print its structural summary.
df.info()

NameError: name 'df' is not defined

In [None]:
# Statistical summary: the cell's last expression is df.describe(), so its
# result is rendered as the cell output.
df.describe()

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# How many rows belong to each species
species_counts = df['Species'].value_counts()
species_counts

Data Visualization

In [None]:
# Pairwise feature relationships, coloured by species
sns.pairplot(data=df, hue='Species')
plt.show()

In [None]:
# One boxplot per measurement, split by species, laid out on a 2x2 grid
plt.figure(figsize=(15, 10))
measurement_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
for panel, column in enumerate(measurement_columns, start=1):
    # Panels are numbered 1-4, filling the grid row by row
    plt.subplot(2, 2, panel)
    sns.boxplot(x='Species', y=column, data=df)
plt.show()

Data Preprocessing

In [5]:
# Build the modelling inputs WITHOUT mutating `df`, so this cell can be re-run
# safely. Dropping 'Id' from `df` in place raises KeyError on a second run, and
# re-encoding an already-encoded 'Species' column refits the encoder on
# integers, losing the species names that inverse_transform needs later.

# Encode the species labels; the fitted encoder keeps the original names
encoder = LabelEncoder()
y = pd.Series(encoder.fit_transform(df['Species']), index=df.index, name='Species')

# Separate features from the target. The Id column is not needed as a feature;
# errors='ignore' tolerates a frame that has no 'Id' column.
X = df.drop(columns=['Id', 'Species'], errors='ignore')

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

NameError: name 'df' is not defined

Model Training and Evaluation

In [None]:
# Initialize models.
# The tree-based models are randomized, so they get the same fixed seed used
# for the train/test split; without it the reported scores can change from
# one run to the next.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

# Train each model on the training split and evaluate it on the test split.
# `results` maps model name -> test accuracy and is used by the comparison cell.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n" + "="*50 + "\n")

Model Comparison and Selection

In [None]:
# Compare model performances: one row per model, best accuracy first
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
results_df = results_df.sort_values('Accuracy', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Accuracy', y='Model', hue='Model', data=results_df, palette='viridis', legend=False)
plt.title('Model Accuracy Comparison')
# Keep the 0.9-1.0 zoom when every model scores at least 0.9, but widen the
# axis when a model scores lower so its bar is not pushed entirely out of view.
lowest_accuracy = results_df['Accuracy'].min()
plt.xlim(min(0.9, max(0.0, lowest_accuracy - 0.05)), 1.0)
plt.show()

Feature Importance (for tree-based models)

In [None]:
# Feature importance for Random Forest (seeded so the ranking is reproducible)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
features = X.columns

# One row per feature, most important first, so the bars are ordered in the plot
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values('Importance', ascending=False)

# Plot the importances themselves; this cell previously re-plotted the model
# accuracies from results_df under the 'Feature Importance' title.
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', hue='Feature', data=importance_df, palette='viridis', legend=False)
plt.title('Feature Importance')
plt.show()

Final Model Selection and Testing

In [None]:
# Refit the chosen model (Random Forest in this case) with a fixed seed
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)

# Score the chosen model on the held-out test split
final_pred = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_pred)
final_report = classification_report(y_test, final_pred)
final_confusion = confusion_matrix(y_test, final_pred)

# Report the metrics
print(f"Final Model Accuracy: {final_accuracy:.4f}")
print("\nClassification Report:")
print(final_report)
print("\nConfusion Matrix:")
print(final_confusion)

Saving the Model

In [None]:
import joblib

# File names for the persisted artifacts
model_artifact_path = 'iris_classifier.pkl'
encoder_artifact_path = 'label_encoder.pkl'

# Write the fitted classifier, then the fitted label encoder, to disk
joblib.dump(best_model, model_artifact_path)
joblib.dump(encoder, encoder_artifact_path)

Creating a Simple Prediction Function

In [None]:
def predict_iris(sepal_length, sepal_width, petal_length, petal_width,
                 model_path='iris_classifier.pkl', encoder_path='label_encoder.pkl'):
    """Predict the species label for a single flower.

    Args:
        sepal_length: Value placed in the 'SepalLengthCm' column.
        sepal_width: Value placed in the 'SepalWidthCm' column.
        petal_length: Value placed in the 'PetalLengthCm' column.
        petal_width: Value placed in the 'PetalWidthCm' column.
        model_path: File containing the classifier saved with joblib.dump.
            Defaults to the file name used by the saving cell.
        encoder_path: File containing the label encoder saved with
            joblib.dump. Defaults to the file name used by the saving cell.

    Returns:
        The decoded label for the single input row, i.e. the first element of
        encoder.inverse_transform(model.predict(...)).
    """
    # Load the model and encoder.
    # NOTE: joblib.load unpickles the files, which can execute arbitrary code;
    # only point these paths at artifacts you created or trust.
    model = joblib.load(model_path)
    encoder = joblib.load(encoder_path)

    # Create a one-row dataframe using the same column names as the training data
    input_data = pd.DataFrame([[sepal_length, sepal_width, petal_length, petal_width]],
                            columns=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])

    # Make prediction (an array holding one encoded label)
    prediction = model.predict(input_data)

    # Decode the prediction back to the original label
    species = encoder.inverse_transform(prediction)

    return species[0]

# Example usage: each tuple is (sepal_length, sepal_width, petal_length, petal_width)
example_measurements = (
    (5.1, 3.5, 1.4, 0.2),  # Should predict Iris-setosa
    (6.3, 3.3, 6.0, 2.5),  # Should predict Iris-virginica
)
for measurements in example_measurements:
    print(predict_iris(*measurements))