# Iris Model Training
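**Goal:** Train several classifiers on the Iris dataset, compare their accuracy on held-out validation and test sets, and save the model with the best validation accuracy.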

## 1. Imports and Setup

In [1]:
# Standard library
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model training and evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Saving model
import joblib


## 2. Load and Split Data

In [2]:
# Load dataset
df = pd.read_csv('../data/Iris.csv')

# Features and target
features = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
target = 'Species'

X = df[features]
y = df[target]

# Train-validation-test split (60% / 20% / 20%), stratified on the target at both steps.
# The 40% remainder of the first split gets its own names so that X and y keep
# referring to the full dataset after this cell has run.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout)

# Sanity check: every row must land in exactly one of the three splits.
assert len(X_train) + len(X_val) + len(X_test) == len(df), "split sizes do not add up"

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

Training set: (90, 4), Validation set: (30, 4), Test set: (30, 4)


## 3. Model Training & Evaluation

In [3]:
# Candidate classifiers, keyed by the display name used in the results table.
# Each estimator is fitted in place, so this dict holds the trained models afterwards.
models = {
    'Logistic Regression': LogisticRegression(max_iter = 200),
    'Decision Tree': DecisionTreeClassifier(random_state = 42),
    'Random Forest': RandomForestClassifier(random_state = 42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Fit every candidate on the training split and score it on the validation and test splits.
# The rows are collected in a plain list and turned into a DataFrame once,
# instead of enlarging the frame one row at a time inside the loop.
records = []
for name, model in models.items():
    model.fit(X_train, y_train)
    records.append({
        'Model': name,
        'Validation Accuracy': accuracy_score(y_val, model.predict(X_val)),
        'Test Accuracy': accuracy_score(y_test, model.predict(X_test)),
    })

# Passing `columns` pins the column order and keeps the frame well-formed even with no rows.
results = pd.DataFrame(records, columns=['Model', 'Validation Accuracy', 'Test Accuracy'])

print(results)

                 Model  Validation Accuracy  Test Accuracy
0  Logistic Regression             0.933333       0.966667
1        Decision Tree             0.966667       0.933333
2        Random Forest             0.933333       0.900000
3                  SVM             0.933333       1.000000
4                  KNN             0.966667       0.900000


## 4. Save Model

In [4]:
# Choose the best model based on validation accuracy.
# idxmax() returns the first row that holds the maximum, so ties are broken by
# the order in which the models appear in the results table.
best_model_name = results.loc[results['Validation Accuracy'].idxmax(), 'Model']
best_model = models[best_model_name]

# Build the file name once and reuse it for both the dump and the message.
# Computing it outside the f-string avoids nesting the same quote character
# inside an f-string, which is a SyntaxError on Python versions before 3.12.
model_filename = best_model_name.replace(' ', '_').lower() + '_iris_model.pkl'
path = f'../models/{model_filename}'

# Make sure the output directory exists before writing the file.
Path(path).parent.mkdir(parents=True, exist_ok=True)

# Save the best model
joblib.dump(best_model, path)
print(f"Model saved as {model_filename}")

Model saved as decision_tree_iris_model.pkl
