In [2]:
# Read the data import libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
path = r'G:\3. Machine Learning-24\Iris Classification-24\data\cleaned_data\iris_original.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,soil_type
0,5.16,3.41,1.64,0.26,setosa,sandy
1,5.48,4.05,1.53,0.37,setosa,clay
2,5.1,2.8,1.47,0.38,setosa,sandy
3,4.64,3.44,1.53,0.17,setosa,clay
4,4.85,2.87,1.23,0.26,setosa,loamy


In [4]:
X = df.drop('species', axis =1)
y= df['species']

In [5]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state= 42)

cat_features = ['soil_type']
num_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

preprocessor = ColumnTransformer(transformers=[
      ('cat', OrdinalEncoder(),cat_features),
      ('num', StandardScaler(),num_features)
])

# Model Building

In [7]:
from sklearn.linear_model import LogisticRegression

lr_pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('classifier', LogisticRegression())
      ])
lr_pipeline.fit(X_train, y_train)

# Evaluation

In [16]:
y_pred_train_lr = lr_pipeline.predict(X_train)
y_pred_test_lr = lr_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy_train = accuracy_score(y_train, y_pred_train_lr)
accuracy_test = accuracy_score(y_test, y_pred_test_lr)

print(f"Accuracy_train_LR:{accuracy_train} \n Accuracy_test_LR:{accuracy_test}")

cm =confusion_matrix(y_test, y_pred_test_lr)
print(f"Confusion Matrix:\n{cm}")

classification_report_lr = classification_report(y_test, y_pred_test_lr)
print(f"Clasification Report:\n{classification_report_lr}")

from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(lr_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Accuracy_train_LR:0.9583333333333334 
 Accuracy_test_LR:0.9888888888888889
Confusion Matrix:
[[128   0   0]
 [  0 113   3]
 [  0   1 115]]
Clasification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00       128
  versicolor       0.99      0.97      0.98       116
   virginica       0.97      0.99      0.98       116

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360

Cross-validated Accuracy: 0.97 (+/- 0.01)


# Model Evaluation

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Define a list of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("SVM", SVC()),
    ("k-NN", KNeighborsClassifier())
]

# Iterate over each model, construct a pipeline, and evaluate accuracy
best_model = None
best_accuracy = -1

for model_name, model in models:
    # Construct a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"Model: {model_name}")
    print(f"Accuracy Score: {accuracy}")
    print()
    
    # Check if this model has the highest accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model_name

print(f"Best model based on accuracy: {best_model}")

Model: Logistic Regression
Accuracy Score: 0.9888888888888889

Model: Decision Tree
Accuracy Score: 0.9444444444444444

Model: Random Forest
Accuracy Score: 0.9777777777777777

Model: SVM
Accuracy Score: 0.9861111111111112

Model: k-NN
Accuracy Score: 0.9777777777777777

Best model based on accuracy: Logistic Regression


# SVM

In [10]:
from sklearn.svm import SVC

svc_pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('classifier',  SVC())
      ])
svc_pipeline.fit(X_train, y_train)

In [11]:
y_pred_train_svc = svc_pipeline.predict(X_train)
y_pred_test_svc = svc_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy_train = accuracy_score(y_train, y_pred_train_svc)
accuracy_test = accuracy_score(y_test, y_pred_test_svc)

print(f"Accuracy_train_SVC:{accuracy_train} \n Accuracy_test_SVC:{accuracy_test}")

cm =confusion_matrix(y_test, y_pred_test_svc)
print(f"Confusion Matrix:\n{cm}")

classification_report_svc = classification_report(y_test, y_pred_test_svc)
print(f"Clasification Report:\n{classification_report_svc}")

from sklearn.model_selection import cross_val_score, StratifiedKFold


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Compute cross-validated scores
cross_val_scores = cross_val_score(svc_pipeline, X, y, cv=cv, scoring='accuracy')

# Print the mean and standard deviation of cross-validated scores
print("Cross-validated Accuracy: {:.2f} (+/- {:.2f})".format(cross_val_scores.mean(), cross_val_scores.std()))

Accuracy_train_SVC:0.9619047619047619 
 Accuracy_test_SVC:0.9861111111111112
Confusion Matrix:
[[128   0   0]
 [  0 113   3]
 [  0   2 114]]
Clasification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00       128
  versicolor       0.98      0.97      0.98       116
   virginica       0.97      0.98      0.98       116

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360

Cross-validated Accuracy: 0.96 (+/- 0.02)


Finally Logistic Regression model is the best fit for this problem.

# Predicting system

In [12]:
import pandas as pd

# Define input data
input_data = [[8, 7, 6, 4, 'clay']]  # Example input data point

# Convert input data to DataFrame
input_df = pd.DataFrame(input_data, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'soil_type'])

# Make predictions
prediction = lr_pipeline.predict(input_df)
print("Iris flower is:", prediction)


Iris flower is: ['virginica']
