In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, fname) for fname in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd

# Read the Iris CSV from the course repository into a DataFrame
iris_df = pd.read_csv('https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv')

# Preview the first five rows
print(iris_df.head())

# Separate the target labels (y) from the features (X):
# y is the 'species' column, X is every remaining column
# (sepal length, sepal width, petal length, petal width)
y = iris_df['species']
X = iris_df.drop(columns='species')

# Report the dimensions of the features and of the target labels
for label, part in (("Features shape:", X), ("Target labels shape:", y)):
    print(label, part.shape)

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
Features shape: (150, 4)
Target labels shape: (150,)


In [4]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and let it learn its per-feature statistics from X
scaler = StandardScaler().fit(X)

# Apply the learned scaling to the same features
X_scaled = scaler.transform(X)

# Preview the first 5 rows of the scaled feature matrix
print("Scaled features:", X_scaled[:5], sep="\n")

Scaled features:
[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673]]


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the *unscaled* features into training and test sets first, so the
# test rows are not involved in fitting any preprocessing step below.
# The row assignment depends only on the sample count and random_state,
# so it is the same split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-fit the scaler on the training rows only. Fitting it on the full dataset
# lets the test rows' mean/variance leak into preprocessing, which makes the
# held-out scores optimistic. `scaler` is rebound on purpose so that later
# calls to scaler.transform(...) use the statistics the model was trained with.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Fit the model to the (scaled) training data
log_reg.fit(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report

# Predict the species of every held-out test sample
y_pred = log_reg.predict(X_test)

# Build the per-class precision / recall / F1 summary, then print it under its heading
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for the logistic regression parameter C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Grid search over those candidates with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
)

# Run the search on the training data only
grid_search.fit(X_train, y_train)

# Report the winning parameter setting
best_param = grid_search.best_params_
print("Best Hyperparameters:", best_param)

# Keep the estimator that the search selected
best_model = grid_search.best_estimator_

# Score the selected estimator on the held-out test data
y_pred_best = best_model.predict(X_test)
print(
    "\nClassification Report (Best Model):",
    classification_report(y_test, y_pred_best),
    sep="\n",
)

Best Hyperparameters: {'C': 1}

Classification Report (Best Model):
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [12]:
# Example measurements to classify, given in the same column order as the
# training features (sepal length, sepal width, petal length, petal width)
new_data = [[5.1, 3.5, 1.4, 0.2],  # Sample 1
            [6.3, 2.9, 5.6, 1.8],  # Sample 2
            [7.2, 3.6, 6.1, 2.5]]  # Sample 3

# The scaler was fitted on a DataFrame, so it remembers the feature names.
# Wrapping the samples in a DataFrame with the same columns avoids
# scikit-learn's "X does not have valid feature names" warning and lets it
# verify that the columns line up with the ones seen during fitting.
new_data_df = pd.DataFrame(new_data, columns=X.columns)

# Scale the new data using the same scaler
new_data_scaled = scaler.transform(new_data_df)

# Make predictions on the new data
predictions = best_model.predict(new_data_scaled)

# Display the predictions
for i, pred in enumerate(predictions):
    print(f"Sample {i+1} prediction:", pred)

Sample 1 prediction: setosa
Sample 2 prediction: virginica
Sample 3 prediction: virginica




In this project, we successfully built and evaluated a machine learning model to classify iris flowers into their respective species based on their sepal and petal dimensions. Here's a summary of what we accomplished:

**Data Preparation**: We loaded the Iris dataset, split it into features and target labels, and scaled the features using standardization.

**Model Selection**: We chose logistic regression as our machine learning algorithm for classification.

**Model Training and Evaluation**: We trained the logistic regression model on 80% of the data and evaluated it on the held-out 20% (30 samples), reporting accuracy, precision, recall, and F1-score.

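**Model Tuning (Optional)**: We tuned the regularization parameter `C` of the logistic regression model with a 5-fold cross-validated grid search over values from 0.001 to 100. The search selected `C = 1`, and the tuned model achieved the same test-set scores as the initial model.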

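**Prediction**: We used the tuned logistic regression model to predict the species of three example samples, scaling them with the same fitted scaler first. Note that the first sample has the same measurements as a row of the Iris dataset, so it is not strictly unseen data.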

Overall, the logistic regression model performed well in classifying iris flowers into their respective species: it classified all 30 test samples correctly, giving accuracy, precision, recall, and F1-score of 1.00 for every species. However, the test set is small, so these scores should be read with caution; there may be room for further improvement through additional feature engineering, exploring different algorithms, or collecting more data.