In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load the Iris dataset bundled with scikit-learn
iris_data = load_iris()

# Assemble a single table: the feature columns followed by the class id as the last column
iris_df = pd.DataFrame(
    data=np.column_stack((iris_data['data'], iris_data['target'])),
    columns=[*iris_data['feature_names'], 'target'],
)

# Peek at the top of the table
print(iris_df.head())

# Count missing entries per column
print(iris_df.isna().sum())

# Count how many samples belong to each class
print(iris_df['target'].value_counts())

# Separate the label column from the predictors
y = iris_df['target']
X = iris_df.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then reuse that same transform on the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Learn the label -> integer code mapping from the training labels and apply it to both splits
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Show the first few standardized training rows under the original feature names
print(
    pd.DataFrame(X_train_scaled, columns=X.columns).head()
)


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64
0.0    50
1.0    50
2.0    50
Name: target, dtype: int64
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -1.473937          1.203658          -1.562535         -1.312603
1          -0.133071          2.992376          -1.276006         -1.045633
2           1.085898          0.085709  

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Define a function to train and evaluate SVM models
def train_and_evaluate_svm(kernel, C=1.0, gamma='scale'):
    """Fit an SVC with the given settings and return its test-set accuracy.

    The function reads the notebook-level arrays ``X_train_scaled`` and
    ``y_train_encoded`` for fitting, and ``X_test_scaled`` and
    ``y_test_encoded`` for scoring, so the preprocessing cell must have
    been run first.

    Parameters
    ----------
    kernel : forwarded to ``SVC`` as ``kernel``.
    C : forwarded to ``SVC`` as ``C`` (default 1.0).
    gamma : forwarded to ``SVC`` as ``gamma`` (default 'scale').

    Returns
    -------
    The result of ``accuracy_score`` comparing ``y_test_encoded`` with the
    model's predictions for ``X_test_scaled``.
    """
    # fit() returns the estimator itself, so construction and training can be chained
    model = SVC(kernel=kernel, C=C, gamma=gamma).fit(X_train_scaled, y_train_encoded)

    return accuracy_score(y_test_encoded, model.predict(X_test_scaled))

# Kernels to compare; later cells iterate over this same list
kernels = ['linear', 'rbf', 'poly']

# Train one model per kernel (default C and gamma) and report its test accuracy.
# map() is lazy, so each model is still trained right before its line is printed.
for kernel, accuracy in zip(kernels, map(train_and_evaluate_svm, kernels)):
    print(f'Kernel: {kernel}, Accuracy: {accuracy}')


Kernel: linear, Accuracy: 0.9666666666666667
Kernel: rbf, Accuracy: 1.0
Kernel: poly, Accuracy: 0.9666666666666667


In [3]:
from sklearn.model_selection import cross_val_score, KFold

from sklearn.pipeline import make_pipeline

# Define the number of folds for cross-validation
k_folds = 5

# Initialize KFold object with shuffling; the fixed seed keeps fold assignment repeatable
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# One array of per-fold accuracies per kernel, in the same order as `kernels`
cv_scores = []

# Perform K-fold cross-validation.
# The scaler is placed inside the pipeline and the *unscaled* X_train is passed in,
# so the scaler is re-fitted on each fold's training portion only. Cross-validating
# on X_train_scaled would let every validation fold influence the scaling statistics
# (preprocessing leakage), which biases the estimate.
for kernel in kernels:
    svm_classifier = make_pipeline(StandardScaler(), SVC(kernel=kernel))
    scores = cross_val_score(svm_classifier, X_train, y_train_encoded, cv=kf, scoring='accuracy')
    cv_scores.append(scores)

# Display the mean cross-validation accuracy for each kernel
for i, kernel in enumerate(kernels):
    print(f'Kernel: {kernel}, Cross-Validation Accuracy: {np.mean(cv_scores[i])}')


Kernel: linear, Cross-Validation Accuracy: 0.95
Kernel: rbf, Cross-Validation Accuracy: 0.95
Kernel: poly, Cross-Validation Accuracy: 0.9416666666666668


In [4]:
from sklearn.metrics import classification_report

# Define a function to train SVM model and evaluate performance
def train_and_evaluate_svm_with_metrics(kernel, C=1.0, gamma='scale'):
    """Fit an SVC with the given settings and return a classification report.

    The function reads the notebook-level arrays ``X_train_scaled`` and
    ``y_train_encoded`` for fitting, ``X_test_scaled`` and ``y_test_encoded``
    for evaluation, and ``iris_data.target_names`` for the row labels of the
    report, so the preprocessing cell must have been run first.

    Parameters
    ----------
    kernel : forwarded to ``SVC`` as ``kernel``.
    C : forwarded to ``SVC`` as ``C`` (default 1.0).
    gamma : forwarded to ``SVC`` as ``gamma`` (default 'scale').

    Returns
    -------
    The value returned by ``classification_report`` for the test-set
    predictions (printed by the caller as a text table).
    """
    model = SVC(kernel=kernel, C=C, gamma=gamma)
    model.fit(X_train_scaled, y_train_encoded)

    predictions = model.predict(X_test_scaled)

    # Label the report rows with the species names instead of the integer codes
    return classification_report(
        y_test_encoded,
        predictions,
        target_names=iris_data.target_names,
    )

# Evaluate SVM models with different kernels
for kernel in kernels:
    print(f"Kernel: {kernel}")
    report = train_and_evaluate_svm_with_metrics(kernel)
    print(report)
    print("\n")


Kernel: linear
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.89      0.94         9
   virginica       0.92      1.00      0.96        11

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



Kernel: rbf
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Kernel: poly
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      1.00      0.95         9
   virginica       1.00      0.

Approach Documentation: Iris Dataset Classification with Support Vector Machines (SVM)

1. Data Preprocessing Steps:

Loading the Dataset: The dataset used for this task is the Iris dataset, a classic dataset in machine learning. It consists of 150 samples of iris flowers, with each sample containing four features: sepal length, sepal width, petal length, and petal width. The target variable is the species of iris, which can be one of three classes: setosa, versicolor, or virginica.
Handling Missing Values: The Iris dataset is a clean dataset with no missing values, so no explicit handling of missing values is required.
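Scaling Features: The features are standardized (zero mean, unit variance) using the StandardScaler from Scikit-learn so that features measured on different scales contribute comparably to the model. The scaler is fitted on the training set only and then applied to the test set, so no test-set statistics influence the preprocessing.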
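Encoding Categorical Variables: The target returned by load_iris is already numeric (0 = setosa, 1 = versicolor, 2 = virginica), but it is stored as floats (0.0, 1.0, 2.0) once combined with the features in the DataFrame. LabelEncoder from Scikit-learn is fitted on the training labels to map these values to integer class indices, and the same mapping is then applied to the test labels.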
2. SVM Model Configuration:

Kernel Selection: We experiment with three common SVM kernels: linear, radial basis function (RBF), and polynomial.
Model Training: The SVM models are trained using the SVC class from Scikit-learn. We specify the chosen kernel for each model.
Model Evaluation: After training, each model is evaluated on the held-out test set (20% of the data) using overall accuracy together with per-class precision, recall, and F1-score.
3. Cross-Validation Procedure:

K-Fold Cross-Validation: We use K-fold cross-validation, a robust technique for model evaluation, to assess the performance of the SVM models.
Number of Folds: We choose a suitable value for K, typically 5 or 10, to balance computational cost and reliability. In this case, we choose 5 folds.
Shuffling: Before partitioning the training set into folds, we shuffle it (with a fixed random_state of 42 so the folds are reproducible) to prevent any bias in the selection of folds.
Cross-Validation Scores: We calculate the cross-validation accuracy for each SVM kernel configuration, providing a more reliable estimate of the model's performance.




Conclusion:

By following this approach, we preprocess the Iris dataset, configure SVM models with different kernels, and evaluate their performance using appropriate evaluation metrics and cross-validation techniques.
This documentation provides clarity on the steps involved in the classification task, ensuring transparency and reproducibility of the results.