In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn import metrics 
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC 

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 


from sklearn.decomposition import PCA
from sklearn import datasets

In [82]:
from sklearn import datasets 

# Load data with only two classes and two features 
iris = datasets.load_iris()  

In [83]:
X= iris.data
Y= iris.target

In [84]:
scaler = StandardScaler() 
X = scaler.fit_transform(X) 

SVM with original dataset

In [85]:
svc_rbf = SVC(kernel="rbf", random_state=0, gamma=1, C=1) 

# Train the classifier 
model_svc=svc_rbf.fit(X,Y) 
target_pred_rbf = model_svc.predict(X) 
target_pred_rbf

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [86]:
print("The accuracy is "+str(metrics.accuracy_score(Y,target_pred_rbf)*100)+"%") 
print(confusion_matrix(Y,target_pred_rbf))  
target_names = ['class 0', 'class 1', 'class 2'] 
print(classification_report(Y,target_pred_rbf, target_names=target_names)) 

The accuracy is 98.66666666666667%
[[50  0  0]
 [ 0 49  1]
 [ 0  1 49]]
              precision    recall  f1-score   support

     class 0       1.00      1.00      1.00        50
     class 1       0.98      0.98      0.98        50
     class 2       0.98      0.98      0.98        50

    accuracy                           0.99       150
   macro avg       0.99      0.99      0.99       150
weighted avg       0.99      0.99      0.99       150



Logisitic Regression with original dataset

In [87]:

X_train,X_test,Y_train,Y_test = train_test_split(X, Y,test_size=0.20,random_state=0) 

In [88]:

logistic_regression1 = LogisticRegression(solver="liblinear", 
                                         random_state=0) 
# try different values for max_iter and observe the difference in training time

# Train model 
model1 = logistic_regression1.fit(X_train, Y_train) 
#Predictions 
Y_pred1 = model1.predict(X_test) 

In [89]:

print("The accuracy is "+str(metrics.accuracy_score(Y_test,Y_pred1)*100)+"%") 
print(confusion_matrix(Y_test, Y_pred1))  
target_names = ['class 0', 'class 1', 'class 2'] 
print(classification_report(Y_test, Y_pred1, target_names=target_names))

The accuracy is 86.66666666666667%
[[11  0  0]
 [ 0 10  3]
 [ 0  1  5]]
              precision    recall  f1-score   support

     class 0       1.00      1.00      1.00        11
     class 1       0.91      0.77      0.83        13
     class 2       0.62      0.83      0.71         6

    accuracy                           0.87        30
   macro avg       0.84      0.87      0.85        30
weighted avg       0.89      0.87      0.87        30



SVM with new dataset

In [90]:

pca = PCA(n_components=2, whiten=True)  

In [91]:

# Conduct PCA 

X_pca = pca.fit_transform(X) 
X_test_pca = pca.fit_transform(X_test) 
X_train_pca = pca.fit_transform(X_train) 

In [92]:
# Show results 
print("Original number of features:", X.shape[1]) 
print("Reduced number of features:", X_pca.shape[1])

Original number of features: 4
Reduced number of features: 2


In [93]:
print("PCA components:\n{}".format(pca.components_)) 

PCA components:
[[ 0.53658514 -0.21981024  0.58074534  0.57139715]
 [ 0.35735544  0.93351429 -0.00506711  0.02867897]]


In [94]:
pca.explained_variance_ratio_ 

array([0.72930408, 0.23213204])

In [95]:
svc_rbf = SVC(kernel="rbf", random_state=0, gamma=1, C=1) 

# Train the classifier 
model_svc=svc_rbf.fit(X_pca,Y) 
target_pred_rbf_pca = model_svc.predict(X_test_pca) 
target_pred_rbf_pca

array([2, 1, 0, 2, 0, 2, 0, 2, 2, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0])

In [97]:
print("The accuracy is "+str(metrics.accuracy_score(Y_test,target_pred_rbf_pca)*100)+"%") 
print(confusion_matrix(Y_test,target_pred_rbf_pca))  
target_names = ['class 0', 'class 1', 'class 2'] 
print(classification_report(Y_test,target_pred_rbf_pca, target_names=target_names)) 

The accuracy is 86.66666666666667%
[[11  0  0]
 [ 0  9  4]
 [ 0  0  6]]
              precision    recall  f1-score   support

     class 0       1.00      1.00      1.00        11
     class 1       1.00      0.69      0.82        13
     class 2       0.60      1.00      0.75         6

    accuracy                           0.87        30
   macro avg       0.87      0.90      0.86        30
weighted avg       0.92      0.87      0.87        30



In [None]:

logistic_regression1 = LogisticRegression(solver="liblinear", 
                                         random_state=0) 
# try different values for max_iter and observe the difference in training time

# Train model 
model_lr_pca= logistic_regression1.fit(X_train_pca, Y_train) 
#Predictions 
Y_pred_lr_pca = model_lr_pca.predict(X_test_pca) 

In [None]:

print("The accuracy is "+str(metrics.accuracy_score(Y_test,Y_pred_lr_pca)*100)+"%") 
print(confusion_matrix(Y_test, Y_pred_lr_pca))  
target_names = ['class 0', 'class 1', 'class 2'] 
print(classification_report(Y_test, Y_pred_lr_pca, target_names=target_names))

The accuracy is 63.33333333333333%
[[10  1  0]
 [ 0  3 10]
 [ 0  0  6]]
              precision    recall  f1-score   support

     class 0       1.00      0.91      0.95        11
     class 1       0.75      0.23      0.35        13
     class 2       0.38      1.00      0.55         6

    accuracy                           0.63        30
   macro avg       0.71      0.71      0.62        30
weighted avg       0.77      0.63      0.61        30



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# ======================
# 1. Load Dataset
# ======================
# Example for iris dataset:
# data = datasets.load_iris()
# For wine or digits, replace with:
# data = datasets.load_wine()
# data = datasets.load_digits()

data = datasets.load_iris()  # Change this according to question

X = data.data
Y = data.target

# ======================
# 2. Standardize Features
# ======================
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ======================
# 3. Split Data
# ======================
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# ======================
# 4. Define Models
# ======================
models = {
    'SVM (RBF Kernel)': SVC(kernel='rbf', probability=True, random_state=0, gamma=1, C=1),
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0)
}

# ======================
# 5. Train & Evaluate Models on Original Data
# ======================
print("=== Performance on Original Data ===")
for name, model in models.items():
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    
    print(f"\nModel: {name}")
    print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
    print("Recall:", metrics.recall_score(Y_test, Y_pred, average='macro'))
    print("Precision:", metrics.precision_score(Y_test, Y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))
    print("Classification Report:\n", classification_report(Y_test, Y_pred))
    
    if hasattr(model, "predict_proba"):
        Y_prob = model.predict_proba(X_test)
        fpr, tpr, _ = roc_curve(Y_test, Y_prob[:, 1], pos_label=1)
        auc = roc_auc_score(Y_test, Y_prob, multi_class='ovr')
        print("ROC AUC Score:", auc)

# ======================
# 6. Apply PCA
# ======================

# Choose different n_components list based on question:
n_components_list = [2]  # For Iris (Q3)
# n_components_list = [0.9, 0.95, 0.99, 2, 5, 8, 10]  # For Wine (Q6)
# n_components_list = [0.9, 0.95, 0.99, 10, 30, 40, 50]  # For Digits (Q5)

for n in n_components_list:
    print(f"\n=== PCA with n_components={n} ===")
    pca = PCA(n_components=n, whiten=True)
    X_pca = pca.fit_transform(X)
    X_train_pca, X_test_pca, Y_train_pca, Y_test_pca = train_test_split(X_pca, Y, test_size=0.2, random_state=0)
    
    print("Variance explained by each component:", pca.explained_variance_ratio_)
    print("Total variance explained:", np.sum(pca.explained_variance_ratio_))

    for name, model in models.items():
        model.fit(X_train_pca, Y_train_pca)
        Y_pred_pca = model.predict(X_test_pca)
        
        print(f"\nModel: {name}")
        print("Accuracy:", metrics.accuracy_score(Y_test_pca, Y_pred_pca))
        print("Recall:", metrics.recall_score(Y_test_pca, Y_pred_pca, average='macro'))
        print("Precision:", metrics.precision_score(Y_test_pca, Y_pred_pca, average='macro'))
        print("Confusion Matrix:\n", confusion_matrix(Y_test_pca, Y_pred_pca))
        print("Classification Report:\n", classification_report(Y_test_pca, Y_pred_pca))
        
        if hasattr(model, "predict_proba"):
            Y_prob = model.predict_proba(X_test_pca)
            fpr, tpr, _ = roc_curve(Y_test_pca, Y_prob[:, 1], pos_label=1)
            auc = roc_auc_score(Y_test_pca, Y_prob, multi_class='ovr')
            print("ROC AUC Score:", auc)



=== Performance on Original Data ===

Model: SVM (RBF Kernel)
Accuracy: 1.0
Recall: 1.0
Precision: 1.0
Confusion Matrix:
 [[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         6

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

ROC AUC Score: 1.0

Model: Logistic Regression
Accuracy: 0.8666666666666667
Recall: 0.8675213675213675
Precision: 0.8446969696969697
Confusion Matrix:
 [[11  0  0]
 [ 0 10  3]
 [ 0  1  5]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.91      0.77      0.83        13
           2       0.62      0.83      0.71         6

    a