In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.io import arff
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Load and split the data

In [2]:
# Location of the Dry Bean ARFF file. Set the DRY_BEAN_ARFF environment variable
# to point at a local copy of the dataset; when it is unset, the path the
# notebook was originally written against is used.
arff_path = os.environ.get(
    "DRY_BEAN_ARFF",
    "/Users/maksimnoskov/Documents/tsi_term2/ml_pa_project/Dry_Bean_Dataset.arff",
)

# loadarff returns a (data, meta) tuple; only the data records (element 0)
# are turned into a DataFrame.
arff_file = arff.loadarff(arff_path)
df = pd.DataFrame(arff_file[0])

In [3]:
# Correct the misspelled "AspectRation" column name. Rebinding instead of
# inplace=True keeps the cell free of in-place frame mutation.
df = df.rename(columns={"AspectRation": "AspectRatio"})

# Decode the class labels from bytes to str. Only bytes values are decoded so
# that the cell can be re-run safely: applying .str.decode to labels that are
# already strings does not round-trip (non-bytes values come back as NaN),
# which would silently wipe out the target column.
df["Class"] = df["Class"].map(
    lambda value: value.decode("utf-8") if isinstance(value, bytes) else value
)

In [4]:
# Separate the target column from the predictor columns.
target_column = "Class"
labels = df[target_column]
features = df.drop(columns=[target_column])

In [5]:
# First split: hold out 20% of all samples as the test set, stratified by class.
(train_features, test_features, train_labels, test_labels) = train_test_split(
    features,
    labels,
    stratify=labels,
    test_size=0.20,
    random_state=42,
)

# Second split: carve 10% of the remaining training data out as a validation
# set (8% of the full dataset), again stratified by class. The train_* names
# are rebound to the reduced training portion.
(train_features, val_features, train_labels, val_labels) = train_test_split(
    train_features,
    train_labels,
    stratify=train_labels,
    test_size=0.10,
    random_state=42,
)

Standardize data and apply PCA

In [6]:
# Preprocessing pipeline: standardize every feature, then project onto the
# principal components that together explain 95% of the variance.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95)),
    ]
)

# Fit on the training split only, so that scaling statistics and component
# directions are not influenced by validation or test samples.
pipeline.fit(train_features)

# Report how much variance each retained component explains.
pca = pipeline.named_steps["pca"]
explained_variance_ratio = pca.explained_variance_ratio_
for index, variance in enumerate(explained_variance_ratio):
    print(f"Principal Component {index + 1}: {variance:.4f} ({variance * 100:.2f}%)")

# A bare `pca.n_components_` in the middle of a cell is evaluated and thrown
# away (only the last expression of a cell is displayed), so print it instead.
print(f"Number of principal components kept: {pca.n_components_}")

# Apply the fitted transformation to all three splits.
pca_train_features = pipeline.transform(train_features)
pca_val_features = pipeline.transform(val_features)
pca_test_features = pipeline.transform(test_features)

Principal Component 1: 0.5538 (55.38%)
Principal Component 2: 0.2647 (26.47%)
Principal Component 3: 0.0809 (8.09%)
Principal Component 4: 0.0510 (5.10%)


Train models on Principal Components and validate on the validation set

In [7]:
# Baseline classifiers trained on the PCA-transformed features:
# support vector machine, decision tree and k-nearest neighbours.
svm_clf = SVC()
dt_clf = DecisionTreeClassifier(random_state=42)
knn_clf = KNeighborsClassifier()

baseline_models = {"SVM": svm_clf, "DT": dt_clf, "KNN": knn_clf}

# Fit every model on the training split before any prediction is made.
for baseline_model in baseline_models.values():
    baseline_model.fit(pca_train_features, train_labels)

# Predict on the validation split, in the same SVM -> DT -> KNN order.
svm_predictions, dt_predictions, knn_predictions = (
    baseline_model.predict(pca_val_features)
    for baseline_model in baseline_models.values()
)
baseline_predictions = dict(
    zip(baseline_models, (svm_predictions, dt_predictions, knn_predictions))
)

# Headline validation accuracy for each model.
for model_name, predictions in baseline_predictions.items():
    print(f"{model_name} Accuracy:", accuracy_score(val_labels, predictions))

# Detailed per-class reports. Only the first header is preceded by a blank
# line, separating the reports from the accuracy summary above.
for report_index, (model_name, predictions) in enumerate(
    baseline_predictions.items()
):
    header = f"Classification Report for {model_name}:"
    print("\n" + header if report_index == 0 else header)
    print(classification_report(val_labels, predictions, digits=4))

SVM Accuracy: 0.8962350780532599
DT Accuracy: 0.8539944903581267
KNN Accuracy: 0.8925619834710744

Classification Report for SVM:
              precision    recall  f1-score   support

    BARBUNYA     0.8256    0.6698    0.7396       106
      BOMBAY     1.0000    1.0000    1.0000        42
        CALI     0.7883    0.8308    0.8090       130
    DERMASON     0.9135    0.9296    0.9215       284
       HOROZ     0.9437    0.9805    0.9618       154
       SEKER     0.9752    0.9691    0.9721       162
        SIRA     0.8551    0.8673    0.8612       211

    accuracy                         0.8962      1089
   macro avg     0.9002    0.8924    0.8950      1089
weighted avg     0.8955    0.8962    0.8949      1089

Classification Report for DT:
              precision    recall  f1-score   support

    BARBUNYA     0.6600    0.6226    0.6408       106
      BOMBAY     1.0000    0.9762    0.9880        42
        CALI     0.7500    0.7385    0.7442       130
    DERMASON     0.9022   

In [8]:
# Second batch of classifiers on the PCA-transformed features:
# random forest and multi-layer perceptron (both seeded for repeatability).
rf_clf = RandomForestClassifier(random_state=42)
mlp_clf = MLPClassifier(max_iter=500, random_state=42)

extra_models = {"RF": rf_clf, "MLP": mlp_clf}

# Fit both models on the training split before predicting.
for extra_model in extra_models.values():
    extra_model.fit(pca_train_features, train_labels)

# Predict on the validation split, in the same RF -> MLP order.
rf_predictions, mlp_predictions = (
    extra_model.predict(pca_val_features) for extra_model in extra_models.values()
)
extra_predictions = dict(zip(extra_models, (rf_predictions, mlp_predictions)))

# Headline validation accuracy for each model.
for model_name, predictions in extra_predictions.items():
    print(f"{model_name} Accuracy:", accuracy_score(val_labels, predictions))

# Detailed per-class reports. Only the first header is preceded by a blank
# line, separating the reports from the accuracy summary above.
for report_index, (model_name, predictions) in enumerate(extra_predictions.items()):
    header = f"Classification Report for {model_name}:"
    print("\n" + header if report_index == 0 else header)
    print(classification_report(val_labels, predictions, digits=4))

RF Accuracy: 0.8962350780532599
MLP Accuracy: 0.9008264462809917

Classification Report for RF:
              precision    recall  f1-score   support

    BARBUNYA     0.8105    0.7264    0.7662       106
      BOMBAY     1.0000    1.0000    1.0000        42
        CALI     0.8195    0.8385    0.8289       130
    DERMASON     0.9034    0.9225    0.9129       284
       HOROZ     0.9551    0.9675    0.9613       154
       SEKER     0.9634    0.9753    0.9693       162
        SIRA     0.8565    0.8483    0.8524       211

    accuracy                         0.8962      1089
   macro avg     0.9012    0.8969    0.8987      1089
weighted avg     0.8952    0.8962    0.8955      1089

Classification Report for MLP:
              precision    recall  f1-score   support

    BARBUNYA     0.8172    0.7170    0.7638       106
      BOMBAY     1.0000    1.0000    1.0000        42
        CALI     0.8182    0.8308    0.8244       130
    DERMASON     0.9223    0.9190    0.9206       284
     