In [142]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report
)

In [144]:
# Workbook location, given relative to the notebook's working directory.
file_path = "Dry_Bean_Dataset.xlsx"

# Read the workbook into a DataFrame using pandas' default sheet selection.
df = pd.read_excel(io=file_path)

# Sanity check: report the dimensions, then preview the leading rows.
print("Shape:", df.shape)
display(df.head(n=5))

Shape: (13611, 17)


Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [145]:
# Count missing entries column by column before reporting them.
missing_per_column = df.isna().sum()
print("Missing values per column:\n")
print(missing_per_column)

# Show how the labels are balanced; if the target column is absent, say so
# instead of failing on the lookup.
if "Class" not in df.columns:
    print("\nERROR: Column 'Class' not found. Check your column name above.")
else:
    print("\nClass distribution:\n")
    print(df["Class"].value_counts())

Missing values per column:

Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64

Class distribution:

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64


In [146]:
# Features are every column except the label; the label column is the target.
X = df.drop("Class", axis="columns")
y = df.loc[:, "Class"]

# Confirm both pieces have the expected dimensions.
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (13611, 16)
y shape: (13611,)


In [150]:
# Learn the class-name -> integer mapping first, then apply it to the target.
# The fitted encoder is kept so predictions can be mapped back to class names.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Show the learned classes alongside a small sample of the encoded target.
print("Classes found:", list(label_encoder.classes_))
print("Example encoded labels:", y_encoded[:10])

Classes found: ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']
Example encoded labels: [5 5 5 5 5 5 5 5 5 5]


In [152]:
# Hold out 20% of the rows for testing. Stratifying on the encoded label keeps
# the class proportions the same in both splits; the fixed seed makes the
# split repeatable.
split_parts = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 10888
Test size: 2723


In [154]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics are used for scaling.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled train shape:", X_train_scaled.shape)

Scaled train shape: (10888, 16)


In [156]:
# Single Gini decision tree with explicit depth, split-size and leaf-size settings.
decision_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)

# Bagged ensemble of 200 trees; each split considers sqrt(n_features) candidates
# and the out-of-bag score is computed during fitting.
random_forest = RandomForestClassifier(
    n_estimators=200,
    criterion="gini",
    max_depth=20,
    min_samples_split=5,
    max_features="sqrt",
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# 5-nearest-neighbours with distance-weighted votes on Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights="distance",
    metric="euclidean",
    n_jobs=-1,
)

# Display name -> unfitted estimator, in the order the models are evaluated.
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "KNN": knn,
}

In [170]:
# Fit every candidate on the scaled training split, print its per-class report,
# and record the summary metrics that the comparison table is built from.
# `classification_report` and the metric functions come from the import cell
# at the top of the notebook, so nothing is re-imported here.
results = {}

for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    # Store one row of summary metrics per model. Precision, recall and F1 are
    # support-weighted averages over the classes, so they account for the
    # uneven class sizes in a single number per model.
    results[name] = {
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(
            y_test, preds, average="weighted", zero_division=0
        ),
        "recall": recall_score(
            y_test, preds, average="weighted", zero_division=0
        ),
        "f1": f1_score(
            y_test, preds, average="weighted", zero_division=0
        ),
    }

    print(classification_report(y_test, preds, digits=2, zero_division=0))
    


===== Decision Tree =====
              precision    recall  f1-score   support

           0       0.91      0.91      0.91       265
           1       1.00      1.00      1.00       104
           2       0.95      0.92      0.94       326
           3       0.89      0.92      0.90       709
           4       0.95      0.95      0.95       386
           5       0.92      0.95      0.93       406
           6       0.86      0.82      0.84       527

    accuracy                           0.91      2723
   macro avg       0.92      0.92      0.92      2723
weighted avg       0.91      0.91      0.91      2723


===== Random Forest =====
              precision    recall  f1-score   support

           0       0.94      0.89      0.92       265
           1       1.00      1.00      1.00       104
           2       0.94      0.94      0.94       326
           3       0.91      0.92      0.91       709
           4       0.96      0.95      0.96       386
           5       0.94 

In [159]:
# Build a one-row-per-model comparison table from `results` and report the
# model with the highest accuracy.
metric_columns = ["accuracy", "precision", "recall", "f1"]

# Fail early with an actionable message: an empty or incomplete `results`
# would otherwise surface as an opaque KeyError from sort_values below.
if not results:
    raise ValueError(
        "`results` is empty: run the model evaluation cell so it records "
        "accuracy/precision/recall/f1 for each model before this cell."
    )

# `results` maps model name -> {metric: value}; transposing puts models on the
# rows and metrics on the columns.
comparison_df = pd.DataFrame(results).T

missing_metrics = [col for col in metric_columns if col not in comparison_df.columns]
if missing_metrics:
    raise ValueError(f"`results` is missing metric(s): {missing_metrics}")

comparison_df = comparison_df.sort_values(by="accuracy", ascending=False)

# Render every metric with four decimal places.
display(comparison_df.style.format({
    "accuracy": "{:.4f}",
    "precision": "{:.4f}",
    "recall": "{:.4f}",
    "f1": "{:.4f}"
}))

# After the descending sort the first row is the most accurate model.
best_model_name = comparison_df.index[0]
print("\nBest model:", best_model_name)
print("Best accuracy:", comparison_df.loc[best_model_name, "accuracy"])

Unnamed: 0,accuracy,precision,recall,f1
Random Forest,0.9221,0.9222,0.9221,0.9221
KNN,0.9203,0.9212,0.9203,0.9205
Decision Tree,0.9097,0.9095,0.9097,0.9093



Best model: Random Forest
Best accuracy: 0.9221446933529196
