In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the sonar dataset. The CSV has no header row, so pandas assigns
# integer column labels.
DATA_FILE = "sonar data.csv"
df = pd.read_csv(DATA_FILE, header=None)

# Preview the first few rows.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [3]:
# Sanity-check the dimensions of the loaded frame as (rows, columns).
df.shape


(208, 61)

In [4]:
# Class balance: count how many rows carry each label in the last column.
label_column = df.columns[-1]
df[label_column].value_counts()


Unnamed: 0_level_0,count
60,Unnamed: 1_level_1
M,111
R,97


In [5]:
# The last column is the target; every column before it is a feature.
label_pos = df.shape[1] - 1
X = df.iloc[:, :label_pos]
y = df.iloc[:, label_pos]

In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [7]:
# Logistic Regression baseline: standardize features, then fit a
# class-weighted linear classifier.
lr_steps = [
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced")),
]
pipe_lr = Pipeline(steps=lr_steps)

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred_lr = pipe_lr.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and the per-class metrics.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           M       0.81      0.95      0.88        22
           R       0.94      0.75      0.83        20

    accuracy                           0.86        42
   macro avg       0.87      0.85      0.85        42
weighted avg       0.87      0.86      0.86        42



In [8]:
# KNN is distance-based, so features are standardized inside the pipeline;
# the scaler is therefore re-fit on each cross-validation training fold.
knn_steps = [("scaler", StandardScaler()), ("knn", KNeighborsClassifier())]
pipe_knn = Pipeline(steps=knn_steps)

# Hyper-parameters to search; the "knn__" prefix routes them to the "knn" step.
param_grid_knn = dict(
    knn__n_neighbors=[3, 5, 7, 9, 11],
    knn__weights=["uniform", "distance"],
)

# Exhaustive 5-fold cross-validated search over the grid, on training data only.
grid_knn = GridSearchCV(estimator=pipe_knn, param_grid=param_grid_knn, cv=5)
grid_knn.fit(X_train, y_train)

best_cv_score = grid_knn.best_score_
print("Best CV Score:", best_cv_score)

Best CV Score: 0.8377896613190732


In [9]:
# Evaluate the tuned KNN on the held-out test set. predict() delegates to the
# best estimator found by the search (GridSearchCV refits it by default).
y_pred_knn = grid_knn.predict(X_test)
knn_test_accuracy = accuracy_score(y_test, y_pred_knn)

print("Test Accuracy:", knn_test_accuracy)
print(classification_report(y_test, y_pred_knn))

Test Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           M       0.71      0.77      0.74        22
           R       0.72      0.65      0.68        20

    accuracy                           0.71        42
   macro avg       0.72      0.71      0.71        42
weighted avg       0.71      0.71      0.71        42



In [10]:
# RBF-kernel SVM on standardized features, with class weighting.
svm_classifier = SVC(kernel="rbf", class_weight="balanced")
pipe_svm = Pipeline(steps=[("scaler", StandardScaler()), ("svm", svm_classifier)])

# Train on the training split, then predict the held-out rows.
y_pred_svm = pipe_svm.fit(X_train, y_train).predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", svm_accuracy)
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           M       0.81      0.95      0.88        22
           R       0.94      0.75      0.83        20

    accuracy                           0.86        42
   macro avg       0.87      0.85      0.85        42
weighted avg       0.87      0.86      0.86        42



In [11]:
# Gaussian Naive Bayes fitted directly on the unscaled features.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))

Accuracy: 0.6428571428571429
              precision    recall  f1-score   support

           M       0.71      0.55      0.62        22
           R       0.60      0.75      0.67        20

    accuracy                           0.64        42
   macro avg       0.65      0.65      0.64        42
weighted avg       0.66      0.64      0.64        42



In [12]:
# Single decision tree; depth and split-size limits constrain its complexity,
# and the fixed seed makes the fit repeatable.
dt_settings = {"max_depth": 5, "min_samples_split": 10, "random_state": 42}
dt = DecisionTreeClassifier(**dt_settings).fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", dt_accuracy)
print(classification_report(y_test, y_pred_dt))

Accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           M       0.69      0.82      0.75        22
           R       0.75      0.60      0.67        20

    accuracy                           0.71        42
   macro avg       0.72      0.71      0.71        42
weighted avg       0.72      0.71      0.71        42



In [13]:
# Random forest: 200 depth-limited trees with class weighting and a fixed seed.
rf_settings = {
    "n_estimators": 200,
    "max_depth": 8,
    "class_weight": "balanced",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           M       0.79      1.00      0.88        22
           R       1.00      0.70      0.82        20

    accuracy                           0.86        42
   macro avg       0.89      0.85      0.85        42
weighted avg       0.89      0.86      0.85        42



In [14]:
# Test-set predictions keyed by model name. Insertion order sets the row order
# of the summary table.
predictions_by_model = {
    "Logistic Regression": y_pred_lr,
    "KNN": y_pred_knn,
    "SVM": y_pred_svm,
    "Naive Bayes": y_pred_nb,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
}

# One row per model with its accuracy on the held-out test split.
results = pd.DataFrame({
    "Model": list(predictions_by_model),
    "Accuracy": [
        accuracy_score(y_test, y_hat) for y_hat in predictions_by_model.values()
    ],
})

results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.857143
1,KNN,0.714286
2,SVM,0.857143
3,Naive Bayes,0.642857
4,Decision Tree,0.714286
5,Random Forest,0.857143


In [15]:
# Hand-written summary of the test-set comparison in the `results` table.
# The previous wording contradicted the measured numbers: it called SVM the
# single best model, described KNN as highly accurate, and said tree-based
# models overfit even though Random Forest tied for the top score.
# Re-check this text against `results` whenever the notebook is re-run with
# different data, seeds, or hyper-parameters.
conclusion = """
Conclusion:
Logistic Regression, SVM and Random Forest tied for the highest test accuracy
(0.857), so no single model is a clear winner on this split.
KNN and the Decision Tree were noticeably weaker (0.714 each); KNN's test
accuracy also fell well below its best cross-validation score (0.838).
Naive Bayes had the lowest accuracy (0.643).
The test set holds only 42 samples, so these differences should be treated as
indicative rather than conclusive.
"""

print(conclusion)


Conclusion:
SVM achieved the highest and most stable accuracy due to its effectiveness on
small, high-dimensional datasets. Logistic Regression provided a strong baseline.
KNN showed high accuracy but was sensitive to parameter choice.
Tree-based models tended to overfit, while Naive Bayes underperformed due to
violated independence assumptions.

