In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the dataset (the CSV has no header row, hence header=None).
# The file location can be overridden with the SONAR_DATA_PATH environment
# variable; the default is the original local path, so existing setups keep working.
sonar_data_path = os.environ.get('SONAR_DATA_PATH', 'C:/Users/ADMIN/Downloads/sonar_data.csv')
sonar_data = pd.read_csv(sonar_data_path, header=None)

In [3]:
# Preview the top five records of the frame
print(sonar_data.head(n=5))


       0       1       2       3       4       5       6       7       8   \
0  0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1  0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2  0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3  0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4  0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   

       9   ...      51      52      53      54      55      56      57  \
0  0.2111  ...  0.0027  0.0065  0.0159  0.0072  0.0167  0.0180  0.0084   
1  0.2872  ...  0.0084  0.0089  0.0048  0.0094  0.0191  0.0140  0.0049   
2  0.6194  ...  0.0232  0.0166  0.0095  0.0180  0.0244  0.0316  0.0164   
3  0.1264  ...  0.0121  0.0036  0.0150  0.0085  0.0073  0.0050  0.0044   
4  0.4459  ...  0.0031  0.0054  0.0105  0.0110  0.0015  0.0072  0.0048   

       58      59  60  
0  0.0090  0.0032   R  
1  0.0052  0.0044   R  
2  0.0095  0.0078   

In [4]:
# Report the dimensions of the dataset as a (rows, columns) tuple
dataset_shape = sonar_data.shape
print(f"Dataset shape: {dataset_shape}")

Dataset shape: (208, 61)


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = sonar_data.describe()
print(summary_stats)

               0           1           2           3           4           5   \
count  208.000000  208.000000  208.000000  208.000000  208.000000  208.000000   
mean     0.029164    0.038437    0.043832    0.053892    0.075202    0.104570   
std      0.022991    0.032960    0.038428    0.046528    0.055552    0.059105   
min      0.001500    0.000600    0.001500    0.005800    0.006700    0.010200   
25%      0.013350    0.016450    0.018950    0.024375    0.038050    0.067025   
50%      0.022800    0.030800    0.034300    0.044050    0.062500    0.092150   
75%      0.035550    0.047950    0.057950    0.064500    0.100275    0.134125   
max      0.137100    0.233900    0.305900    0.426400    0.401000    0.382300   

               6           7           8           9   ...          50  \
count  208.000000  208.000000  208.000000  208.000000  ...  208.000000   
mean     0.121747    0.134799    0.178003    0.208259  ...    0.016069   
std      0.061788    0.085152    0.118387    0.1

In [6]:
# Class balance: number of samples per label in the target column (column 60)
label_counts = sonar_data[60].value_counts()
print(label_counts)

60
M    111
R     97
Name: count, dtype: int64


In [7]:
# Per-class mean of every feature, grouping the rows on the target column (column 60)
class_means = sonar_data.groupby(60).mean()
print(class_means)

          0         1         2         3         4         5         6   \
60                                                                         
M   0.034989  0.045544  0.050720  0.064768  0.086715  0.111864  0.128359   
R   0.022498  0.030303  0.035951  0.041447  0.062028  0.096224  0.114180   

          7         8         9   ...        50        51        52        53  \
60                                ...                                           
M   0.149832  0.213492  0.251022  ...  0.019352  0.016014  0.011643  0.012185   
R   0.117596  0.137392  0.159325  ...  0.012311  0.010453  0.009640  0.009518   

          54        55        56        57        58        59  
60                                                              
M   0.009923  0.008914  0.007825  0.009060  0.008695  0.006930  
R   0.008567  0.007430  0.007814  0.006677  0.007078  0.006024  

[2 rows x 60 columns]


In [8]:
# Split the frame into the label vector (column 60) and the feature matrix (every other column)
Y = sonar_data[60]
X = sonar_data.drop(columns=[60])

In [9]:
# Hold out 10% of the samples as a test set; stratify on Y so both parts
# keep the class ratio, and fix the seed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=1,
)

In [10]:
# Standardize the features: the scaler is fitted on the training split only,
# then that same fitted transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Hyperparameter search for Logistic Regression.
# (LogisticRegression and GridSearchCV are already imported in the first cell.)
#
# Every penalty/solver combination in this grid is valid: 'liblinear' supports
# l1 and l2, and 'saga' supports l1, l2 and elasticnet.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100],               # inverse regularization strength
    'solver': ['liblinear', 'saga'],
    'max_iter': [5000],                   # single fixed value (not tuned): generous iteration cap
    'class_weight': ['balanced']          # single fixed value (not tuned): applied to every candidate
}

# 5-fold cross-validated grid search, ranking candidates by accuracy.
# NOTE: X_train_scaled was standardized on the whole training split before
# this search, so each validation fold has influenced the scaling statistics
# and the fold scores are slightly optimistic. Wrapping the scaler and the
# model in a Pipeline would remove that leakage.
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),  # fixed seed for reproducibility
    param_grid,
    cv=5,
    scoring='accuracy'
)

# Fit the grid search to the scaled training data
grid_search.fit(X_train_scaled, Y_train)

# Display the best parameters
print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 5000, 'penalty': 'l2', 'solver': 'saga'}


In [18]:
# Best model: keep the winning estimator exposed by the grid search so the
# evaluation and prediction cells below can use it directly
best_logistic_model = grid_search.best_estimator_

In [19]:
# Accuracy of the tuned Logistic Regression model on the training and test splits,
# computed from explicit predictions
train_accuracy = accuracy_score(Y_train, best_logistic_model.predict(X_train_scaled))
test_accuracy = accuracy_score(Y_test, best_logistic_model.predict(X_test_scaled))
print(f"Logistic Regression - Training accuracy: {train_accuracy:.4f}")
print(f"Logistic Regression - Test accuracy: {test_accuracy:.4f}")

Logistic Regression - Training accuracy: 0.8824
Logistic Regression - Test accuracy: 0.7143


In [20]:
# Step 2: Trying different models (Random Forest and SVM)
# Random Forest Classifier: constructed and fitted in one expression
# (fit returns the estimator itself), then scored on both splits
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=1,
).fit(X_train_scaled, Y_train)
rf_train_accuracy = rf_model.score(X=X_train_scaled, y=Y_train)
rf_test_accuracy = rf_model.score(X=X_test_scaled, y=Y_test)
print(f"Random Forest - Training accuracy: {rf_train_accuracy:.4f}")
print(f"Random Forest - Test accuracy: {rf_test_accuracy:.4f}")

Random Forest - Training accuracy: 1.0000
Random Forest - Test accuracy: 0.7619


In [21]:
# Support Vector Machine (SVM): fit on the scaled training data,
# then measure accuracy on the training and test splits
svm_model = SVC(random_state=1, class_weight='balanced')
svm_model.fit(X=X_train_scaled, y=Y_train)
svm_train_accuracy, svm_test_accuracy = (
    svm_model.score(X_train_scaled, Y_train),
    svm_model.score(X_test_scaled, Y_test),
)
print(f"SVM - Training accuracy: {svm_train_accuracy:.4f}")
print(f"SVM - Test accuracy: {svm_test_accuracy:.4f}")

SVM - Training accuracy: 0.9679
SVM - Test accuracy: 0.9048


In [22]:
# Step 3: 5-fold cross-validation of the tuned Logistic Regression model
# on the scaled training split
cv_scores = cross_val_score(
    estimator=best_logistic_model,
    X=X_train_scaled,
    y=Y_train,
    cv=5,
)
print(f"Logistic Regression - Cross-validation scores: {cv_scores}")
print(f"Average CV score: {np.mean(cv_scores):.4f}")

Logistic Regression - Cross-validation scores: [0.78947368 0.78947368 0.75675676 0.78378378 0.86486486]
Average CV score: 0.7969


In [23]:
# Per-class precision / recall / F1 of the Logistic Regression model on the test split
Y_test_pred = best_logistic_model.predict(X_test_scaled)
logistic_report = classification_report(y_true=Y_test, y_pred=Y_test_pred)
print("\nLogistic Regression - Classification Report:")
print(logistic_report)


Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           M       0.73      0.73      0.73        11
           R       0.70      0.70      0.70        10

    accuracy                           0.71        21
   macro avg       0.71      0.71      0.71        21
weighted avg       0.71      0.71      0.71        21



In [24]:

# Step 5: Classify one hand-entered sample (60 feature values)
input_data = (0.0307, 0.0523, 0.0653, 0.0521, 0.0611, 0.0577, 0.0665, 0.0664, 
              0.1460, 0.2792, 0.3877, 0.4992, 0.4981, 0.4972, 0.5607, 0.7339, 
              0.8230, 0.9173, 0.9975, 0.9911, 0.8240, 0.6498, 0.5980, 0.4862, 
              0.3150, 0.1543, 0.0989, 0.0284, 0.1008, 0.2636, 0.2694, 0.2930, 
              0.2925, 0.3998, 0.3660, 0.3172, 0.4609, 0.4374, 0.1820, 0.3376, 
              0.6202, 0.4448, 0.1863, 0.1420, 0.0589, 0.0576, 0.0672, 0.0269, 
              0.0245, 0.0190, 0.0063, 0.0321, 0.0189, 0.0137, 0.0277, 0.0152, 
              0.0052, 0.0121, 0.0124, 0.0055)

# Turn the tuple into a 1-D array, then add a leading axis so it becomes
# a single-row 2-D array (one sample), which is what the scaler and model take
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array[np.newaxis, :]

# Apply the scaler that was fitted on the training split
input_data_scaled = scaler.transform(input_data_reshaped)

# Predict with the tuned Logistic Regression model
prediction = best_logistic_model.predict(input_data_scaled)

# Report the result: label 'R' means Rock, any other label is reported as Mine
prediction_message = 'The object is a Rock' if prediction[0] == 'R' else 'The object is a Mine'
print(prediction_message)

The object is a Mine


In [25]:
# Persist the fitted SVM classifier to disk (the call's return value is the cell output)
joblib.dump(value=svm_model, filename='best_svm_model.pkl')

['best_svm_model.pkl']

In [26]:
# Save the scaler
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']