Import the dependencies used in the project

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split        # to split the train and test data
from sklearn.linear_model import LogisticRegression         # the logistic regression model
from sklearn.metrics import accuracy_score                  # to check the accuracy score of the model

Data collection and data processing

In [None]:
# Read the sonar CSV into a pandas DataFrame.
# header=None: the file has no header row, so the first line is treated as data.
sonar_data = pd.read_csv(
    "./sonar_data.csv",
    header=None,
)

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# preview is shown explicitly with display() (provided by Jupyter/IPython).
display(sonar_data.head())                                  # first 5 rows of the dataset

# get the number of rows and columns
rows, columns = sonar_data.shape

# summary statistics of the numeric columns (count, mean, std, min, quartiles, max)
sonar_data.describe()

In [None]:
# Count how many rows carry each distinct value of column 60 (the label column)
sonar_data.loc[:, 60].value_counts()

If the classes have very different numbers of samples, the dataset is imbalanced, which can bias the model toward the majority class.

In [None]:
# Mean of every feature column, computed separately for each label value
# (rows are grouped by the label stored in column 60)
sonar_data.groupby(by=60).mean()

In [None]:
# separate the data and the labels
# `columns=` already names the axis to drop from, so no `axis` argument is
# passed (the former `axis=0` pointed at rows and was ignored by pandas).
datas = sonar_data.drop(columns=60)                         # getting the data values (from column 0 to 59)
labels = sonar_data[60]                                     # getting the label values (from column 60)

Now split the data into training and test sets

In [None]:
# Split the features and labels into train and test subsets with sklearn.
datas_train, datas_test, labels_train, labels_test = train_test_split(
    datas,              # feature columns to split
    labels,             # label column to split, row-aligned with the features
    test_size=0.1,      # hold out 10% of the samples as the test set
    stratify=labels,    # keep the same label proportions in both subsets
    random_state=1,     # fixed seed so the shuffled split is reproducible
)

In [None]:
# Report the shapes of the full data/labels and of the train/test feature sets
print(
    f"Data shape: {datas.shape}     Labels shape: {labels.shape}\n"
    f"Data and label train shape: {datas_train.shape}     Data and label test shape: {datas_test.shape}"
)

Model training:
    We use the logistic regression model

In [None]:
# Instantiate the logistic regression classifier; no arguments are passed,
# so scikit-learn's default settings are used.
model = LogisticRegression()

In [None]:
# Fit the classifier on the training split (features as X, labels as y)
model.fit(X=datas_train, y=labels_train)

Model evaluation

In [None]:
# get the accuracy of the model on the training data
train_prediction = model.predict(datas_train)
# accuracy_score takes the true labels first, then the predictions
training_accuracy = accuracy_score(labels_train, train_prediction)
# report the computed score (not the raw prediction array)
print(f"Training accuracy score: {training_accuracy}")

In [None]:
# get the accuracy of the model on the test data
test_prediction = model.predict(datas_test)
# accuracy_score takes the true labels first, then the predictions
test_accuracy = accuracy_score(labels_test, test_prediction)
# report the computed test score (not the training predictions)
print(f"Test accuracy score: {test_accuracy}")

Making a prediction system

In [None]:
# Feature values of ONE sample to classify; fill this tuple in before running.
input_data = ()
# use a numpy array
numpy_input = np.asarray(input_data)

if numpy_input.size == 0:
    # The placeholder is still empty: reshaping it would give a (1, 0) array
    # and model.predict() would raise, breaking "Restart & Run All".
    print("No input data provided: fill `input_data` with the feature values of one sample.")
else:
    # reshape the numpy array as we predict one instance
    input_data_reshape = numpy_input.reshape(1, -1)
    prediction = model.predict(input_data_reshape)
    print(prediction)