In [1]:
# Mounting Google Drive so the notebook can read the project data stored there
# (the dataset is read further down from a folder under this mount point)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Importing some useful libraries and scripts

import os # For creating the output folder before saving the model
import pickle # For model saving

import numpy as np # For numerical tasks
import pandas as pd # For dataframe related tasks

# A module for model training
from sklearn.linear_model import LogisticRegression

# A module for accuracy score checking
from sklearn.metrics import accuracy_score

# For train test spliting while training our model
from sklearn.model_selection import train_test_split

In [3]:
# Folder on the mounted Google Drive where all of the project data is located.
# The path is anchored to the '/content/drive' mount point so that it resolves
# regardless of the kernel's current working directory.

data_folder_path = '/content/drive/MyDrive/AI_project_data'

In [4]:
# Load the sonar dataset into a DataFrame and preview its first rows.
# The CSV has no header row, so header=None lets pandas number the columns.

csv_path = data_folder_path + '/sonar_data.csv'
df = pd.read_csv(csv_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [5]:
# Count the samples per class label (column 60) to check the class balance
# From the result we can see that the two classes are roughly balanced

label_counts = df[60].value_counts()
label_counts

M    111
R     97
Name: 60, dtype: int64

In [6]:
# Mean of every feature column within each target class (column 60)

grouped_by_label = df.groupby(60)
grouped_by_label.mean()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
60,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M,0.034989,0.045544,0.05072,0.064768,0.086715,0.111864,0.128359,0.149832,0.213492,0.251022,...,0.019352,0.016014,0.011643,0.012185,0.009923,0.008914,0.007825,0.00906,0.008695,0.00693
R,0.022498,0.030303,0.035951,0.041447,0.062028,0.096224,0.11418,0.117596,0.137392,0.159325,...,0.012311,0.010453,0.00964,0.009518,0.008567,0.00743,0.007814,0.006677,0.007078,0.006024


In [7]:
# Total number of missing values in the dataset:
# count the NaNs per column, then add those counts together

missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [8]:
# Separate the features (every column except 60) from the target (column 60)
# and convert both to NumPy arrays, then show their shapes

X = np.array(df.drop(columns=60))
y = np.array(df[60])
print(X.shape, y.shape, sep='\n')

(208, 60)
(208,)


In [9]:
# Train/test split with an 80/20 ratio.
# stratify=y keeps the M/R class proportions the same in the training and
# the test subsets; random_state fixes the shuffle so the split is repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [10]:
# Instantiating the logistic regression model with its default settings
# (no arguments are passed to the constructor) and training it on the training split.
# The fit call is the last expression, so the notebook displays the fitted estimator.

lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [11]:
# Checking the training set accuracy

pred = lr.predict(X_train)
# accuracy_score expects the true labels first, then the predictions
accuracy = round(accuracy_score(y_train,pred),2)
# Format as a percentage instead of int(accuracy * 100): int() truncates,
# so e.g. 0.57 * 100 == 56.99999999999999 would be reported as 56%
print(f'Accuracy of the training set is: {accuracy:.0%}')

Accuracy of the training set is: 84%


In [12]:
# Checking the test set accuracy to check if there is any overfitting
# (a test score far below the training score would indicate it)

pred = lr.predict(X_test)
# accuracy_score expects the true labels first, then the predictions
accuracy = round(accuracy_score(y_test,pred),2)
# Format as a percentage instead of int(accuracy * 100): int() truncates,
# so e.g. 0.57 * 100 == 56.99999999999999 would be reported as 56%
print(f'Accuracy of the test set is: {accuracy:.0%}')

Accuracy of the test set is: 86%


In [13]:
# Take the first few rows of the test set and predict their labels

sample_size = 5
check_data = X_test[:sample_size]
prediction = lr.predict(check_data)

In [14]:
# Count how many of the sampled predictions match the true labels
# (1 marks a correct prediction, 0 an incorrect one)

accurate_results = (prediction == y_test[:5]).astype(int)
hits, total = accurate_results.sum(), accurate_results.size
print('The model accurately predicted {} out of {}'.format(hits, total))

The model accurately predicted 4 out of 5


In [17]:
# Saving the trained model to disk with pickle.
# NOTE: the file contains a pickle stream even though its name ends in '.h5';
# load it back with pickle.load, and only from a source you trust.

file_name = 'models/rock_vs_mine_model.h5'

# open() does not create missing folders, so make sure the target folder exists
model_dir = os.path.dirname(file_name)
if model_dir:
  os.makedirs(model_dir, exist_ok=True)

with open(file_name,'wb') as f:
  pickle.dump(lr,f)