In [1]:
# Libraries used by the notebook.
# NOTE(review): numpy, seaborn and matplotlib are not referenced in the cells
# visible here -- confirm they are needed further down before keeping them.
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Installs a blanket "ignore" filter: with no category/module argument this
# suppresses every warning for the rest of the session.
# NOTE(review): this may also hide useful diagnostics from later cells
# (e.g. deprecation or solver warnings) -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the banknote CSV; the path is relative to the notebook's working directory
csv_path = 'Data/BankNote_Authentication.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows
df.head(5)

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(1372, 5)

In [4]:
# Names of the columns that contain at least one null/NaN value
# (an empty list means the data has no missing entries)
df.columns[df.isnull().any()].tolist()

[]

In [5]:
# Number of null/NaN entries in each column
df.isna().sum()

variance    0
skewness    0
curtosis    0
entropy     0
class       0
dtype: int64

In [6]:
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1372 entries, 0 to 1371
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   variance  1372 non-null   float64
 1   skewness  1372 non-null   float64
 2   curtosis  1372 non-null   float64
 3   entropy   1372 non-null   float64
 4   class     1372 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,class
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,0.433735,1.922353,1.397627,-1.191657,0.444606
std,2.842763,5.869047,4.31003,2.101013,0.497103
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.773,-1.7082,-1.574975,-2.41345,0.0
50%,0.49618,2.31965,0.61663,-0.58665,0.0
75%,2.821475,6.814625,3.17925,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [8]:
# Number of rows per label in the target column 'class' (class-balance check)
df['class'].value_counts()

0    762
1    610
Name: class, dtype: int64

In [9]:
# Positional split of the frame:
#   X -> independent variables (every column except the last)
#   y -> dependent variable (the last column)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [10]:
# Preview the first five rows of the feature frame X
X.head()

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.6216,8.6661,-2.8073,-0.44699
1,4.5459,8.1674,-2.4586,-1.4621
2,3.866,-2.6383,1.9242,0.10645
3,3.4566,9.5228,-4.0112,-3.5944
4,0.32924,-4.4552,4.5718,-0.9888


In [11]:
# Preview the first five values of the target series y
y.head()

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

In [12]:
# Divide the dataset into train (80%) and test (20%) sets
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split -- and therefore the fitted model,
# the reported metrics and the exported test set -- is identical on every
# Restart & Run All. The value 42 is arbitrary; any fixed integer works.
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [13]:
# Estimator class used for the classifier
from sklearn.linear_model import LogisticRegression

# Build a default-configured model and train it on the training partition
# (fit returns the estimator itself, so the call can be chained)
clf = LogisticRegression().fit(X_train, y_train)
# Show the fitted estimator as the cell output
clf

LogisticRegression()

In [14]:
# Predicted labels for the training and the held-out test partitions
pred_train, pred_test = clf.predict(X_train), clf.predict(X_test)

In [15]:
# Evaluation helpers from scikit-learn
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Build the text reports (precision / recall / f1 / support) for both partitions
train_clf_report, test_clf_report = (
    classification_report(y_train, pred_train),
    classification_report(y_test, pred_test),
)
# Only the training-set report is printed in this cell
print(train_clf_report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       609
           1       0.99      1.00      0.99       488

    accuracy                           0.99      1097
   macro avg       0.99      0.99      0.99      1097
weighted avg       0.99      0.99      0.99      1097



In [17]:
# Print the test-set classification report built from y_test and pred_test
print(test_clf_report)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       153
           1       0.98      0.98      0.98       122

    accuracy                           0.98       275
   macro avg       0.98      0.98      0.98       275
weighted avg       0.98      0.98      0.98       275



In [18]:
# Serialize the trained classifier to 'clf.pkl' with pickle
import pickle

# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked (the manual open/close pair would skip close() on error).
with open('clf.pkl', 'wb') as pickle_out:
    # Dump the fitted model object
    pickle.dump(clf, pickle_out)

In [19]:
# Save the test features to CSV so the pickled model can be tried on them later.
# One path variable is used for both the write and the read-back so the two
# cannot drift apart.
test_csv_path = 'test.csv'
# index=False is the documented way to omit the row index from the file
X_test.to_csv(test_csv_path, index=False)
# Read the file back to confirm what was written
test_df = pd.read_csv(test_csv_path)
# First five records of the exported test set
test_df.head()

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.583,-3.7971,3.4391,-0.12501
1,-3.8203,-13.0551,16.9583,-2.3052
2,4.1757,10.2615,-3.8552,-4.3056
3,-5.525,6.3258,0.89768,-6.6241
4,-0.49948,1.7734,-2.2469,-0.68104
