<a href="https://www.kaggle.com/code/mohammedmohsen0404/bank-note-authentication-uci?scriptVersionId=188636186" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

---
**<center><h1>Bank Note Authentication UCI</h1></center>**
<center><h3>Learning ML, DL through 100 Practical Projects</h3></center>

---

This dataset originates from images of genuine and forged banknotes, captured using an industrial camera typically used for print inspection. The images have a resolution of approximately 660 dpi and are grayscale with dimensions of 400x400 pixels. Features extracted using Wavelet Transform from these images enable binary classification tasks. The objective is to develop machine learning models that accurately distinguish between genuine and forged banknotes based on these extracted features.

# **Import Libraries and Data**
---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download ritesaluja/bank-note-authentication-uci-data -f  BankNote_Authentication.csv

In [None]:
# Read the downloaded CSV. `D` stays as the untouched original, while every
# later step works on the independent copy `data`.
D = pd.read_csv('BankNote_Authentication.csv')
data = D.copy(deep=True)

# **Take a look at the data**
---

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Number of rows per value of the `class` column (the target used for modelling below).
data['class'].value_counts()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

# **Exploratory Data Analysis**
---

**Univariate Analysis**

In [None]:
# Keep only the numeric columns and draw one histogram per column.
numerical_data = data.select_dtypes(include=['number'])
numerical_data.hist(color='b', figsize=(10, 8))
plt.tight_layout()  # avoid overlapping subplot titles/labels
plt.show()

In [None]:
# Box plots of the numeric columns on one shared axis.
plt.figure(figsize=(10, 8))
sns.boxplot(data=numerical_data)
plt.show()

In [None]:
# One count plot per object-typed column; the loop does nothing when the
# frame has no such columns.
categorical_data = data.select_dtypes(include=['object'])
for column in categorical_data:
    sns.countplot(x=column, data=categorical_data, palette="Set1")
    plt.title(f"Countplot of {column}")
    plt.show()

**Multivariate Analysis**

In [None]:
# Pairwise scatter plots (and per-column distributions) of all numeric columns.
sns.pairplot(data=data.select_dtypes(include=['number']))
plt.show()

In [None]:
# Correlation matrix of the numeric columns, annotated with the coefficients.
sns.heatmap(data=numerical_data.corr(), cmap='coolwarm', annot=True)
plt.show()

# **Data Cleaning**
---

**Handling Duplicate Rows**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# `drop_duplicates()` returns a new frame rather than modifying `data` in
# place, so the result has to be assigned back; otherwise the duplicate rows
# stay in `data` and flow into every later step.
# `ignore_index=True` renumbers the remaining rows 0..n-1.
data = data.drop_duplicates(ignore_index=True)
# Display the de-duplicated frame as the cell output.
data

**Handling Missing Data**

In [None]:
# Per-column missing-value summary: absolute count and fraction of rows,
# both ordered from most to least missing.
total = data.isna().sum().sort_values(ascending=False)
percent = data.isna().mean().sort_values(ascending=False)  # mean of a boolean mask == missing / rows
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data.head(10)

In [None]:
# Grand total of missing cells across the whole frame.
total = data.isna().to_numpy().sum()
print('Total Null values =', total)

# **Data Preprocessing**
---

**Outliers**

In [None]:
from scipy import stats

# Absolute z-score of every cell, computed column-wise over all columns of `data`.
z_scores = np.abs(stats.zscore(data))
# A row counts as an outlier when any of its columns lies more than
# 3 standard deviations from that column's mean.
outliers = data.loc[(z_scores > 3).any(axis=1)]

print("Outliers using Z-score method:", outliers, sep="\n")

In [None]:
import matplotlib.pyplot as plt

# Box plot of the whole frame for a visual outlier check.
plt.figure(figsize=(10, 6))
plt.boxplot(data)
plt.title('Boxplot for Outlier Detection')
plt.show()

# Every column plotted against the row position, overlaid in a single figure.
plt.figure(figsize=(10, 6))
row_positions = range(len(data))
for column in data.columns:
    plt.scatter(row_positions, data[column], label=column)
plt.title('Scatter Plot for Outlier Detection')
plt.legend()
plt.show()


In [None]:
# Signed log transform: sign(x) * log(1 + |x|).
# A plain `np.log(data + 1)` is undefined for values <= -1 and produces
# NaN / -inf there. That happens silently here, because warnings are
# suppressed at the top of the notebook.
# The signed form is defined for every real value and equals log(x + 1)
# for all x >= 0, so non-negative columns are transformed exactly as before.
log_data = np.sign(data) * np.log1p(np.abs(data))

**Data splitting**

In [None]:
# Target is the `class` column; every other column is a feature.
y = data['class']
X = data.drop(columns=['class'])

In [None]:
# First split: 40% of the rows become the training set, the other 60% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.4, shuffle=True, random_state=101
)
# Second split of the held-out rows: 20% of them form the test set and the
# remaining 80% form the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.2, random_state=101
)

**Data Normalization**

In [None]:
# Histogram of the training features before scaling, to see their raw value ranges.
plt.hist (X_train)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transformation to the training, test and validation splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_val = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

# **Traditional models**
---

In [None]:
# (display name, unfitted estimator) pairs to compare. A fixed random_state
# is set on every estimator that accepts one here.
classifiers = list({
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(random_state=42),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}.items())

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation of every classifier on the full feature matrix.
# `X` holds the raw (unscaled) features, so each classifier is wrapped in a
# pipeline with its own StandardScaler:
#   * scale-sensitive models (logistic regression, KNN, SVM) are evaluated on
#     normalised features, as intended by the normalisation step;
#   * the scaler is re-fitted on the training part of every fold, so the
#     held-out fold never influences the scaling statistics.
for clf_name, clf in classifiers:
    scaled_clf = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(scaled_clf, X, y, cv=5)  # 5-fold cross-validation
    print(f'{clf_name}: Mean accuracy = {scores.mean():.2f}')


In [None]:
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the rows for testing; `X` / `y` are the feature frame and
# the `class` target built earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# This split is taken from the raw `X`, so the features have to be standardised
# again; otherwise every model below (and every later cell that reuses
# X_train / X_test) would silently work on unscaled data.
# The scaler is fitted on the training part only and then applied to the test part.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fresh, unfitted estimators to compare.
classifiers = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Support Vector Machine', SVC(random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# Fit each classifier on the training part and report its test-set quality.
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # The target is binary; 'weighted' averages the per-class F1 scores
    # weighted by how many test rows each class has.
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f'{clf_name}: F1 Score = {f1:.2f}')
    print(f'{clf_name} Classification Report:\n{classification_report(y_test, y_pred)}')
    print('---------------------------------------------------')


# **Deep Neural Network**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import *
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [None]:
# Fully connected binary classifier: 4 inputs -> 64 -> 28 -> 8 -> 1 (sigmoid).
model = keras.Sequential([
    Dense(units=64, activation='relu', input_dim=4),
    Dense(units=28, activation='relu'),
    Dense(units=8, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once the monitored quantity (Keras default) has failed to
# improve by at least 0.001 for 20 consecutive epochs, then restore the best
# weights seen so far.
early_stoping = keras.callbacks.EarlyStopping(
    patience=20, min_delta=0.001, restore_best_weights=True
)

In [None]:
# Train silently for at most 100 epochs. The last 25% of the training data is
# held back for validation and feeds the early-stopping callback.
note = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=24,
    validation_split=0.25,
    callbacks=[early_stoping],
    verbose=0,
)

In [None]:
# Per-epoch training history as a DataFrame (one column per tracked metric).
loss = pd.DataFrame(note.history)

# Training vs. validation loss across epochs.
loss[['loss', 'val_loss']].plot(figsize=(20, 6))
plt.title('Model Loss', fontsize=15)
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.show()

In [None]:
# Training vs. validation accuracy across epochs.
loss[['accuracy', 'val_accuracy']].plot(figsize=(20, 6))
plt.title('Model Accuracy', fontsize=15)
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.show()

In [None]:
# Sigmoid outputs for the test set, thresholded at 0.5:
# strictly greater than 0.5 -> class 1, otherwise class 0.
pred = (model.predict(X_test) > 0.5).astype(int)

In [None]:
# Precision / recall / F1 per class for the neural network on the test set.
print(classification_report(y_true=y_test, y_pred=pred))


In [None]:
# Confusion matrix of the neural network on the test set
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)

# **Clustering**

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Two clustering algorithms, both asked for 3 clusters.
algorithms = [
    ('KMeans', KMeans(n_clusters=3, random_state=33)),
    ('Agglomerative', AgglomerativeClustering(n_clusters=3)),
]

# Cluster the training features and score each result with three internal
# (label-free) quality measures.
for name, algorithm in algorithms:
    y_pred = algorithm.fit_predict(X_train)

    silhouette_avg = silhouette_score(X_train, y_pred)
    davies_bouldin = davies_bouldin_score(X_train, y_pred)
    calinski_harabasz = calinski_harabasz_score(X_train, y_pred)

    # The trailing empty string leaves a blank line between algorithms.
    print(
        f"Algorithm: {name}",
        f"Silhouette Score: {silhouette_avg:.4f}",
        f"Davies-Bouldin Index: {davies_bouldin:.4f}",
        f"Calinski-Harabasz Index: {calinski_harabasz:.4f}",
        "",
        sep="\n",
    )


In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans on `X` for k = 1 .. n-1 and record the inertia of
# each fit.
n = 8
ilist = []
for i in range(1, n):
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',   # 'random' is the alternative initialisation
        n_init=3,
        algorithm='lloyd',  # 'elkan' is the alternative algorithm
        random_state=33,
    )
    ilist.append(kmeans.fit(X).inertia_)

# Inertia versus number of clusters.
plt.plot(range(1, n), ilist)
plt.title('Elbow')
plt.xlabel('clusters')
plt.ylabel('inertias')
plt.show()
