In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
df = pd.read_csv("bill_authentication.csv.xls")
df

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [4]:
# Display statistical summary
print(df.describe())


          Variance     Skewness     Curtosis      Entropy        Class
count  1372.000000  1372.000000  1372.000000  1372.000000  1372.000000
mean      0.433735     1.922353     1.397627    -1.191657     0.444606
std       2.842763     5.869047     4.310030     2.101013     0.497103
min      -7.042100   -13.773100    -5.286100    -8.548200     0.000000
25%      -1.773000    -1.708200    -1.574975    -2.413450     0.000000
50%       0.496180     2.319650     0.616630    -0.586650     0.000000
75%       2.821475     6.814625     3.179250     0.394810     1.000000
max       6.824800    12.951600    17.927400     2.449500     1.000000


In [5]:
# Check for missing values
print(df.isna().sum())

Variance    0
Skewness    0
Curtosis    0
Entropy     0
Class       0
dtype: int64


In [6]:
# Split features and labels
X = df.drop('Class', axis=1)
y = df['Class']

In [7]:
# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.8, random_state=13)



In [9]:
# Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_lr = linear_model.predict(X_test)
accuracy_lr = np.mean((y_pred_lr > 0.5).astype(int) == y_test) * 100
print(f"Linear Regression Accuracy: {accuracy_lr:.2f}%")

Linear Regression Accuracy: 97.09%


In [10]:
# SGDClassifier Model
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)
y_pred_sgd = sgd_model.predict(X_test)
accuracy_sgd = accuracy_score(y_test, y_pred_sgd) * 100
print(f"SGDClassifier Accuracy: {accuracy_sgd:.2f}%")

SGDClassifier Accuracy: 99.64%


In [11]:
# GaussianNB Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100
print(f"GaussianNB Accuracy: {accuracy_nb:.2f}%")

GaussianNB Accuracy: 85.82%


In [12]:
# Decision Tree Model
dt_model = DecisionTreeClassifier()

In [13]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}


In [None]:
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)



Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [None]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

In [None]:
# Re-train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)



In [None]:
# Tree plotting
plt.figure(figsize=(40,30))
plot_tree(
    best_model,
    feature_names=['Variance','Skewness', 'Curtosis', 'Entropy'], 
    class_names=['Not authentic','Authentic'],
    filled=True
)
plt.show()



In [None]:
# Model prediction
y_pred_dt = best_model.predict(X_test)

In [None]:
# Model evaluation
acc_dt = accuracy_score(y_test, y_pred_dt)
cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_display_dt = ConfusionMatrixDisplay(confusion_matrix=cm_dt)
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_dt)
print('Decision Tree Accuracy:', acc_dt)
print(classification_report(y_test, y_pred_dt))
print('ROC AUC Score:', roc_auc_dt)



In [None]:
# Confusion Matrix Display
cm_display_dt.plot()
plt.show()



In [None]:
# ROC Curve
plt.figure()
plt.plot(fpr_dt, tpr_dt, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_dt)
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()



In [None]:
# SVC Model
svc_model = SVC(probability=True)
svc_model.fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc) * 100
print(f"SVC Accuracy: {accuracy_svc:.2f}%")


In [None]:
# Random Forest Classifier Model
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)
y_pred_rfc = rfc_model.predict(X_test)
accuracy_rfc = accuracy_score(y_test, y_pred_rfc) * 100
print(f"Random Forest Classifier Accuracy: {accuracy_rfc:.2f}%")

In [None]:
# Additional Visualization using Plotly
# Feature Distribution
fig = px.histogram(df, x='Class', text_auto=True, color='Class', title='Class Distribution').update_xaxes()
fig.show()


In [None]:

# Pairplot of Features
sns.pairplot(df, hue='Class')
plt.show()

In [None]:

# Feature Correlation Heatmap
plt.figure(figsize=(14, 12))  # Increase the figure size further
corr_matrix = df.corr()
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.title('Feature Correlation Heatmap')
plt.tight_layout()  # Use a tighter layout to make the most of the space available
plt.show()