In [None]:
!pip install snowflake-connector-python

In [None]:
import os
from getpass import getpass

import snowflake.connector
import pandas as pd

# Credentials are read from the environment so that no secret is ever stored
# in (or committed with) the notebook. The password falls back to an
# interactive prompt when SNOWFLAKE_PASSWORD is not set.
con = snowflake.connector.connect(
    user=os.environ['SNOWFLAKE_USER'],
    password=os.environ.get('SNOWFLAKE_PASSWORD') or getpass('Snowflake password: '),
    account=os.environ['SNOWFLAKE_ACCOUNT'],
    warehouse='COMPUTE_WH',
    database='CREDITCARDFRAUD',
    schema='PUBLIC'
)

In [None]:
# Pull the whole CREDITFRAUD table into a DataFrame.
query = 'select * from "CREDITFRAUD"'
try:
    df = pd.read_sql(query, con)
finally:
    # Release the connection even when the query fails.
    con.close()

# Bare last expression -> rich (HTML) rendering of the first rows.
df.head()

In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of rows per value of the CLASS label column.
df['CLASS'].value_counts()



**The classes are extremely imbalanced.**

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows per CLASS value, drawn on an explicit axes.
class_counts = df['CLASS'].value_counts()

fig, ax = plt.subplots()
class_counts.plot(kind='bar', ax=ax)
ax.set_title("Class Distribution (Normal vs Fraud)")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Descriptive statistics, transposed so each column of df becomes a row.
df.describe().T


**The conditional independence assumption of Naive Bayes is violated.**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of every column in df, shown as a heatmap.
corr = df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', linewidths=0.1, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Label is the CLASS column; features are every other column.
y = df['CLASS']
X = df.drop(columns='CLASS')

# 70/30 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Fit Gaussian Naive Bayes on the training split; fit() returns the estimator.
gnb = GaussianNB().fit(X_train, y_train)

# Hard predictions plus the score in column 1 of predict_proba
# (the second entry of gnb.classes_), both on the held-out split.
y_pred_nb = gnb.predict(X_test)
nb_proba = gnb.predict_proba(X_test)
y_prob_nb = nb_proba[:, 1]


In [None]:
# Naive Bayes on the held-out split: confusion matrix, per-class report, ROC-AUC.
cm_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:\n", cm_nb)

report_nb_text = classification_report(y_test, y_pred_nb)
print("\nClassification Report:\n", report_nb_text)

auc_nb = roc_auc_score(y_test, y_prob_nb)
print("ROC-AUC:", auc_nb)


**Logistic Regression shows convergence issues under class imbalance and differences in feature scale.**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the raw (unscaled) training features.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard predictions and column-1 probabilities on the held-out split.
y_pred_lr = lr.predict(X_test)
lr_proba = lr.predict_proba(X_test)
y_prob_lr = lr_proba[:, 1]

report_lr_text = classification_report(y_test, y_pred_lr)
print("\nLogistic Regression Report:\n", report_lr_text)

auc_lr = roc_auc_score(y_test, y_prob_lr)
print("ROC-AUC:", auc_lr)


**Although KNN does not make independence assumptions, it struggles under class imbalance because normal transactions dominate the local neighbourhoods.**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

report_knn_text = classification_report(y_test, y_pred_knn)
print("KNN Classification Report:\n", report_knn_text)


**Naive Bayes on Small Data**

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def evaluate_on_fraction(model, X_fit, y_fit, frac, X_eval=None, y_eval=None, seed=42):
    """Fit ``model`` on a stratified fraction of the training data and score it.

    NOTE(review): this helper was called but never defined in the notebook, so
    the cell failed with NameError on a fresh kernel. This is a reconstruction
    based on how the result is used below; confirm it matches the original intent.

    Parameters
    ----------
    model : unfitted scikit-learn classifier; it is fitted in place.
    X_fit, y_fit : training features and labels to subsample from.
    frac : float in (0, 1]; share of the training rows to keep.
    X_eval, y_eval : evaluation split; default to the notebook's X_test / y_test.
    seed : random_state for the subsample, so every model sees the same rows.

    Returns
    -------
    dict
        ``classification_report(..., output_dict=True)`` on the evaluation split.

    Raises
    ------
    ValueError
        If ``frac`` is not in the interval (0, 1].
    """
    if not 0 < frac <= 1:
        raise ValueError(f"frac must be in (0, 1], got {frac!r}")

    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    if frac < 1:
        # Stratify so the rare positive class is still present in small subsamples.
        X_sub, _, y_sub, _ = train_test_split(
            X_fit, y_fit, train_size=frac, random_state=seed, stratify=y_fit
        )
    else:
        X_sub, y_sub = X_fit, y_fit

    model.fit(X_sub, y_sub)
    y_hat = model.predict(X_eval)
    # zero_division=0 keeps the report well-defined if a class is never predicted.
    return classification_report(y_eval, y_hat, output_dict=True, zero_division=0)


fractions = [0.1, 0.3, 0.5, 0.9]

nb_recalls = []
lr_recalls = []

for frac in fractions:
    nb_report = evaluate_on_fraction(GaussianNB(), X_train, y_train, frac)
    lr_report = evaluate_on_fraction(LogisticRegression(max_iter=1000), X_train, y_train, frac)

    # Report keys are the stringified class labels; '1' is the positive class.
    nb_recalls.append(nb_report['1']['recall'])
    lr_recalls.append(lr_report['1']['recall'])

**As training data increases, Naive Bayes shows stable recall for fraud detection, whereas Logistic Regression increasingly favors the majority class**

In [None]:
import matplotlib.pyplot as plt

# Positive-class recall of both models against the training fraction used.
fig, ax = plt.subplots()
recall_curves = (
    (nb_recalls, 'Naive Bayes Recall'),
    (lr_recalls, 'Logistic Regression Recall'),
)
for curve_values, curve_label in recall_curves:
    ax.plot(fractions, curve_values, marker='o', label=curve_label)

ax.set_xlabel("Training Data Fraction")
ax.set_ylabel("Fraud Recall")
ax.set_title("Minority Class Recall vs Training Data Size")
ax.legend()
plt.show()

This research shows that while it may be acceptable to violate a model's assumptions, it is often worse to ignore important characteristics of the data, including class imbalance and sample size.

Although the features are strongly correlated, the Naive Bayes classifier still captures most of the fraud patterns because of its probabilistic framework and low-variance behaviour.

On the other hand, the flexibility of discriminative models is relative: they require more careful tuning and larger datasets to avoid suppressing minority-class signals.