In [None]:
import numpy as np
import imblearn as imb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides any convergence or deprecation warnings raised
# by the models fitted later in the notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from lightgbm import LGBMClassifier

# Plot styling shared by every figure in the notebook:
# seaborn's white-grid theme and a 10x6 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Load the transactions table from a CSV file in the working directory.
# NOTE(review): the rest of the notebook relies on the columns 'Time',
# 'Amount' and 'Class' being present — confirm the dataset source/version.
df = pd.read_csv("creditcard.csv")

In [None]:
# First look at the raw rows.
df.head()

In [None]:
# Remove the column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Same preview again, now that the column display limit has been removed.
df.head()

In [None]:
# Last rows of the raw frame.
df.tail()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
df.isna().sum().sum()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise 'Amount' in place with a freshly fitted StandardScaler.
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation/test split, so held-out rows influence its statistics.
# It is also not written to the pickle file later on, so anyone serving the
# saved model has to reproduce this scaling separately.
# Re-running this cell scales the already-scaled column again.
sc = StandardScaler()
df['Amount'] = sc.fit_transform(df[['Amount']])

In [None]:
# 'Time' is not used as a feature; remove the column.
df = df.drop(columns=['Time'])

In [None]:
# Check that 'Time' is gone and 'Amount' now holds scaled values.
df.head()

In [None]:
# True if at least one row is an exact repeat of an earlier row.
df.duplicated().any()

In [None]:
# Drop repeated rows, keeping the first occurrence of each.
# 'Time' was removed in an earlier cell, so rows that differed only in 'Time'
# count as duplicates here.
df = df.drop_duplicates()

In [None]:
# Shape after removing 'Time' and the duplicate rows.
df.shape

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Class balance: number of rows for each value of the target column 'Class'.
df['Class'].value_counts()

In [None]:
# Rows as they look after scaling, column removal and de-duplication.
df.head()

In [None]:
# The target is very imbalanced; plot the per-class row counts to show it.
class_counts = df['Class'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(class_counts.index, class_counts.values)
ax.set_title('Count of Classes (Matplotlib Bar Plot)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Disabled alternative view: pie chart of the class distribution.
# class_counts = df['Class'].value_counts()
# plt.figure(figsize=(8, 8))
# plt.pie(class_counts, labels=class_counts.index, autopct='%1.2f%%', startangle=90)
# plt.title('Distribution of Classes (Pie Chart)')
# plt.show()

In [None]:
# Hold out 20% of the rows as the final test set.
# Stratify on 'Class': the target is heavily imbalanced, and an unstratified
# split can leave one part with too few positive rows to evaluate reliably.
df_full_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Class']
)

In [None]:
# Split the remaining rows 75/25 into training and validation sets
# (25% of the remaining 80% is 20% of the full data).
# Stratify on 'Class' so both parts keep the same share of the rare class.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
    stratify=df_full_train['Class'],
)

In [None]:
# Row counts of the three splits (train, validation, test) — roughly 60% / 20% / 20%.
len(df_train), len(df_val), len(df_test)

In [None]:
# Target series, feature frame, and the ordered list of feature column names.
y = df['Class']
X = df.drop(columns='Class')
X_columns = list(X.columns)

In [None]:
# One dict per row, feature columns only, for the training and validation splits.
# NOTE(review): both names are assigned again in a later cell before they are used.
train_dicts, val_dicts = (
    part[X_columns].to_dict(orient = 'records') for part in (df_train, df_val)
)

In [None]:
# Targets of the three splits as flat (1D) NumPy arrays.
y_train, y_val, y_test = (
    part['Class'].values.ravel() for part in (df_train, df_val, df_test)
)

# Feature vectorization (one-hot encoding via DictVectorizer)

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Vectorizer that turns lists of feature dicts into a dense matrix (sparse=False).
dv = DictVectorizer(sparse = False)

In [None]:
# One dict per row for each split, restricted to the feature columns.
# The 'Class' column must stay out of these dicts: converting the whole frame
# would hand the target to the models as an input feature (target leakage) and
# make every score computed downstream meaningless.
train_dicts = df_train[X_columns].to_dict(orient = 'records')
val_dicts = df_val[X_columns].to_dict(orient = 'records')
test_dicts = df_test[X_columns].to_dict(orient = 'records')

In [None]:
# Learn the feature-name -> column mapping from the training dicts only,
# then apply that same mapping to the validation and test dicts.
X_train = dv.fit_transform(train_dicts)
X_val, X_test = (dv.transform(records) for records in (val_dicts, test_dicts))

In [None]:
# Inspect the vectorized training matrix.
X_train

In [None]:
# Inspect the vectorized validation matrix.
X_val

In [None]:
# Inspect the vectorized test matrix.
X_test

# Modelling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report, 
    confusion_matrix, 
    roc_auc_score, 
    precision_recall_curve, 
    auc
)
import xgboost as xgb

In [None]:
# Candidate models compared on the validation split, keyed by display name.
# random_state pins the randomness of the tree-based models so reruns give the
# same scores. max_iter is raised for logistic regression because warnings are
# suppressed in this notebook, so a solver stopping at the default iteration
# cap would otherwise go unnoticed.
classifier = {
    "logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}
# deleted conditions for 3 models:
# random_state=1
# max_depth=10, random_state=1
# n_estimators=10, random_state=1, n_jobs=-1

In [None]:
# Fit every candidate on the training split and report validation metrics.
for name, clf in classifier.items():
    print(f'\n==========={name}===========')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    # Score of the positive class (second column of predict_proba, label 1).
    # Ranking metrics such as ROC AUC need continuous scores; feeding them the
    # hard 0/1 predictions collapses the curve to a single operating point.
    y_score = clf.predict_proba(X_val)[:, 1]
    print(f'\n Accuracy: {accuracy_score(y_val, y_pred)}')
    print(f'\n classification_report: {classification_report(y_val, y_pred)}')
    print(f'\n confusion_matrix: {confusion_matrix(y_val, y_pred)}')
    print(f'\n roc_auc_score: {roc_auc_score(y_val, y_score)}')
    # Area under the precision-recall curve: more informative than accuracy
    # or ROC AUC when the positive class is rare.
    precision, recall, _ = precision_recall_curve(y_val, y_score)
    print(f'\n pr_auc: {auc(recall, precision)}')


In [None]:
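# The decision tree and the random forest reach the same validation accuracy,
# so this project proceeds with the decision tree model for simplicity.
# Caveat: accuracy alone is a weak criterion on data this imbalanced; compare
# the per-class precision and recall in the reports above as well.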

# DECISION TREE MODEL

In [None]:
# Final model: a decision tree limited to depth 10, fitted on the training split.
# random_state makes the fitted tree (and the pickled artifact built from it)
# reproducible across reruns.
dtc = DecisionTreeClassifier(max_depth = 10, random_state = 42)
dtc.fit(X_train, y_train)

In [None]:
# Confusion matrix of the decision tree `dtc` on the validation split.
# Predict with `dtc` here: the `y_pred` left over from the comparison loop
# holds the predictions of the last model in that loop, not of this tree.
y_val_pred = dtc.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)
plt.figure(figsize=(15,10))
# fmt='d' prints whole counts instead of the default shortened float format.
sns.heatmap(cm, annot = True, fmt = 'd')
plt.title('Decision tree: validation confusion matrix')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

In [None]:
# Predict the held-out test split with the decision tree.
# This rebinds `y_pred`, which until now held validation-set predictions
# from the model comparison loop.
y_pred = dtc.predict(X_test)

In [None]:
# Test-set evaluation of the decision tree: accuracy, confusion matrix,
# and the per-class precision/recall report.
accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")
print(test_confusion)
print(test_report)

In [None]:
### Create a Pickle file using serialization
import pickle

# Two consecutive dumps into one file: the vectorizer first, then the model.
# Readers must call pickle.load twice, in the same order, to get both back.
# The with-block closes the file even if one of the dumps raises.
with open("credit_card_fraud_detection.pkl", "wb") as pickle_out:
    pickle.dump(dv, pickle_out)
    pickle.dump(dtc, pickle_out)

# Validating PKL file

In [None]:
import pickle

# The file holds two pickles written back to back (vectorizer, then model),
# so it takes two loads, in that order, to read everything. A single load
# returns only the vectorizer and leaves the model unchecked.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('credit_card_fraud_detection.pkl', 'rb') as f:
    loaded_dv = pickle.load(f)
    loaded_model = pickle.load(f)

# The first object used to be exposed under this name; keep it available.
loaded_object = loaded_dv

print(f"Type of loaded object: {type(loaded_dv)}")
print(f"Type of loaded object: {type(loaded_model)}")

# Probe attributes these two objects actually have: the vectorizer's feature
# names and the fitted tree's input width and depth.
n_vectorizer_features = len(loaded_dv.feature_names_)
print(f"Vectorizer feature count: {n_vectorizer_features}")
print(f"Model input feature count: {loaded_model.n_features_in_}")
print(f"Model tree depth: {loaded_model.get_depth()}")

# The vectorizer's output width must match what the model was trained on,
# otherwise the pair cannot be used together for prediction.
assert n_vectorizer_features == loaded_model.n_features_in_, (
    "vectorizer and model disagree on the number of features"
)