In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score , precision_score , recall_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the credit-card transactions CSV from the Kaggle input directory.
csv_path = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Preview the last five rows.
df.tail(n=5)

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Count of distinct values in each column.
df.nunique(axis=0)

# EDA

In [None]:
# Correlation of every column with the target, largest value first.
corr_with_target = df.corr()['Class']
corr_with_target.sort_values(ascending=False)

In [None]:
# How many rows are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Drop repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
# Number of rows per class after de-duplication.
target_column = df['Class']
target_column.value_counts()

In [None]:
# Split the frame by class label: rows with Class == 1 and rows with Class == 0.
class_column = df['Class']
fraud = df.loc[class_column == 1]
non_fraud = df.loc[class_column == 0]

In [None]:
# Summary statistics of the transaction amount for the fraud rows.
fraud['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount for the non-fraud rows.
non_fraud['Amount'].describe()

# Class Distribution (Fraud vs Non-Fraud)

In [None]:
# Bar chart of the number of rows in each class.
# value_counts() orders its result by frequency, not by class value, so the
# tick labels and bar colours are looked up from the index instead of being
# assigned by position; the chart stays correct whichever class is larger.
label_by_class = {0: 'Non-Fraud', 1: 'Fraud'}
color_by_class = {0: 'skyblue', 1: 'orange'}

class_counts = df['Class'].value_counts()
class_labels = [label_by_class.get(cls, str(cls)) for cls in class_counts.index]
bar_colors = [color_by_class.get(cls, 'grey') for cls in class_counts.index]

plt.figure(figsize=(6, 4))
class_counts.plot(kind='bar', color=bar_colors)
plt.title('Class Distribution')
plt.xticks(ticks=range(len(class_labels)), labels=class_labels, rotation=0)
plt.ylabel('Count')
plt.show()

In [None]:
# Heatmap of the pairwise correlation matrix of all columns.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Histogram (50 bins, with KDE overlay) of the Time column over all rows.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['Time'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Time')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram (50 bins, with KDE overlay) of the Time column for fraud rows only.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(fraud['Time'], bins=50, kde=True, color='orange', ax=ax)
ax.set_title('Fraud Transactions by Time')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
plt.show()

# Outliers

In [None]:
# One boxplot per selected column, each drawn on its own figure.
features = ['Amount', 'V11', 'V2', 'V17', 'V4']
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[feature], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    ax.set_xlabel(feature)
    plt.show()

In [None]:
# Separate the target vector (Class) from the feature matrix (everything else).
y = df['Class']
X = df.drop(columns=['Class'])

In [None]:
# Bar chart of the number of rows in each class of the target vector.
# value_counts() orders its result by frequency, not by class value, so the
# tick labels and bar colours are looked up from the index instead of being
# assigned by position; the chart stays correct whichever class is larger.
label_by_class = {0: 'Non-Fraud', 1: 'Fraud'}
color_by_class = {0: 'skyblue', 1: 'orange'}

class_counts = y.value_counts()
class_labels = [label_by_class.get(cls, str(cls)) for cls in class_counts.index]
bar_colors = [color_by_class.get(cls, 'grey') for cls in class_counts.index]

plt.figure(figsize=(6, 4))
class_counts.plot(kind='bar', color=bar_colors)
plt.title('Class Distribution')
plt.xticks(ticks=range(len(class_labels)), labels=class_labels, rotation=0)
plt.ylabel('Count')
plt.show()

# Challenges of Imbalanced Data

Model Bias:

Most models optimize for overall accuracy, which means they may ignore the minority class entirely.
Example: A model predicting "Non-Fraud" for all transactions would achieve 99.8% accuracy but fail to identify fraud.

Metrics Misrepresentation:

Accuracy alone is not a reliable metric for imbalanced datasets.
Precision, recall, F1-score, and confusion matrix become more important.

Why Use stratify=y?

stratify=y ensures that the class distribution in the y target variable is maintained in both training and test sets.

Stratified Splitting:

Preserves the proportion of each class during the split.

Ensures the training and test sets are representative of the overall dataset.

Importance for Imbalanced Datasets:

Prevents underrepresented classes from being excluded in one of the subsets.

Leads to fair evaluation by maintaining consistent class proportions.

# Model

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# SMOTE (Synthetic Minority Oversampling Technique)
SMOTE is a resampling method used to address class imbalance in datasets by creating synthetic examples of the minority class.



# Benefits of Using SMOTE:

**Improves Model Performance on Imbalanced Data:**

- Balances the class distribution, helping the model learn patterns from the minority class.
- Reduces the risk of the model being biased toward the majority class.

**Reduces Overfitting Compared with Random Oversampling:**

- Unlike random oversampling, which duplicates existing data, SMOTE creates new, slightly varied data points, which reduces (but does not eliminate) the risk of overfitting to repeated minority samples.

**Maintains Feature Relationships:**

- Synthetic samples are interpolated between existing minority-class points, so they stay close to the observed feature space.

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class in the training split only; the test split
# is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Logistic Regression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic-regression baseline trained on the SMOTE-resampled training set.
# The features are passed in unscaled, and with the default max_iter=100 the
# solver can stop before converging; the resulting ConvergenceWarning would be
# hidden by the warnings filter set in the import cell. Standardising the
# features and allowing more iterations addresses both. LG is a pipeline, so
# LG.predict(...) applies the same scaling to whatever it is given.
LG = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
LG.fit(X_train_resampled, y_train_resampled)

In [None]:
# Logistic-regression predictions on the held-out test set and on the
# resampled training set. The test predictions are stored as `y_predTest`,
# which is the name the evaluation cells read (the previous `y_pred_Test`
# spelling was never used).
y_predTest = LG.predict(X_test)
y_predTrain = LG.predict(X_train_resampled)

In [None]:
# Logistic-regression predictions for the test set and the resampled training set.
y_predTest, y_predTrain = LG.predict(X_test), LG.predict(X_train_resampled)

In [None]:
# Accuracy on the resampled training set versus the untouched test set.
print("\nAccuracy Score:")
train_accuracy = accuracy_score(y_train_resampled, y_predTrain)
print(f"Train Accuracy: {train_accuracy}")
test_accuracy = accuracy_score(y_test, y_predTest)
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Recall on the resampled training set versus the untouched test set.
print("\nRecall Score:")
train_recall = recall_score(y_train_resampled, y_predTrain)
print(f"Train Recall: {train_recall}")
test_recall = recall_score(y_test, y_predTest)
print(f"Test Recall: {test_recall}")

In [None]:
# Per-class precision / recall / F1 on the test set.
print("\nClassification Report (Test):")
test_report = classification_report(y_test, y_predTest)
print(test_report)

In [None]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
print("Confusion Matrix (Test):")
test_confusion = confusion_matrix(y_test, y_predTest)
print(test_confusion)

# XGBClassifier

In [None]:
# Gradient-boosted tree classifier trained on the SMOTE-resampled training set.
# Row and column subsampling make training stochastic, so the seed is fixed
# explicitly, matching the random_state=42 used by the split, SMOTE and CV cells.
model = XGBClassifier(
    n_estimators=500,          # number of boosting rounds (trees)
    learning_rate=0.1,         # shrinkage applied to each tree's contribution
    max_depth=4,               # maximum depth of each tree
    min_child_weight=3,        # minimum sum of instance weight required in a child
    subsample=0.7,             # fraction of training rows sampled for each tree
    colsample_bytree=0.7,      # fraction of features sampled for each tree
    gamma=1,                   # minimum loss reduction required to make a split
    reg_alpha=2,               # L1 regularization strength
    reg_lambda=2,              # L2 regularization strength
    random_state=42            # fixed seed for reproducible subsampling
)
model.fit(X_train_resampled, y_train_resampled)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Cross-validate the same procedure that produced `model`: SMOTE followed by
# the XGBoost classifier, using the training split only. Wrapping both steps
# in an imblearn Pipeline makes SMOTE run inside each fold on that fold's
# training part, so validation folds contain only real rows and the held-out
# test set is never touched. cross_val_score works on clones of the pipeline,
# so the already-fitted `model` is left unchanged.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', model),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=skf, scoring='accuracy')

# Predictions of the fitted XGBoost model on the test set and on the
# resampled training set, read by the evaluation cells.
y_predTest = model.predict(X_test)
y_predTrain = model.predict(X_train_resampled)

# Evaluation

In [None]:
# Accuracy on the resampled training set, on the test set, and averaged over CV folds.
print("\nAccuracy Score:")
train_accuracy = accuracy_score(y_train_resampled, y_predTrain)
print(f"Train Accuracy: {train_accuracy}")
test_accuracy = accuracy_score(y_test, y_predTest)
print(f"Test Accuracy: {test_accuracy}")
mean_cv_accuracy = cv_scores.mean()
print(f"Cross-Validation Accuracy: {mean_cv_accuracy:.4f}")

In [None]:
# Precision on the resampled training set versus the untouched test set.
print("\nPrecision Score:")
train_precision = precision_score(y_train_resampled, y_predTrain)
print(f"Train Precision: {train_precision}")
test_precision = precision_score(y_test, y_predTest)
print(f"Test Precision: {test_precision}")


In [None]:
# Recall on the resampled training set versus the untouched test set.
print("\nRecall Score:")
train_recall = recall_score(y_train_resampled, y_predTrain)
print(f"Train Recall: {train_recall}")
test_recall = recall_score(y_test, y_predTest)
print(f"Test Recall: {test_recall}")


In [None]:
# Per-class precision / recall / F1 on the test set.
print("\nClassification Report (Test):")
test_report = classification_report(y_test, y_predTest)
print(test_report)


In [None]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
print("Confusion Matrix (Test):")
test_confusion = confusion_matrix(y_test, y_predTest)
print(test_confusion)
