# Credit Card Fraud

## Problem Statement
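Detect fraudulent credit card transactions. The objective is to train a binary classifier on the `Class` label, handle the class imbalance (addressed here with SMOTE oversampling), and evaluate the model on a held-out validation set using a classification report, a confusion matrix, and ROC-AUC.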

## Data Description
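The data is a CSV file of card transactions loaded into a pandas DataFrame. This notebook relies on the `Time` and `Amount` columns (which are standardized) and on the target column `Class`; every column other than `Class` is used as a model feature.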

In [3]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import xgboost as xgb


# Set display options
# max_columns=None removes the column cap, so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

ModuleNotFoundError: No module named 'xgboost'

## Data Loading and Exploration

In [None]:
# Load Dataset
# The CSV location is machine-specific, so it can be overridden with the
# CREDITCARD_CSV environment variable; the default keeps the original path
# so the notebook behaves as before when the variable is not set.
DATA_PATH = os.environ.get('CREDITCARD_CSV', '/Users/liangqunlu/Downloads/creditcard 2.csv')
data = pd.read_csv(DATA_PATH)

In [None]:
# Exploratory Data Analysis (EDA)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.describe())

# Number of missing values in each column
print(data.isna().sum())

# Preview of the first rows (shown as the cell's output value)
data.head()

## Exploratory Data Analysis (EDA)

In [None]:
# Class balance: count the rows per label and draw the counts as a bar chart
fraud_count = data['Class'].value_counts()
ax = fraud_count.plot.bar(title='Class Distribution', figsize=(10, 5))
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

# Correlation heatmap: pairwise correlations between all columns of the dataset
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Histogram (50 bins) of the 'Amount' column with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data['Amount'], kde=True, bins=50, ax=ax)
ax.set_title('Transaction Amount Distribution')
ax.set_xlabel('Amount')
ax.set_ylabel('Frequency')
plt.show()


## Feature Engineering

In [None]:
# Feature engineering code here

# Data Preprocessing
# Separate the target column from the predictors: `labels` is the target
# Series, `features` is every other column of `data`.
target_column = 'Class'
labels = data[target_column]
features = data.drop(columns=[target_column])

## Model Building

In [None]:
# Model building code here

# Split Data into Training and Validation Sets
# The split is done BEFORE scaling so that the validation rows never contribute
# to the scaler's mean/variance (fitting the scaler on the full dataset leaks
# validation information into training).
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

# Standardize 'Amount' and 'Time' features
# Fit on the training split only, then apply the same transform to validation.
# Explicit copies keep the assignments from touching `features` and avoid
# pandas chained-assignment warnings.
scaled_columns = ['Amount', 'Time']
scaler = StandardScaler()
X_train = X_train.copy()
X_valid = X_valid.copy()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_valid[scaled_columns] = scaler.transform(X_valid[scaled_columns])

# Handle Imbalance Using SMOTE
# Only the training split is oversampled; the validation split keeps its
# original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define XGBoost Classifier Model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)


In [None]:
# Train the Model
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on Validation Set
# Hard labels feed the classification report and confusion matrix; the
# probability column for class 1 feeds ROC-AUC.
y_pred = xgb_model.predict(X_valid)
y_pred_proba = xgb_model.predict_proba(X_valid)[:, 1]

# Evaluate Model Performance
print("Classification Report:\n", classification_report(y_valid, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
# ROC-AUC is a ranking metric and must be computed from continuous scores.
# Passing the thresholded 0/1 predictions collapses the curve to a single
# operating point and reports a different (usually lower) number than the
# area under the ROC curve built from predict_proba.
roc_auc = roc_auc_score(y_valid, y_pred_proba)
print(f'Validation ROC-AUC Score: {roc_auc:.2f}')

# Plot ROC Curve
# The curve is built from column 1 of predict_proba on the validation set;
# the legend shows the previously computed `roc_auc` value.
validation_scores = xgb_model.predict_proba(X_valid)[:, 1]
fpr, tpr, _ = roc_curve(y_valid, validation_scores)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fpr, tpr, color='b', label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='r', linestyle='--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
plt.show()

# Hyperparameter Tuning using Grid Search
# SMOTE runs inside an imbalanced-learn pipeline so that, for every CV split,
# only the training folds are oversampled and the held-out fold consists of
# real rows. Cross-validating on data that was resampled beforehand lets
# synthetic points interpolated from held-out rows reach the training folds,
# which inflates the CV ROC-AUC.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
tuning_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb_model),  # GridSearchCV clones the pipeline, so xgb_model itself is not refit
])
# Parameter names carry the 'classifier__' prefix to address the pipeline step.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 6, 9],
    'classifier__learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(estimator=tuning_pipeline, param_grid=param_grid, scoring='roc_auc', cv=skf, verbose=2)
# Fit on the un-resampled training split; oversampling happens per fold.
grid_search.fit(X_train, y_train)

# Best Parameters
print(f'Best Parameters: {grid_search.best_params_}')

# Train Final Model with Best Parameters
# GridSearchCV refits the best configuration on all data passed to fit()
# (refit=True is the default), so best_estimator_ is already trained and a
# second fit would only repeat that work.
final_model = grid_search.best_estimator_

# Predict on Validation Set with Final Model
y_final_pred = final_model.predict(X_valid)
y_final_proba = final_model.predict_proba(X_valid)[:, 1]

# Evaluate Final Model Performance
# ROC-AUC is computed from the class-1 probabilities, not from thresholded
# labels, so it measures ranking quality across all thresholds.
final_roc_auc = roc_auc_score(y_valid, y_final_proba)
print(f'Final Validation ROC-AUC Score: {final_roc_auc:.2f}')

# Final Classification Report
print("Final Classification Report:\n", classification_report(y_valid, y_final_pred))


## Analysis

In [None]:
# Your analysis code here

## Conclusions and Recommendations

[Your conclusions and recommendations here]