{ "cells": [ { "cell_type": "markdown", "id": "fraud-analysis-title",
"metadata": {}, "source": [
  "# Fraud Detection - Exploratory Data Analysis\n",
  "\n",
  "This notebook provides exploratory data analysis for the fraud detection system.\n",
  "\n",
  "## Contents\n",
  "1. Data Loading and Overview\n",
  "2. Fraud Distribution Analysis\n",
  "3. Feature Analysis (amounts, merchant categories, transaction timing patterns)\n",
  "4. Geographic Analysis\n",
  "5. Correlation Analysis\n",
  "6. Model Performance Analysis\n",
  "7. Business Impact Analysis\n",
  "8. Key Insights and Recommendations"
] }, {
"cell_type": "code", "execution_count": null, "id": "imports",
"metadata": {}, "outputs": [], "source": [
  "# Data handling\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "\n",
  "# Plotting\n",
  "import matplotlib.pyplot as plt\n",
  "import plotly.express as px\n",
  "import plotly.graph_objects as go\n",
  "import seaborn as sns\n",
  "from plotly.subplots import make_subplots\n",
  "\n",
  "# Plot styling\n",
  "plt.style.use('seaborn-v0_8')\n",
  "sns.set_palette(\"husl\")\n",
  "\n",
  "# pandas display: both options are set to None (no explicit limit)\n",
  "for display_option in ('display.max_columns', 'display.width'):\n",
  "    pd.set_option(display_option, None)\n",
  "\n",
  "print(\"✅ Imports successful\")"
] }, { "cell_type": "markdown", "id": "data-loading",
"metadata": {}, "source": [ "## 1. Data Loading and Overview" ] }, {
"cell_type": "code", "execution_count": null, "id": "load-data",
"metadata": {}, "outputs": [], "source": [
  "# Build a synthetic dataset with the project's generator.\n",
  "# Note: no data file is looked up here; the data is always generated.\n",
  "import sys\n",
  "\n",
  "# Make ../src importable. Guarded so that re-running this cell does not\n",
  "# keep appending duplicate entries to sys.path.\n",
  "SRC_DIR = '../src'\n",
  "if SRC_DIR not in sys.path:\n",
  "    sys.path.append(SRC_DIR)\n",
  "\n",
  "from data.data_generator import FraudDataGenerator\n",
  "\n",
  "# seed=42 is passed to the generator (presumably for reproducible output;\n",
  "# confirm in data_generator).\n",
  "generator = FraudDataGenerator(seed=42)\n",
  "df = generator.generate_dataset(n_samples=10000, n_users=1000)\n",
  "\n",
  "print(f\"Dataset shape: {df.shape}\")\n",
  "print(f\"Fraud rate: {df['is_fraud'].mean():.3%}\")\n",
  "\n",
  "df.head()"
] }, { "cell_type":
"code", "execution_count": null, "id": "data-info", "metadata": {},
"outputs": [], "source": [
  "# Dataset overview: row count, user/merchant cardinality and time span\n",
  "summary_lines = [\n",
  "    \"Dataset Info:\",\n",
  "    f\"Total transactions: {len(df):,}\",\n",
  "    f\"Unique users: {df['user_id'].nunique():,}\",\n",
  "    f\"Unique merchants: {df['merchant_id'].nunique():,}\",\n",
  "    f\"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}\",\n",
  "]\n",
  "print(\"\\n\".join(summary_lines))\n",
  "\n",
  "# Null count per column\n",
  "missing_per_column = df.isna().sum()\n",
  "print(\"\\nMissing values:\")\n",
  "print(missing_per_column)"
] }, { "cell_type": "markdown", "id":
"fraud-distribution", "metadata": {}, "source": [ "## 2. Fraud Distribution Analysis" ] },
{ "cell_type": "code", "execution_count":
null, "id": "fraud-overview", "metadata": {}, "outputs": [], "source": [
  "# Fraud distribution\n",
  "# value_counts() orders classes by frequency, while the labels and colours\n",
  "# below are applied by position. Pin the order to [0, 1] so 'Normal'/'Fraud'\n",
  "# can never be swapped, and so a class with no rows shows up as 0 instead of\n",
  "# raising a KeyError. Assumes is_fraud is coded 0/1, as the rest of the\n",
  "# notebook does.\n",
  "fraud_counts = df['is_fraud'].value_counts().reindex([0, 1], fill_value=0)\n",
  "fraud_pct = fraud_counts / fraud_counts.sum() * 100\n",
  "\n",
  "class_labels = ['Normal', 'Fraud']\n",
  "class_colors = ['green', 'red']\n",
  "\n",
  "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n",
  "\n",
  "# Count plot\n",
  "fraud_counts.plot(kind='bar', ax=ax1, color=class_colors)\n",
  "ax1.set_title('Fraud Distribution (Count)')\n",
  "ax1.set_xlabel('Is Fraud')\n",
  "ax1.set_ylabel('Count')\n",
  "ax1.set_xticklabels(class_labels, rotation=0)\n",
  "\n",
  "# Percentage pie chart\n",
  "ax2.pie(fraud_pct.values, labels=class_labels, autopct='%1.2f%%',\n",
  "        colors=class_colors)\n",
  "ax2.set_title('Fraud Distribution (Percentage)')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()\n",
  "\n",
  "print(f\"Normal transactions: {fraud_counts.loc[0]:,} ({fraud_pct.loc[0]:.2f}%)\")\n",
  "print(f\"Fraudulent transactions: {fraud_counts.loc[1]:,} ({fraud_pct.loc[1]:.2f}%)\")"
] }, { "cell_type":
"markdown", "id": "feature-analysis", "metadata": {}, "source": [ "## 3. Feature Analysis" ] },
{ "cell_type": "code", "execution_count":
null, "id": "amount-analysis", "metadata": {}, "outputs": [],
"source": [
  "# Amount analysis: 2x2 grid of amount- and category-level views\n",
  "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
  "\n",
  "# Amount distribution by fraud status\n",
  "df.boxplot(column='amount', by='is_fraud', ax=axes[0,0])\n",
  "# boxplot(by=...) also sets a figure-level 'Boxplot grouped by ...' title,\n",
  "# which would sit above the whole grid; clear it.\n",
  "fig.suptitle('')\n",
  "axes[0,0].set_title('Amount Distribution by Fraud Status')\n",
  "axes[0,0].set_xlabel('Is Fraud')\n",
  "axes[0,0].set_ylabel('Amount')\n",
  "\n",
  "# Log amount distribution\n",
  "# NOTE: this adds a 'log_amount' column to df, so later cells see it too.\n",
  "df['log_amount'] = np.log1p(df['amount'])\n",
  "axes[0,1].hist(df.loc[df['is_fraud'] == 0, 'log_amount'], alpha=0.7, label='Normal', bins=50)\n",
  "axes[0,1].hist(df.loc[df['is_fraud'] == 1, 'log_amount'], alpha=0.7, label='Fraud', bins=50)\n",
  "axes[0,1].set_title('Log Amount Distribution')\n",
  "axes[0,1].set_xlabel('log(1 + amount)')\n",
  "axes[0,1].legend()\n",
  "\n",
  "# Fraud rate by merchant category (highest rate first)\n",
  "category_fraud = df.groupby('merchant_category')['is_fraud'].agg(['count', 'mean'])\n",
  "category_fraud.columns = ['transaction_count', 'fraud_rate']\n",
  "category_fraud = category_fraud.sort_values('fraud_rate', ascending=False)\n",
  "\n",
  "category_fraud['fraud_rate'].plot(kind='bar', ax=axes[1,0])\n",
  "axes[1,0].set_title('Fraud Rate by Merchant Category')\n",
  "axes[1,0].set_ylabel('Fraud Rate')\n",
  "axes[1,0].tick_params(axis='x', rotation=45)\n",
  "\n",
  "# Amount statistics table\n",
  "amount_stats = df.groupby('is_fraud')['amount'].describe()\n",
  "# Derive row labels from the groups actually present, so the table still\n",
  "# renders (with correct labels) when only one class exists.\n",
  "row_labels = [{0: 'Normal', 1: 'Fraud'}.get(key, str(key)) for key in amount_stats.index]\n",
  "axes[1,1].axis('off')\n",
  "axes[1,1].table(cellText=amount_stats.round(2).values,\n",
  "                rowLabels=row_labels,\n",
  "                colLabels=amount_stats.columns,\n",
  "                cellLoc='center',\n",
  "                loc='center')\n",
  "axes[1,1].set_title('Amount Statistics by Fraud Status')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()"
] },
{ "cell_type": "code", "execution_count": null, "id":
"temporal-analysis", "metadata": {}, "outputs": [], "source": [
  "# Temporal analysis: derive hour / weekday columns from the timestamp\n",
  "df['timestamp'] = pd.to_datetime(df['timestamp'])\n",
  "ts = df['timestamp'].dt\n",
  "df['hour'] = ts.hour\n",
  "df['day_of_week'] = ts.day_name()\n",
  "df['is_weekend'] = ts.dayofweek >= 5  # dayofweek: Monday=0 ... Sunday=6\n",
  "\n",
  "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
  "(ax_hour, ax_day), (ax_weekend, ax_volume) = axes\n",
  "\n",
  "# Fraud rate by hour\n",
  "hourly_fraud = df.groupby('hour')['is_fraud'].mean()\n",
  "hourly_fraud.plot.line(marker='o', ax=ax_hour)\n",
  "ax_hour.set(title='Fraud Rate by Hour of Day', xlabel='Hour', ylabel='Fraud Rate')\n",
  "ax_hour.grid(True)\n",
  "\n",
  "# Fraud rate by day of week, in calendar order rather than alphabetical\n",
  "day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
  "daily_fraud = df.groupby('day_of_week')['is_fraud'].mean().reindex(day_order)\n",
  "daily_fraud.plot.bar(ax=ax_day)\n",
  "ax_day.set(title='Fraud Rate by Day of Week', xlabel='Day of Week', ylabel='Fraud Rate')\n",
  "ax_day.tick_params(axis='x', rotation=45)\n",
  "\n",
  "# Weekend vs weekday\n",
  "weekend_fraud = df.groupby('is_weekend')['is_fraud'].mean()\n",
  "weekend_fraud.plot.bar(ax=ax_weekend)\n",
  "ax_weekend.set(title='Fraud Rate: Weekend vs Weekday', xlabel='Is Weekend')\n",
  "ax_weekend.set_xticklabels(['Weekday', 'Weekend'], rotation=0)\n",
  "\n",
  "# Transaction volume by hour\n",
  "hourly_volume = df.groupby('hour').size()\n",
  "hourly_volume.plot.bar(ax=ax_volume, alpha=0.7)\n",
  "ax_volume.set(title='Transaction Volume by Hour', xlabel='Hour', ylabel='Transaction Count')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()"
] }, { "cell_type": "markdown", "id":
"geographic-analysis", "metadata": {}, "source": [ "## 4. Geographic Analysis" ] },
{ "cell_type": "code", "execution_count": null, "id":
"location-analysis", "metadata": {}, "outputs": [], "source": [
  "# Extract latitude/longitude from the per-row location values.\n",
  "# Rows whose 'location' is not a dict, or lacks the key, fall back to 0.\n",
  "df['lat'] = df['location'].apply(lambda x: x.get('lat', 0) if isinstance(x, dict) else 0)\n",
  "df['lon'] = df['location'].apply(lambda x: x.get('lon', 0) if isinstance(x, dict) else 0)\n",
  "\n",
  "# Plot a sample for performance. Cap n at the number of rows available\n",
  "# (df.sample raises when n > len(df)) and fix the seed so the map is the\n",
  "# same on every run.\n",
  "MAP_SAMPLE_SIZE = 1000\n",
  "map_sample = df.sample(n=min(MAP_SAMPLE_SIZE, len(df)), random_state=42)\n",
  "\n",
  "# Plotly Express treats a numeric colour column as continuous and ignores\n",
  "# color_discrete_map, so colour by a string label instead of the 0/1 flag.\n",
  "map_sample = map_sample.assign(\n",
  "    fraud_label=np.where(map_sample['is_fraud'] == 1, 'Fraud', 'Normal')\n",
  ")\n",
  "\n",
  "# Create interactive map using plotly\n",
  "fig = px.scatter_mapbox(\n",
  "    map_sample,\n",
  "    lat=\"lat\",\n",
  "    lon=\"lon\",\n",
  "    color=\"fraud_label\",\n",
  "    color_discrete_map={'Normal': 'green', 'Fraud': 'red'},\n",
  "    hover_data=[\"amount\", \"merchant_category\"],\n",
  "    mapbox_style=\"open-street-map\",\n",
  "    title=\"Transaction Locations (Sample)\",\n",
  "    height=600\n",
  ")\n",
  "\n",
  "fig.update_layout(margin={\"r\":0,\"t\":50,\"l\":0,\"b\":0})\n",
  "fig.show()"
] }, { "cell_type": "markdown", "id": "correlation-analysis",
"metadata": {}, "source": [ "## 5. Correlation Analysis" ] }, {
"cell_type": "code", "execution_count": null, "id": "correlations",
"metadata": {}, "outputs": [], "source": [
  "# Feature engineering for correlation analysis\n",
  "from features.feature_engineer import FeatureEngineer\n",
  "\n",
  "engineer = FeatureEngineer()\n",
  "features_df = engineer.create_features(df)\n",
  "\n",
  "# Pairwise correlations between the numeric engineered features\n",
  "numeric_features = features_df.select_dtypes(include=[np.number]).columns\n",
  "correlation_matrix = features_df[numeric_features].corr()\n",
  "\n",
  "# Plot correlation heatmap. The upper triangle (diagonal included) is\n",
  "# masked because it only mirrors the lower one.\n",
  "heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 16))\n",
  "mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
  "sns.heatmap(correlation_matrix, mask=mask, annot=False, cmap='coolwarm', center=0, ax=heatmap_ax)\n",
  "heatmap_ax.set_title('Feature Correlation Matrix')\n",
  "heatmap_fig.tight_layout()\n",
  "plt.show()\n",
  "\n",
  "# Top correlations with fraud (absolute value, strongest first)\n",
  "if 'is_fraud' in correlation_matrix.columns:\n",
  "    fraud_correlations = (\n",
  "        correlation_matrix['is_fraud']\n",
  "        .drop('is_fraud')  # the target correlates 1.0 with itself; not a feature\n",
  "        .abs()\n",
  "        .sort_values(ascending=False)\n",
  "    )\n",
  "    print(\"Top 15 features correlated with fraud:\")\n",
  "    print(fraud_correlations.head(15))"
] }, { "cell_type": "markdown",
"id": "model-analysis", "metadata": {}, "source": [ "## 6. Model Performance Analysis" ] },
{ "cell_type": "code", "execution_count":
null, "id": "model-performance", "metadata": {}, "outputs": [],
"source": [
  "# Train a simple model for analysis\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.ensemble import RandomForestClassifier\n",
  "# roc_auc_score, average_precision_score, precision_score and recall_score\n",
  "# are all used below and must be imported here.\n",
  "from sklearn.metrics import (\n",
  "    average_precision_score,\n",
  "    classification_report,\n",
  "    precision_recall_curve,\n",
  "    precision_score,\n",
  "    recall_score,\n",
  "    roc_auc_score,\n",
  "    roc_curve,\n",
  ")\n",
  "\n",
  "# Prepare data: numeric engineered features, missing values filled with 0.\n",
  "# NOTE(review): y comes from df while X comes from features_df; this assumes\n",
  "# create_features() keeps df's rows and order. Confirm.\n",
  "X = features_df.select_dtypes(include=[np.number]).fillna(0)\n",
  "y = df['is_fraud']\n",
  "\n",
  "# Remove target from features if present\n",
  "if 'is_fraud' in X.columns:\n",
  "    X = X.drop('is_fraud', axis=1)\n",
  "\n",
  "# Stratify so train and test keep the same fraud rate\n",
  "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
  "\n",
  "# Train model\n",
  "model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
  "model.fit(X_train, y_train)\n",
  "\n",
  "# Predictions: hard labels plus the probability of the positive class\n",
  "y_pred = model.predict(X_test)\n",
  "y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
  "\n",
  "print(\"Classification Report:\")\n",
  "print(classification_report(y_test, y_pred))\n",
  "\n",
  "# Performance plots\n",
  "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
  "\n",
  "# ROC Curve (thresholds get a real name: `_` is IPython's last-output variable)\n",
  "fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)\n",
  "roc_auc = roc_auc_score(y_test, y_pred_proba)\n",
  "\n",
  "axes[0,0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')\n",
  "axes[0,0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line\n",
  "axes[0,0].set_xlabel('False Positive Rate')\n",
  "axes[0,0].set_ylabel('True Positive Rate')\n",
  "axes[0,0].set_title('ROC Curve')\n",
  "axes[0,0].legend()\n",
  "\n",
  "# Precision-Recall Curve (the reported area is average precision)\n",
  "precision, recall, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)\n",
  "pr_auc = average_precision_score(y_test, y_pred_proba)\n",
  "\n",
  "axes[0,1].plot(recall, precision, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.3f})')\n",
  "axes[0,1].set_xlabel('Recall')\n",
  "axes[0,1].set_ylabel('Precision')\n",
  "axes[0,1].set_title('Precision-Recall Curve')\n",
  "axes[0,1].legend()\n",
  "\n",
  "# Feature Importance\n",
  "feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)\n",
  "feature_importance.head(15).plot(kind='barh', ax=axes[1,0])\n",
  "axes[1,0].set_title('Top 15 Feature Importances')\n",
  "\n",
  "# Prediction Distribution\n",
  "# y_test keeps df's index labels; convert the masks to plain arrays so they\n",
  "# index the probability array by position.\n",
  "is_normal = (y_test == 0).to_numpy()\n",
  "is_fraud = (y_test == 1).to_numpy()\n",
  "axes[1,1].hist(y_pred_proba[is_normal], bins=50, alpha=0.7, label='Normal', color='green')\n",
  "axes[1,1].hist(y_pred_proba[is_fraud], bins=50, alpha=0.7, label='Fraud', color='red')\n",
  "axes[1,1].set_xlabel('Predicted Probability')\n",
  "axes[1,1].set_ylabel('Frequency')\n",
  "axes[1,1].set_title('Prediction Distribution')\n",
  "axes[1,1].legend()\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()\n",
  "\n",
  "print(\"\\nModel Performance Summary:\")\n",
  "print(f\"ROC AUC: {roc_auc:.4f}\")\n",
  "print(f\"PR AUC: {pr_auc:.4f}\")\n",
  "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n",
  "print(f\"Recall: {recall_score(y_test, y_pred):.4f}\")"
] }, {
"cell_type": "markdown", "id": "business-impact", "metadata": {},
"source": [ "## 7. Business Impact Analysis" ] }, { "cell_type":
"code", "execution_count": null, "id": "business-metrics", "metadata":
{}, "outputs": [], "source": [
  "# Business metrics: transaction amounts split by fraud label\n",
  "fraud_amounts = df.loc[df['is_fraud'] == 1, 'amount']\n",
  "normal_amounts = df.loc[df['is_fraud'] == 0, 'amount']\n",
  "\n",
  "total_fraud_amount = fraud_amounts.sum()\n",
  "avg_fraud_amount = fraud_amounts.mean()\n",
  "avg_normal_amount = normal_amounts.mean()\n",
  "\n",
  "print(\"Business Impact Metrics:\")\n",
  "print(f\"Total fraudulent amount: ${total_fraud_amount:,.2f}\")\n",
  "print(f\"Average fraud transaction: ${avg_fraud_amount:.2f}\")\n",
  "print(f\"Average normal transaction: ${avg_normal_amount:.2f}\")\n",
  "print(f\"Fraud amount vs normal ratio: {avg_fraud_amount/avg_normal_amount:.2f}x\")\n",
  "\n",
  "# Model impact on the held-out test set\n",
  "if len(y_test) > 0:\n",
  "    actual_fraud, actual_normal = (y_test == 1), (y_test == 0)\n",
  "    flagged, cleared = (y_pred == 1), (y_pred == 0)\n",
  "\n",
  "    detected_fraud = (actual_fraud & flagged).sum()\n",
  "    missed_fraud = (actual_fraud & cleared).sum()\n",
  "    false_alarms = (actual_normal & flagged).sum()\n",
  "\n",
  "    # Each fraud case is valued at the dataset-wide average fraud amount;\n",
  "    # detected cases count as savings, missed ones as losses.\n",
  "    estimated_savings = detected_fraud * avg_fraud_amount\n",
  "    estimated_losses = missed_fraud * avg_fraud_amount\n",
  "\n",
  "    print(f\"\\nModel Impact (on test set):\")\n",
  "    print(f\"Fraud transactions detected: {detected_fraud}\")\n",
  "    print(f\"Fraud transactions missed: {missed_fraud}\")\n",
  "    print(f\"False alarms: {false_alarms}\")\n",
  "    print(f\"Estimated fraud prevented: ${estimated_savings:,.2f}\")\n",
  "    print(f\"Estimated losses (missed): ${estimated_losses:,.2f}\")"
] }, {
"cell_type": "markdown", "id": "recommendations", "metadata": {},
"source": [ "## 8. Key Insights and Recommendations" ] }, {
"cell_type": "markdown", "id": "insights", "metadata": {}, "source": [
  "### Key Insights:\n",
  "\n",
  "1. **Fraud Pattern**: Fraudulent transactions show distinct patterns in:\n",
  "   - Higher average amounts\n",
  "   - Specific time periods (night/weekend)\n",
  "   - Certain merchant categories\n",
  "   - Geographic clustering\n",
  "\n",
  "2. **Model Performance**:\n",
  "   - Achieves good separation between fraud and normal transactions\n",
  "   - Feature importance shows amount-based and behavioral features are key\n",
  "   - Precision-recall trade-off needs business consideration\n",
  "\n",
  "3. **Business Impact**:\n",
  "   - Significant financial exposure from fraud\n",
  "   - Model can prevent substantial losses\n",
  "   - False positives impact customer experience\n",
  "\n",
  "### Recommendations:\n",
  "\n",
  "1. **Feature Enhancement**:\n",
  "   - Add more behavioral features (velocity, patterns)\n",
  "   - Include network analysis (merchant/user connections)\n",
  "   - Enhance location-based features\n",
  "\n",
  "2. **Model Improvements**:\n",
  "   - Ensemble methods for better performance\n",
  "   - Regular retraining with new data\n",
  "   - Threshold optimization for business objectives\n",
  "\n",
  "3. **Operational**:\n",
  "   - Real-time monitoring dashboard\n",
  "   - Alert system for high-risk transactions\n",
  "   - Feedback loop for model improvement\n",
  "\n",
  "4. **Business Rules**:\n",
  "   - Risk-based transaction limits\n",
  "   - Enhanced verification for high-risk transactions\n",
  "   - Customer communication for false positives"
] }, { "cell_type": "code",
"execution_count": null, "id": "save-results", "metadata": {},
"outputs": [], "source": [
  "# Save analysis results\n",
  "import json\n",
  "import os\n",
  "\n",
  "# Create reports directory\n",
  "REPORTS_DIR = '../reports'\n",
  "os.makedirs(REPORTS_DIR, exist_ok=True)\n",
  "\n",
  "\n",
  "def _as_float(value):\n",
  "    \"\"\"Return value as a built-in float, or None when it is missing.\"\"\"\n",
  "    return None if value is None else float(value)\n",
  "\n",
  "\n",
  "# Key metrics. Values are cast to built-in int/float so json.dump writes\n",
  "# them as numbers: NumPy integers are not JSON-serialisable and would\n",
  "# otherwise be written as strings by the default=str fallback.\n",
  "analysis_summary = {\n",
  "    'dataset_size': int(len(df)),\n",
  "    'fraud_rate': _as_float(df['is_fraud'].mean()),\n",
  "    'total_fraud_amount': _as_float(total_fraud_amount),\n",
  "    'avg_fraud_amount': _as_float(avg_fraud_amount),\n",
  "    # The model cell may not have been run; these are None in that case.\n",
  "    'model_roc_auc': _as_float(globals().get('roc_auc')),\n",
  "    'model_pr_auc': _as_float(globals().get('pr_auc')),\n",
  "}\n",
  "\n",
  "# Save to JSON\n",
  "with open(os.path.join(REPORTS_DIR, 'eda_summary.json'), 'w', encoding='utf-8') as f:\n",
  "    json.dump(analysis_summary, f, indent=2, default=str)\n",
  "\n",
  "# Save feature importance (only exists if the model cell was run)\n",
  "if 'feature_importance' in globals():\n",
  "    feature_importance.to_csv(os.path.join(REPORTS_DIR, 'feature_importance.csv'))\n",
  "\n",
  "print(\"✅ Analysis results saved to reports/ directory\")"
] } ], "metadata": {
"kernelspec": { "display_name": "Python 3 (ipykernel)", "language":
"python", "name": "python3" }, "language_info": { "codemirror_mode": {
"name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype":
"text/x-python", "name": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4,
"nbformat_minor": 5 }