# Exploratory Data Analysis for Fraud Detection

This notebook performs exploratory data analysis on the fraud detection datasets:
1. Fraud_Data.csv - E-commerce transaction data
2. IpAddress_to_Country.csv - IP to country mapping
3. creditcard.csv - Bank transaction data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the src directory to the path so the project-local `eda` module can be
# imported. The relative path is resolved against the current working
# directory (presumably the notebook's own folder -- confirm when running
# from elsewhere).
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(src_dir)

# Import custom EDA module
from eda import (
    load_datasets, display_basic_info, analyze_class_distribution,
    plot_numerical_distributions, analyze_categorical_features,
    analyze_time_patterns, analyze_correlation_matrix, analyze_ip_country_data
)

# Set plot style.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only know the unprefixed name, so pick whichever one is installed.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette('viridis')

# Display all columns (None removes pandas' limit on how many columns are shown)
pd.set_option('display.max_columns', None)

## 1. Load the Datasets

In [None]:
# Load the datasets using the load_datasets() helper imported from eda.py.
# The result is unpacked positionally into three names, which the later
# sections treat as Fraud_Data.csv, IpAddress_to_Country.csv and creditcard.csv
# (NOTE(review): return order assumed from the variable names -- confirm in eda.py).
fraud_data, ip_country, creditcard = load_datasets()

## 2. Explore Fraud_Data.csv

In [None]:
# Display basic information about Fraud_Data.csv.
# The second argument is a dataset label handed to the helper
# (presumably used in its printed output -- confirm in eda.py).
display_basic_info(fraud_data, "Fraud_Data.csv")

In [None]:
# Analyze the class distribution of Fraud_Data.csv.
# The column name passed here is lowercase 'class'; the creditcard section
# further down passes capitalised 'Class' instead.
analyze_class_distribution(fraud_data, class_col='class')

In [None]:
# Plot numerical distributions for the listed Fraud_Data.csv columns.
# 'class' is passed as class_col (presumably to compare the distributions
# per class -- confirm in eda.py).
numerical_cols = ['purchase_value', 'age']
plot_numerical_distributions(fraud_data, numerical_cols, class_col='class')

In [None]:
# Analyze the listed categorical features of Fraud_Data.csv.
# 'class' is passed as class_col (presumably to relate each category to the
# class label -- confirm in eda.py).
categorical_cols = ['source', 'browser', 'sex']
analyze_categorical_features(fraud_data, categorical_cols, class_col='class')

In [None]:
# Analyze time patterns.
# fraud_data is rebound to the helper's return value, so every later cell sees
# whatever analyze_time_patterns() returns (presumably the same frame with
# time-derived columns added -- confirm in eda.py). Re-running this cell feeds
# the already-processed frame back into the helper.
fraud_data = analyze_time_patterns(fraud_data)

## 3. Explore IpAddress_to_Country.csv

In [None]:
# Display basic information about IpAddress_to_Country.csv.
# The second argument is a dataset label handed to the helper
# (presumably used in its printed output -- confirm in eda.py).
display_basic_info(ip_country, "IpAddress_to_Country.csv")

In [None]:
# Analyze IP to country mapping data.
# The helper's return value is kept in country_counts for later use
# (presumably per-country counts, judging by the name -- confirm in eda.py).
country_counts = analyze_ip_country_data(ip_country)

## 4. Explore creditcard.csv

In [None]:
# Display basic information about creditcard.csv.
# The second argument is a dataset label handed to the helper
# (presumably used in its printed output -- confirm in eda.py).
display_basic_info(creditcard, "creditcard.csv")

In [None]:
# Analyze the class distribution of creditcard.csv.
# The column name passed here is capitalised 'Class', unlike the lowercase
# 'class' passed for Fraud_Data.csv above.
analyze_class_distribution(creditcard, class_col='Class')

In [None]:
# Plot numerical distributions for the listed creditcard.csv columns.
# 'Class' is passed as class_col (presumably to compare the distributions
# per class -- confirm in eda.py).
cc_numerical_cols = ['Amount', 'Time']
plot_numerical_distributions(creditcard, cc_numerical_cols, class_col='Class')

In [None]:
# Analyze the correlation matrix of creditcard.csv with 'Class' as the target
# column. The return value is kept in target_corr; the next cell shows its
# first and last ten entries.
# NOTE(review): the result is presumably each feature's correlation with
# 'Class', sorted in descending order -- confirm in eda.py.
target_corr = analyze_correlation_matrix(creditcard, target_col='Class')

In [None]:
# Show both ends of target_corr: the first ten entries, then the last ten.
# NOTE(review): the headings assume target_corr is sorted from most positive
# to most negative correlation -- confirm against analyze_correlation_matrix.
for heading, subset in (
    ("Top 10 features positively correlated with fraud:", target_corr.head(10)),
    ("\nTop 10 features negatively correlated with fraud:", target_corr.tail(10)),
):
    print(heading)
    display(subset)

## 5. Summary of Findings

### Fraud_Data.csv
- [Add your findings here after running the analysis]

### IpAddress_to_Country.csv
- [Add your findings here after running the analysis]

### creditcard.csv
- [Add your findings here after running the analysis]