# Credit Card Fraud Detection - Data Cleaning & EDA
## Professional Data Pipeline for Fraud Detection Analysis

**Author:** Data Science Team  
**Date:** February 2026  
**Dataset:** Credit Card Transactions (10,000 records)

## 1. Setup and Imports

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ks_2samp, mannwhitneyu
import warnings
import os
from io import StringIO

# Silence every warning category for the rest of the session so the report
# output stays compact (this also hides deprecation and runtime warnings).
warnings.filterwarnings('ignore')

# Plot appearance
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# DataFrame printing: never truncate columns, allow wide lines
for option_name, option_value in (('display.max_columns', None),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

print("✓ Libraries imported successfully")
print("✓ Configuration set")

## 2. Load Data

In [None]:
# Input and output locations, relative to the notebook's working directory
input_filepath = 'Raw_Data/credit_card_fraud_10k.csv'
output_filepath = 'Cleaned_Data/credit_card_fraud_10k_cleaned.csv'

# Make sure the destination directory exists before the final save step.
# The isdir() check only decides whether to announce the creation;
# exist_ok=True keeps the call safe if the directory appears in between, and
# a plain file occupying the path now fails here instead of at save time.
output_dir = os.path.dirname(output_filepath)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print(f"✓ Created output directory: {output_dir}")

# Load the raw transactions into the DataFrame used by every later cell
print(f"Loading data from: {input_filepath}")
df = pd.read_csv(input_filepath)
print(f"✓ Loaded {len(df)} records from file")
print("="*80)
print("FRAUD DETECTION DATA PIPELINE - INITIALIZED")
print("="*80)

## 3. Data Quality Assessment
### Step 1: Schema Validation

In [None]:
print("\n[STEP 1] SCHEMA VALIDATION")
print("-" * 80)

# Column name -> dtype expected after pd.read_csv
expected_schema = {
    'transaction_id': 'int64',
    'amount': 'float64',
    'transaction_hour': 'int64',
    'merchant_category': 'object',
    'foreign_transaction': 'int64',
    'location_mismatch': 'int64',
    'device_trust_score': 'int64',
    'velocity_last_24h': 'int64',
    'cardholder_age': 'int64',
    'is_fraud': 'int64'
}

# schema_valid tracks hard failures (missing columns); dtype mismatches are
# counted separately so the final verdict never reports a clean pass after
# warnings were printed.
schema_valid = True
dtype_mismatches = 0
for col, dtype in expected_schema.items():
    if col not in df.columns:
        print(f"  ✗ Missing column: {col}")
        schema_valid = False
    elif df[col].dtype != dtype:
        print(f"  ⚠ Column {col}: Expected {dtype}, got {df[col].dtype}")
        dtype_mismatches += 1

if not schema_valid:
    print("  ✗ Schema validation FAILED")
elif dtype_mismatches:
    print(f"  ⚠ Schema validation PASSED with {dtype_mismatches} dtype warning(s)")
else:
    print("  ✓ Schema validation PASSED")

### Step 2: Missing Value Analysis

In [None]:
print("\n[STEP 2] MISSING VALUE ANALYSIS")
print("-" * 80)

# Count nulls per column once and derive every summary field from that
null_counts = df.isnull().sum()
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percentage': (null_counts / len(df) * 100).values,
    'Data_Type': df.dtypes.values,
})

print(missing_summary.to_string(index=False))

# Grand total across all columns
total_missing = null_counts.sum()
if total_missing:
    print(f"\n  ⚠ Total missing values: {total_missing}")
else:
    print("\n  ✓ No missing values detected")

### Step 3: Duplicate Detection

In [None]:
print("\n[STEP 3] DUPLICATE DETECTION")
print("-" * 80)

# Rows that repeat an earlier row in every column
duplicate_rows = df.duplicated()
dup_count = duplicate_rows.sum()

# Rows whose transaction_id already appeared earlier in the file
dup_ids = df['transaction_id'].duplicated()
dup_id_count = dup_ids.sum()

print(f"  Duplicate Rows: {dup_count} ({dup_count/len(df)*100:.2f}%)")
print(f"  Duplicate Transaction IDs: {dup_id_count}")

if dup_count > 0:
    print("\n  Duplicate Records:")
    print(df[duplicate_rows])

# An ID reused by a row that is NOT an exact copy is a separate problem from
# a repeated row, so list those records on their own.
conflicting_ids = dup_ids & ~duplicate_rows
if conflicting_ids.any():
    print("\n  ⚠ Transaction IDs reused by non-identical records:")
    print(df[conflicting_ids])

# Only report a clean result when both checks found nothing
if dup_count == 0 and dup_id_count == 0:
    print("\n  ✓ No duplicates detected")

### Step 4: Domain Constraint Validation

In [None]:
print("\n[STEP 4] DOMAIN CONSTRAINT VALIDATION")
print("-" * 80)

# Check name -> number of violating rows; read again by the quality scorecard
validation_results = {}


def _check(key, label, mask):
    """Record and print how many rows *mask* selects; return those rows."""
    bad_rows = df[mask]
    validation_results[key] = len(bad_rows)
    print(f"  {label}: {len(bad_rows)} invalid values")
    return bad_rows


# Transaction hour must fall on a 24-hour clock
hours = df['transaction_hour']
invalid_hours = _check('transaction_hour', 'transaction_hour (0-23)',
                       hours.lt(0) | hours.gt(23))

# Device trust score is bounded to 0-100
trust = df['device_trust_score']
invalid_scores = _check('device_trust_score', 'device_trust_score (0-100)',
                        trust.lt(0) | trust.gt(100))

# Amounts must be strictly positive
invalid_amounts = _check('amount', 'amount (>0)', df['amount'].le(0))

# Velocity cannot be negative
invalid_velocity = _check('velocity', 'velocity_last_24h (≥0)',
                          df['velocity_last_24h'].lt(0))

# Cardholder age limited to 18-100
ages = df['cardholder_age']
invalid_age = _check('cardholder_age', 'cardholder_age (18-100)',
                     ages.lt(18) | ages.gt(100))

# Flag columns may only hold 0 or 1
binary_fields = ['foreign_transaction', 'location_mismatch', 'is_fraud']
for field in binary_fields:
    invalid_binary = _check(field, f"{field} (0/1)", ~df[field].isin([0, 1]))

total_invalid = sum(validation_results.values())
if total_invalid:
    print(f"\n  ⚠ Total domain violations: {total_invalid}")
else:
    print("\n  ✓ All domain constraints satisfied")

### Step 5: Outlier Detection

In [None]:
print("\n[STEP 5] OUTLIER DETECTION")
print("-" * 80)

numerical_features = ['amount', 'transaction_hour', 'device_trust_score', 
                     'velocity_last_24h', 'cardholder_age']

outlier_summary = []

for feature in numerical_features:
    values = df[feature]

    # IQR method: anything beyond 1.5 * IQR from the quartiles
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    iqr_outliers = df[(values < lower_bound) | (values > upper_bound)]

    # Z-score method (threshold = 3). nan_policy='omit' computes the mean and
    # std from the non-missing values only, so a single NaN no longer turns
    # every score into NaN and silently reports zero outliers. This matches
    # quantile(), which already skips NaN. Missing entries keep a NaN score
    # and are therefore never counted as outliers.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    z_outliers = df[z_scores > 3]

    outlier_summary.append({
        'Feature': feature,
        'IQR_Outliers': len(iqr_outliers),
        'Z_Score_Outliers': len(z_outliers),
        'Lower_Bound': lower_bound,
        'Upper_Bound': upper_bound,
        'Min': values.min(),
        'Max': values.max()
    })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df.to_string(index=False))

### Step 6: Statistical Profiling

In [None]:
print("\n[STEP 6] STATISTICAL PROFILING")
print("-" * 80)

numerical_features = ['amount', 'transaction_hour', 'device_trust_score', 
                     'velocity_last_24h', 'cardholder_age']

numeric_frame = df[numerical_features]

# describe() with extra tail percentiles, transposed to one row per feature
tail_percentiles = [.01, .05, .25, .5, .75, .95, .99]
stats_profile = numeric_frame.describe(percentiles=tail_percentiles).transpose()

# Shape statistics and coefficient of variation (std relative to mean)
stats_profile = stats_profile.assign(
    skewness=numeric_frame.skew(),
    kurtosis=numeric_frame.kurt(),
)
stats_profile['cv'] = stats_profile['std'].div(stats_profile['mean'])

print("\nNumerical Features Profile:")
print(stats_profile.round(3))

### Step 7: Target Variable Analysis

In [None]:
print("\n[STEP 7] TARGET VARIABLE ANALYSIS")
print("-" * 80)

# Absolute and relative class frequencies, ordered by class label
fraud_dist = df['is_fraud'].value_counts().sort_index()
fraud_pct = df['is_fraud'].value_counts(normalize=True).sort_index() * 100

print("\nTarget Variable Distribution:")
print(f"  Normal (0): {fraud_dist.get(0, 0)} ({fraud_pct.get(0, 0):.2f}%)")
print(f"  Fraud (1):  {fraud_dist.get(1, 0)} ({fraud_pct.get(1, 0):.2f}%)")

has_normal = 0 in fraud_dist.index
has_fraud = 1 in fraud_dist.index

if has_normal and has_fraud:
    # Ratio of normal to fraud records
    imbalance_ratio = fraud_dist[0] / fraud_dist[1]
    print(f"\n  Imbalance Ratio: {imbalance_ratio:.2f}:1")
    
    if imbalance_ratio > 10:
        print("  ⚠ HIGH IMBALANCE - Consider resampling techniques")
    elif imbalance_ratio > 5:
        print("  ⚠ MODERATE IMBALANCE - Monitor model performance")
    else:
        print("  ✓ ACCEPTABLE BALANCE")
elif not has_fraud:
    print("\n  ⚠ CRITICAL: No fraud cases in dataset")
else:
    # Fraud rows exist but there is no normal class to compare against;
    # previously this case produced no message at all.
    print("\n  ⚠ CRITICAL: No normal cases in dataset")

### Step 8: Categorical Feature Analysis

In [None]:
print("\n[STEP 8] CATEGORICAL FEATURE ANALYSIS")
print("-" * 80)

# Frequency and share of each merchant category
merchant_col = df['merchant_category']
merchant_dist = merchant_col.value_counts()
merchant_pct = merchant_col.value_counts(normalize=True).mul(100)

print("\nMerchant Category Distribution:")
for cat, count in merchant_dist.items():
    share = merchant_pct.loc[cat]
    print(f"  {cat:15s}: {count:4d} ({share:5.2f}%)")

n_categories = merchant_col.nunique()
print(f"\n  Total unique categories: {n_categories}")

### Step 9: Correlation Analysis

In [None]:
print("\n[STEP 9] CORRELATION ANALYSIS")
print("-" * 80)

numerical_cols = ['amount', 'transaction_hour', 'device_trust_score', 
                 'velocity_last_24h', 'cardholder_age', 
                 'foreign_transaction', 'location_mismatch', 'is_fraud']

# Pairwise correlation matrix (pandas default method) of all numeric columns
corr_matrix = df[numerical_cols].corr()

print("\nCorrelation with Target Variable (is_fraud):")
target_corr = corr_matrix['is_fraud'].sort_values(ascending=False)
# The target's correlation with itself is not informative, so leave it out
for feature, corr_val in target_corr.drop(labels='is_fraud').items():
    print(f"  {feature:25s}: {corr_val:7.4f}")

# Strongly correlated column pairs; only the upper triangle is visited so
# each unordered pair is reported once.
print("\nHigh Feature Correlations (|r| > 0.7):")
matrix_cols = list(corr_matrix.columns)
high_corr_pairs = [
    (left, right, corr_matrix.iloc[i, j])
    for i, left in enumerate(matrix_cols)
    for j, right in enumerate(matrix_cols[i + 1:], start=i + 1)
    if abs(corr_matrix.iloc[i, j]) > 0.7
]

if not high_corr_pairs:
    print("  ✓ No multicollinearity detected")
else:
    for feat1, feat2, corr_val in high_corr_pairs:
        print(f"  {feat1} <-> {feat2}: {corr_val:.4f}")

## 4. Data Quality Report

In [None]:
print("\n" + "="*80)
print("DATA QUALITY SCORECARD")
print("="*80)

total_records = len(df)
total_features = len(df.columns) - 1  # Excluding target

# Raw defect counts feeding the three quality dimensions
missing_values = df.isnull().sum().sum()
duplicates = df.duplicated().sum()
domain_violations = sum(validation_results.values())

# Denominators. Missing values are counted over every column (target
# included), so completeness must be measured against every cell as well.
total_cells = total_records * len(df.columns)
# Each domain check inspects every record once.
checked_values = total_records * len(validation_results)

# Each score is the share of clean items. The "== 0" shortcuts also avoid a
# division by zero on an empty frame, since a non-zero defect count implies
# a non-zero denominator.
quality_scores = {
    'Completeness': 100 if missing_values == 0 else
                  (1 - missing_values / total_cells) * 100,
    'Uniqueness': 100 if duplicates == 0 else
                 (1 - duplicates / total_records) * 100,
    # Proportional to the violations found, replacing a fixed 95 that was
    # reported (and marked as passing) for any number of violations.
    'Validity': 100 if domain_violations == 0 else
               (1 - domain_violations / checked_values) * 100,
}

overall_quality = np.mean(list(quality_scores.values()))

print(f"\nDataset Size: {total_records} records × {total_features + 1} features")
print(f"\nQuality Dimensions:")
for dimension, score in quality_scores.items():
    status = "✓" if score >= 95 else "⚠"
    print(f"  {status} {dimension:15s}: {score:6.2f}%")

print(f"\n{'='*40}")
print(f"  Overall Quality Score: {overall_quality:.2f}%")
print(f"{'='*40}")

# Map the overall score onto a letter-style grade
if overall_quality >= 95:
    grade = "EXCELLENT"
elif overall_quality >= 85:
    grade = "GOOD"
elif overall_quality >= 70:
    grade = "FAIR"
else:
    grade = "POOR"

print(f"\n  Data Quality Grade: {grade}")

## 5. Exploratory Data Analysis
### Univariate Analysis

In [None]:
print("\n" + "="*80)
print("UNIVARIATE ANALYSIS")
print("="*80)

numerical_features = ['amount', 'transaction_hour', 'device_trust_score', 
                     'velocity_last_24h', 'cardholder_age']

for feature in numerical_features:
    series = df[feature]

    print(f"\n{feature.upper()}")
    print("-" * 40)

    # Location and spread
    q3, q1 = series.quantile(0.75), series.quantile(0.25)
    print(f"  Mean:       {series.mean():.2f}")
    print(f"  Median:     {series.median():.2f}")
    print(f"  Std Dev:    {series.std():.2f}")
    print(f"  Range:      [{series.min():.2f}, {series.max():.2f}]")
    print(f"  IQR:        {q3 - q1:.2f}")

    # Shape: the sign of the skewness picks the tail direction
    skew = series.skew()
    if skew > 0:
        skew_label = '(Right-skewed)'
    elif skew < 0:
        skew_label = '(Left-skewed)'
    else:
        skew_label = '(Symmetric)'

    # Positive kurtosis is labelled heavy-tailed, anything else light-tailed
    kurt = series.kurtosis()
    if kurt > 0:
        kurt_label = '(Heavy-tailed)'
    else:
        kurt_label = '(Light-tailed)'

    print(f"  Skewness:   {skew:.3f} {skew_label}")
    print(f"  Kurtosis:   {kurt:.3f} {kurt_label}")

### Bivariate Analysis

In [None]:
print("\n" + "="*80)
print("BIVARIATE ANALYSIS (Features vs Fraud)")
print("="*80)

if df['is_fraud'].nunique() < 2:
    print("\n  ⚠ Cannot perform bivariate analysis - only one class present")
else:
    numerical_features = ['amount', 'transaction_hour', 'device_trust_score', 
                         'velocity_last_24h', 'cardholder_age']

    # Row selectors for the two classes, built once for all features
    normal_rows = df['is_fraud'] == 0
    fraud_rows = df['is_fraud'] == 1

    for feature in numerical_features:
        print(f"\n{feature.upper()} by Fraud Status:")
        print("-" * 40)

        fraud_stats = df.groupby('is_fraud')[feature].describe()
        print(fraud_stats)

        # Statistical test (Mann-Whitney U). Missing values are dropped first
        # so they cannot turn the p-value into NaN; describe() above already
        # ignores them.
        normal_vals = df.loc[normal_rows, feature].dropna()
        fraud_vals = df.loc[fraud_rows, feature].dropna()

        # The test needs observations on both sides. nunique() >= 2 does not
        # guarantee that the two classes are exactly 0 and 1.
        if len(normal_vals) > 0 and len(fraud_vals) > 0:
            stat, p_value = mannwhitneyu(normal_vals, fraud_vals)
            print(f"\n  Mann-Whitney U test p-value: {p_value:.4f}")
            if p_value < 0.05:
                print("  ✓ Significant difference between groups")
            else:
                print("  ✗ No significant difference")
        else:
            print("\n  ⚠ Mann-Whitney U test skipped - a group has no values")

### Multivariate Pattern Analysis

In [None]:
print("\n" + "="*80)
print("MULTIVARIATE PATTERN ANALYSIS")
print("="*80)

# Amount statistics per merchant category (not split by fraud status)
print("\nAverage Amount by Merchant Category:")
amount_by_merchant = df.groupby('merchant_category')['amount']
merchant_amount = amount_by_merchant.agg(['mean', 'std', 'count'])
print(merchant_amount.round(2))

# For every foreign/mismatch flag combination: group size, fraud count and
# fraud rate, plus the average amount and device trust score
print("\nRisk Factor Combinations:")
risk_spec = {
    'is_fraud': ['count', 'sum', 'mean'],
    'amount': 'mean',
    'device_trust_score': 'mean',
}
risk_groups = df.groupby(['foreign_transaction', 'location_mismatch'])
risk_analysis = risk_groups.agg(risk_spec).round(2)
print(risk_analysis)

### Feature Importance Analysis

In [None]:
print("\n" + "="*80)
print("FEATURE IMPORTANCE INDICATORS")
print("="*80)

numerical_features = ['amount', 'transaction_hour', 'device_trust_score', 
                     'velocity_last_24h', 'cardholder_age']

# Spread of each feature: variance plus coefficient of variation
print("\nFeature Variance (Higher = More Informative):")
for feature in numerical_features:
    series = df[feature]
    var = series.var()
    mean_value = series.mean()
    # CV is reported as 0 when the mean is exactly zero (avoids dividing by 0)
    if mean_value != 0:
        cv = series.std() / mean_value
    else:
        cv = 0
    print(f"  {feature:25s}: Variance={var:10.2f}, CV={cv:.4f}")

# Correlation of each feature with the target; only meaningful when at least
# one fraud row exists
has_fraud_cases = df['is_fraud'].sum() > 0
if has_fraud_cases:
    print("\nPreliminary Feature-Target Association:")
    target = df['is_fraud']
    for feature in numerical_features:
        corr = df[feature].corr(target)
        print(f"  {feature:25s}: r={corr:7.4f}")

## 6. Executive Summary & Save Cleaned Data

In [None]:
# Closing banner: fixed status lines plus the score from the quality scorecard
banner = "=" * 80
summary_lines = (
    "\n" + banner,
    "EXECUTIVE SUMMARY",
    banner,
    "\n✓ Data Cleaning Completed",
    "✓ Exploratory Data Analysis Completed",
    f"✓ Data Quality Score: {overall_quality:.2f}%",
    "✓ Ready for Feature Engineering & Modeling",
    "\n" + banner,
)
for line in summary_lines:
    print(line)

# Write the DataFrame to the output path without the index column.
# NOTE(review): none of the cells visible in this notebook modify df after it
# is loaded, so the file written here appears to have the same rows as the
# input - confirm whether a cleaning step is expected before this save.
df.to_csv(output_filepath, index=False)
print(f"\n✓ Cleaned data saved to: {output_filepath}")
print(f"✓ Total records saved: {len(df)}")

## 7. Quick Data Overview

In [None]:
# Preview the first rows. head() is the cell's last expression, so the
# notebook renders the returned DataFrame (five rows by default).
print("\nFirst 5 rows of cleaned data:")
df.head()

In [None]:
# Print the column names, non-null counts, dtypes and memory usage;
# info() writes its report directly to the output.
print("\nDataset Information:")
df.info()

In [None]:
# Summary statistics of the numeric columns. describe() is the cell's last
# expression, so the notebook renders the returned table.
print("\nStatistical Summary:")
df.describe()