# Feature Engineering and Data Transformation

This notebook performs feature engineering and data transformation for both e-commerce and credit card fraud datasets.

## Objectives
1. Feature Engineering for Fraud_Data.csv:
   - Transaction frequency and velocity features
   - Time-based features (hour_of_day, day_of_week, time_since_signup)
2. Data Transformation:
   - Normalize/scale numerical features
   - Encode categorical features
3. Handle Class Imbalance:
   - Apply SMOTE or undersampling to training data only
   - Document class distribution before and after resampling


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import ipaddress
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Suppress every library warning so the notebook output stays compact.
warnings.filterwarnings('ignore')

# Resolve the project root no matter which directory the kernel started in:
#   1. started inside notebooks/      -> the root is the parent directory
#   2. started where notebooks/ lives -> already at the root
#   3. anywhere else                  -> nearest ancestor that contains data/
_start_dir = Path().resolve()
project_root = _start_dir
if _start_dir.name == 'notebooks':
    project_root = _start_dir.parent
elif not (_start_dir / 'notebooks').exists():
    for _candidate in (_start_dir, *_start_dir.parents):
        if _candidate == _candidate.parent:
            # The filesystem root itself is never inspected.
            break
        if (_candidate / 'data').exists():
            project_root = _candidate
            break

_data_root = project_root / 'data'
DATA_DIR = _data_root / 'raw'
PROCESSED_DIR = _data_root / 'processed'
OUTPUT_DIR = project_root / 'outputs' / 'eda' / 'feature-engineering'

# Make sure both write targets exist before any cell tries to save into them.
for _write_dir in (OUTPUT_DIR, PROCESSED_DIR):
    _write_dir.mkdir(parents=True, exist_ok=True)

# Report paths relative to the root so the output is identical on every machine.
print("Project root: .")
for _label, _path in (
    ("Data directory", DATA_DIR),
    ("Processed directory", PROCESSED_DIR),
    ("Output directory", OUTPUT_DIR),
):
    print(f"{_label}: {_path.relative_to(project_root)}")
print(f"Data directory exists: {DATA_DIR.exists()}")

# Global plotting look-and-feel.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Plot-saving helper; reports a root-relative path so logs are portable.
def save_plot(fig, filename, dpi=300, bbox_inches='tight'):
    """Write ``fig`` to ``OUTPUT_DIR / filename`` and print where it went.

    Args:
        fig: Figure object exposing ``savefig``.
        filename: File name (including extension) inside ``OUTPUT_DIR``.
        dpi: Resolution forwarded to ``savefig``.
        bbox_inches: Bounding-box setting forwarded to ``savefig``.
    """
    destination = OUTPUT_DIR / filename
    fig.savefig(destination, dpi=dpi, bbox_inches=bbox_inches)
    print(f"Plot saved to: {destination.relative_to(project_root)}")

print("Libraries imported successfully!")


## Part 1: E-commerce Fraud Data Feature Engineering


In [None]:
# Read the raw e-commerce transactions straight from disk; nothing from the
# EDA notebook is reused here.
fraud_df = pd.read_csv(DATA_DIR / 'Fraud_Data.csv')

# Both timestamp columns are parsed into datetime values.
for _timestamp_col in ('signup_time', 'purchase_time'):
    fraud_df[_timestamp_col] = pd.to_datetime(fraud_df[_timestamp_col])

# IP-range -> country lookup table used by the geolocation section.
ip_country_df = pd.read_csv(DATA_DIR / 'IpAddress_to_Country.csv')

print(f"Fraud Data Shape: {fraud_df.shape}")
print(f"Columns: {fraud_df.columns.tolist()}")
fraud_df.head()


### 1.1 Time-based Features


In [None]:
_purchase_ts = fraud_df['purchase_time']

# Calendar components of the purchase moment (day_of_week: 0=Monday ... 6=Sunday).
fraud_df['hour_of_day'] = _purchase_ts.dt.hour
fraud_df['day_of_week'] = _purchase_ts.dt.dayofweek

# Hours elapsed between signup and purchase. A purchase recorded before the
# signup is treated as a data error and floored at zero.
_hours_since_signup = (_purchase_ts - fraud_df['signup_time']).dt.total_seconds() / 3600
fraud_df['time_since_signup'] = _hours_since_signup.clip(lower=0)

_derived_cols = ['hour_of_day', 'day_of_week', 'time_since_signup']

print("Time-based features created:")
print(fraud_df[['purchase_time', 'signup_time'] + _derived_cols].head(10))

# Descriptive statistics of the three new columns.
print("\nTime-based Features Summary:")
print(fraud_df[_derived_cols].describe())


In [None]:
# One row, three panels: hour of day, weekday, and hours since signup.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
hour_ax, weekday_ax, signup_ax = axes

# Panel 1 - transaction count per hour (ticks every second hour).
per_hour = fraud_df['hour_of_day'].value_counts().sort_index()
hour_ax.bar(per_hour.index, per_hour.values, color='steelblue')
hour_ax.set_title('Transactions by Hour of Day')
hour_ax.set_xlabel('Hour of Day')
hour_ax.set_ylabel('Count')
hour_ax.set_xticks(range(0, 24, 2))

# Panel 2 - transaction count per weekday, labelled Mon..Sun.
per_weekday = fraud_df['day_of_week'].value_counts().sort_index()
weekday_ax.bar(per_weekday.index, per_weekday.values, color='coral')
weekday_ax.set_title('Transactions by Day of Week')
weekday_ax.set_xlabel('Day of Week')
weekday_ax.set_ylabel('Count')
weekday_ax.set_xticks(range(7))
weekday_ax.set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

# Panel 3 - histogram of hours between signup and purchase.
signup_ax.hist(
    fraud_df['time_since_signup'],
    bins=50,
    edgecolor='black',
    alpha=0.7,
    color='green',
)
signup_ax.set_title('Time Since Signup Distribution')
signup_ax.set_xlabel('Hours Since Signup')
signup_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


### 1.2 Transaction Frequency and Velocity Features


In [None]:
# Order rows by user and then chronologically; the per-user window counting
# that follows walks each user's purchases in this order.
fraud_df = fraud_df.sort_values(['user_id', 'purchase_time']).reset_index(drop=True)

# Lifetime number of transactions per user, attached to every row of that user.
fraud_df = fraud_df.merge(
    fraud_df.groupby('user_id').size().reset_index(name='total_transactions'),
    on='user_id',
    how='left',
)

print("Calculating transaction frequency features (optimized version)...")

# Purchase time as seconds since the Unix epoch, so window checks become
# plain numeric comparisons.
_unix_epoch = pd.Timestamp('1970-01-01')
fraud_df['purchase_time_sec'] = (fraud_df['purchase_time'] - _unix_epoch).dt.total_seconds()

# Placeholder columns; the per-user computation fills in the real counts.
for _window_col in ('transactions_last_24h', 'transactions_last_7d', 'transactions_last_30d'):
    fraud_df[_window_col] = 0

# Per-user trailing-window transaction counts (24 hours, 7 days, 30 days).
def calculate_window_counts(group):
    """Count each user's transactions inside trailing time windows.

    For the row at position ``i`` the count covers rows ``0..i`` of the group
    whose ``purchase_time_sec`` lies within the window ending at row ``i``
    (window boundary inclusive). The current transaction is always counted,
    so every count is at least 1.

    Args:
        group: DataFrame holding one user's rows with a numeric
            ``purchase_time_sec`` column sorted in ascending order. The
            binary search below relies on that ordering.

    Returns:
        A copy of ``group`` with ``transactions_last_24h``,
        ``transactions_last_7d`` and ``transactions_last_30d`` added. The
        input frame is left untouched.
    """
    result = group.copy()
    times = result['purchase_time_sec'].to_numpy(dtype='float64')

    # 1-based position of every row == number of rows up to and including it.
    rows_so_far = np.arange(1, len(times) + 1)

    windows_in_seconds = (
        ('transactions_last_24h', 24 * 3600),
        ('transactions_last_7d', 7 * 24 * 3600),
        ('transactions_last_30d', 30 * 24 * 3600),
    )
    for column, window in windows_in_seconds:
        # Index of the earliest row that is still inside the window. Because
        # the times are sorted, everything from there up to the current row
        # is in the window, which replaces the per-row Python loop.
        first_inside = np.searchsorted(times, times - window, side='left')
        result[column] = rows_so_far - first_inside
    return result

# Run the window counter once per user.
# Only the column the counter reads is passed to apply(): letting
# DataFrameGroupBy.apply operate on the grouping column is deprecated in
# recent pandas releases, and `user_id` must stay in fraud_df. The results
# keep the original row index (group_keys=False), so they are written back
# by index alignment.
_window_cols = ['transactions_last_24h', 'transactions_last_7d', 'transactions_last_30d']
_window_counts = (
    fraud_df.groupby('user_id', group_keys=False)[['purchase_time_sec']]
    .apply(calculate_window_counts)
)
fraud_df[_window_cols] = _window_counts[_window_cols]

# The epoch-seconds helper column is no longer needed.
fraud_df = fraud_df.drop(columns='purchase_time_sec')

print("✓ Transaction frequency features created!")
print(fraud_df[['user_id', 'purchase_time', 'total_transactions', 
                'transactions_last_24h', 'transactions_last_7d', 'transactions_last_30d']].head(10))


In [None]:
# Velocity: average number of transactions per hour over the trailing 24 hours.
_HOURS_IN_WINDOW = 24.0
fraud_df['velocity_last_24h'] = fraud_df['transactions_last_24h'] / _HOURS_IN_WINDOW

# Mean purchase value of each user, joined back onto every row of that user.
_mean_value_per_user = (
    fraud_df.groupby('user_id')['purchase_value']
    .mean()
    .rename('avg_purchase_value')
    .reset_index()
)
fraud_df = fraud_df.merge(_mean_value_per_user, on='user_id', how='left')

# Signed distance of this purchase from the user's own mean.
fraud_df['purchase_value_deviation'] = (
    fraud_df['purchase_value'] - fraud_df['avg_purchase_value']
)

print("Velocity and deviation features created:")
print(fraud_df[['user_id', 'velocity_last_24h', 'avg_purchase_value', 'purchase_value_deviation']].head(10))

# Descriptive statistics for the frequency and velocity columns.
_frequency_cols = [
    'total_transactions',
    'transactions_last_24h',
    'transactions_last_7d',
    'transactions_last_30d',
    'velocity_last_24h',
]
print("\nTransaction Frequency/Velocity Features Summary:")
print(fraud_df[_frequency_cols].describe())





### 1.3 Geolocation Features (if not done in EDA)


In [None]:
# Convert IP addresses to integer format
def ip_to_int(ip_str):
    """Convert an IPv4 address to its 32-bit integer value.

    Args:
        ip_str: Either a dotted-quad string such as ``"192.168.0.1"`` or a
            number that already holds the integer form of the address.
            Numeric values are truncated toward zero, so a float such as
            ``732758368.8`` becomes ``732758368``.

    Returns:
        The address as an ``int`` in ``[0, 2**32 - 1]``, or ``None`` when the
        value is missing (None/NaN), out of range, or not a valid IPv4 address.
    """
    if isinstance(ip_str, (int, float, np.integer, np.floating)):
        try:
            value = int(ip_str)
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and infinity (OverflowError).
            return None
        return value if 0 <= value <= 0xFFFFFFFF else None

    try:
        return int(ipaddress.IPv4Address(ip_str))
    except (ValueError, TypeError):
        # AddressValueError subclasses ValueError; anything that is not a
        # well-formed IPv4 address ends up here.
        return None

# Integer form of every transaction IP and of both range bounds in the lookup table.
fraud_df['ip_address_int'] = fraud_df['ip_address'].apply(ip_to_int)
for _source_col, _int_col in (
    ('lower_bound_ip_address', 'lower_bound_int'),
    ('upper_bound_ip_address', 'upper_bound_int'),
):
    ip_country_df[_int_col] = ip_country_df[_source_col].apply(ip_to_int)

# Range-based IP -> country lookup
def find_country(ip_int, ip_country_df):
    """Return the country whose IP range contains ``ip_int``.

    Args:
        ip_int: Integer form of the address; None/NaN means "unknown".
        ip_country_df: Lookup table with ``lower_bound_int``,
            ``upper_bound_int`` and ``country`` columns.

    Returns:
        The ``country`` of the first row (in table order) whose inclusive
        range covers ``ip_int``, or ``None`` when the address is missing or
        no range matches.
    """
    if pd.isna(ip_int):
        return None

    above_lower = ip_country_df['lower_bound_int'] <= ip_int
    below_upper = ip_country_df['upper_bound_int'] >= ip_int
    hits = ip_country_df[above_lower & below_upper]
    if len(hits) == 0:
        return None
    return hits.iloc[0]['country']

# Resolve every transaction IP to a country. Each row triggers its own scan
# of the range table.
fraud_df['country'] = fraud_df['ip_address_int'].apply(
    lambda ip_value: find_country(ip_value, ip_country_df)
)

_country_known = fraud_df['country'].notna()
print(f"Matched countries for {_country_known.sum()} IP addresses")
print(f"Unmatched: {(~_country_known).sum()}")

# Per-country fraud rate, added only when at least one IP was resolved.
# NOTE(review): the rate is the mean of the `class` target over every row of
# fraud_df, so it is computed before any train/test split and also reflects
# rows that later land in the test set.
if _country_known.any():
    _rate_per_country = (
        fraud_df.groupby('country')['class']
        .mean()
        .rename('country_fraud_rate')
        .reset_index()
    )
    fraud_df = fraud_df.merge(_rate_per_country, on='country', how='left')
    print("\nCountry fraud rate feature created")


### 1.4 Data Transformation for E-commerce Data


In [None]:
# Prepare features for modeling
# Columns that go into the model, split by how they are transformed later.
numerical_features = ['purchase_value', 'age', 'hour_of_day', 'day_of_week', 'time_since_signup',
                      'total_transactions', 'transactions_last_24h', 'transactions_last_7d', 
                      'transactions_last_30d', 'velocity_last_24h', 'avg_purchase_value', 
                      'purchase_value_deviation']

categorical_features = ['source', 'browser', 'sex']

# The country fraud rate exists only when the geolocation step resolved at
# least one IP address.
if 'country_fraud_rate' in fraud_df.columns:
    numerical_features.append('country_fraud_rate')

# Work on a copy so the engineered frame stays available unchanged.
fraud_df_processed = fraud_df.copy()

# Impute by assigning the filled column back. Calling
# `df[col].fillna(..., inplace=True)` modifies a temporary Series and leaves
# the DataFrame unchanged once pandas Copy-on-Write is active.
# Missing numerical values -> column median.
for col in numerical_features:
    if fraud_df_processed[col].isnull().any():
        fraud_df_processed[col] = fraud_df_processed[col].fillna(fraud_df_processed[col].median())

# Missing categorical values -> explicit 'Unknown' category.
for col in categorical_features:
    if fraud_df_processed[col].isnull().any():
        fraud_df_processed[col] = fraud_df_processed[col].fillna('Unknown')

print("Missing values handled")
print(fraud_df_processed[numerical_features + categorical_features].isnull().sum())


In [None]:
# Split into features and target
X_fraud = fraud_df_processed[numerical_features + categorical_features].copy()
y_fraud = fraud_df_processed['class'].copy()

# Split into train and test sets BEFORE any transformation (to avoid data leakage)
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# `country_fraud_rate` was derived from the `class` target over ALL rows, so
# the test labels are baked into it. Re-derive it from the training labels
# only and apply that mapping to both splits. Countries that are missing or
# never seen in training fall back to the overall training fraud rate.
# (Training rows still see their own label in their country's rate; the test
# split is now free of its own labels.)
if 'country_fraud_rate' in X_fraud.columns and 'country' in fraud_df_processed.columns:
    X_train_fraud = X_train_fraud.copy()
    X_test_fraud = X_test_fraud.copy()

    train_country = fraud_df_processed.loc[X_train_fraud.index, 'country']
    test_country = fraud_df_processed.loc[X_test_fraud.index, 'country']

    train_rate_by_country = y_train_fraud.groupby(train_country).mean()
    overall_train_rate = y_train_fraud.mean()

    X_train_fraud['country_fraud_rate'] = (
        train_country.map(train_rate_by_country).astype(float).fillna(overall_train_rate)
    )
    X_test_fraud['country_fraud_rate'] = (
        test_country.map(train_rate_by_country).astype(float).fillna(overall_train_rate)
    )

print(f"Training set shape: {X_train_fraud.shape}")
print(f"Test set shape: {X_test_fraud.shape}")
print(f"\nTraining class distribution:")
print(y_train_fraud.value_counts())
print(f"\nTest class distribution:")
print(y_test_fraud.value_counts())


In [None]:
# Standardise the numerical columns. The scaler learns its statistics from
# the training split only and is then applied unchanged to the test split.
scaler_fraud = StandardScaler()
_train_numeric_scaled = scaler_fraud.fit_transform(X_train_fraud[numerical_features])
_test_numeric_scaled = scaler_fraud.transform(X_test_fraud[numerical_features])

# Copies keep the categorical columns and the original row index intact.
X_train_fraud_scaled = X_train_fraud.copy()
X_train_fraud_scaled[numerical_features] = _train_numeric_scaled

X_test_fraud_scaled = X_test_fraud.copy()
X_test_fraud_scaled[numerical_features] = _test_numeric_scaled

print("Numerical features scaled")
print("\nScaled training data sample:")
print(X_train_fraud_scaled[numerical_features].head())


In [None]:
# One-hot encode the categorical columns. The first level of each feature is
# dropped, and categories unseen during fitting are ignored at transform time.
encoder_fraud = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Category levels are learned from the training split only.
X_train_categorical_encoded = encoder_fraud.fit_transform(X_train_fraud[categorical_features])
X_test_categorical_encoded = encoder_fraud.transform(X_test_fraud[categorical_features])

# Column labels generated by the encoder.
categorical_feature_names = encoder_fraud.get_feature_names_out(categorical_features)

# Wrap the encoded arrays in frames that share the row index of their split,
# so they line up with the scaled numerical columns.
X_train_cat_df = pd.DataFrame(
    X_train_categorical_encoded,
    columns=categorical_feature_names,
    index=X_train_fraud.index,
)
X_test_cat_df = pd.DataFrame(
    X_test_categorical_encoded,
    columns=categorical_feature_names,
    index=X_test_fraud.index,
)

# Final design matrices: scaled numericals followed by the one-hot columns.
X_train_fraud_final = pd.concat(
    [X_train_fraud_scaled[numerical_features], X_train_cat_df], axis=1
)
X_test_fraud_final = pd.concat(
    [X_test_fraud_scaled[numerical_features], X_test_cat_df], axis=1
)

print("Categorical features encoded")
print(f"\nFinal feature count: {X_train_fraud_final.shape[1]}")
print(f"\nFeature names: {X_train_fraud_final.columns.tolist()}")
print("\nFinal training data sample:")
print(X_train_fraud_final.head())


### 1.5 Handle Class Imbalance for E-commerce Data


In [None]:
# Class balance of the training split before any resampling.
_legit_mask = y_train_fraud == 0
_fraud_mask = y_train_fraud == 1
_legit_total = _legit_mask.sum()
_fraud_total = _fraud_mask.sum()

print("Class Distribution BEFORE Resampling:")
print("Training set:")
print(f"  Non-fraudulent (0): {_legit_total:,} ({_legit_mask.mean()*100:.2f}%)")
print(f"  Fraudulent (1): {_fraud_total:,} ({_fraud_mask.mean()*100:.2f}%)")
print(f"  Imbalance Ratio: {_legit_total / _fraud_total:.2f}:1")

# Two-panel figure; only the left ("before") panel is drawn in this cell.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
_before_ax = axes[0]

before_counts = y_train_fraud.value_counts()
_before_ax.bar(
    ['Non-Fraudulent (0)', 'Fraudulent (1)'],
    before_counts.values,
    color=['skyblue', 'coral'],
)
_before_ax.set_title('Class Distribution BEFORE Resampling')
_before_ax.set_ylabel('Count')
_before_ax.tick_params(axis='x', rotation=45)

# Count label on top of each bar.
for _bar_pos in (0, 1):
    _before_ax.text(
        _bar_pos,
        before_counts[_bar_pos],
        f'{before_counts[_bar_pos]:,}',
        ha='center',
        va='bottom',
    )

plt.tight_layout()
save_plot(fig, 'fraud_smote_before_after_comparison.png')
plt.show()


In [None]:
# Apply SMOTE to the training split only.
# SMOTE creates synthetic samples of the minority class.
print("\nApplying SMOTE...")

# sampling_strategy=0.5 is the target minority/majority ratio: fraud rows are
# oversampled until they number half of the non-fraud rows (about one third
# of the resampled training set), not 50% of it.
smote_fraud = SMOTE(random_state=42, sampling_strategy=0.5)
X_train_fraud_balanced, y_train_fraud_balanced = smote_fraud.fit_resample(X_train_fraud_final, y_train_fraud)

print(f"\nClass Distribution AFTER SMOTE:")
print(f"Training set:")
print(f"  Non-fraudulent (0): {(y_train_fraud_balanced == 0).sum():,} ({(y_train_fraud_balanced == 0).mean()*100:.2f}%)")
print(f"  Fraudulent (1): {(y_train_fraud_balanced == 1).sum():,} ({(y_train_fraud_balanced == 1).mean()*100:.2f}%)")
print(f"  New Ratio: {(y_train_fraud_balanced == 0).sum() / (y_train_fraud_balanced == 1).sum():.2f}:1")

# Counts are re-ordered to [0, 1] explicitly so each bar always matches its
# label, whichever class happens to be more frequent.
class_labels = ['Non-Fraudulent (0)', 'Fraudulent (1)']
bar_colors = ['skyblue', 'coral']
before_counts = pd.Series(y_train_fraud).value_counts().reindex([0, 1], fill_value=0)
after_counts = pd.Series(y_train_fraud_balanced).value_counts().reindex([0, 1], fill_value=0)

# Build the before/after comparison on a figure created in this cell, so the
# cell does not depend on a figure object left over from an earlier cell
# (which the notebook backend may already have closed).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, counts, title in (
    (axes[0], before_counts, 'Class Distribution BEFORE Resampling'),
    (axes[1], after_counts, 'Class Distribution AFTER SMOTE'),
):
    ax.bar(class_labels, counts.values, color=bar_colors)
    ax.set_title(title)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    for position, class_value in enumerate((0, 1)):
        ax.text(position, counts[class_value], f'{counts[class_value]:,}', ha='center', va='bottom')

fig.tight_layout()
save_plot(fig, 'fraud_smote_before_after_comparison.png')
plt.show()

# Separate figure showing only the post-SMOTE distribution.
fig_post, ax_post = plt.subplots(figsize=(8, 5))
ax_post.bar(class_labels, after_counts.values, color=bar_colors)
ax_post.set_title('Class Distribution AFTER SMOTE (E-commerce Data)')
ax_post.set_ylabel('Count')
ax_post.tick_params(axis='x', rotation=45)
for position, class_value in enumerate((0, 1)):
    ax_post.text(position, after_counts[class_value], f'{after_counts[class_value]:,}', ha='center', va='bottom')
fig_post.tight_layout()
save_plot(fig_post, 'fraud_post_smote_distribution.png')
plt.show()

print(f"\nTraining set shape after SMOTE: {X_train_fraud_balanced.shape}")


In [None]:
# Optional convenience cell: redraw the post-SMOTE chart from the balanced
# labels already held in this session, without running SMOTE again.
try:
    if 'y_train_fraud_balanced' not in locals():
        # Nothing to plot yet.
        print("⚠ Balanced data not found. Please run the SMOTE cell first (Cell 19).")
    else:
        print("Using existing balanced data from current session")
        after_counts = pd.Series(y_train_fraud_balanced).value_counts()

        # Single-panel chart of the resampled class counts.
        fig_post, ax_post = plt.subplots(figsize=(8, 5))
        ax_post.bar(
            ['Non-Fraudulent (0)', 'Fraudulent (1)'],
            after_counts.values,
            color=['skyblue', 'coral'],
        )
        ax_post.set_title('Class Distribution AFTER SMOTE (E-commerce Data)')
        ax_post.set_ylabel('Count')
        ax_post.tick_params(axis='x', rotation=45)
        for _bar_pos in (0, 1):
            ax_post.text(
                _bar_pos,
                after_counts[_bar_pos],
                f'{after_counts[_bar_pos]:,}',
                ha='center',
                va='bottom',
            )
        plt.tight_layout()
        save_plot(fig_post, 'fraud_post_smote_distribution.png')
        plt.show()
        print("✓ Post-SMOTE visualization saved!")
except Exception as e:
    # Best-effort cell: report the problem instead of stopping the notebook.
    print(f"Error: {e}")
    print("Please ensure SMOTE has completed running.")


In [None]:
# Persist the processed e-commerce splits: resampled training data and the
# untouched (scaled + encoded) test data.
X_train_fraud_balanced_df = pd.DataFrame(X_train_fraud_balanced, columns=X_train_fraud_final.columns)
y_train_fraud_balanced_df = pd.DataFrame(y_train_fraud_balanced, columns=['class'])
X_test_fraud_final_df = X_test_fraud_final.copy()
y_test_fraud_df = pd.DataFrame(y_test_fraud, columns=['class'])

# Every file is written without the index column.
for _frame, _file_name in (
    (X_train_fraud_balanced_df, 'fraud_train_features.csv'),
    (y_train_fraud_balanced_df, 'fraud_train_target.csv'),
    (X_test_fraud_final_df, 'fraud_test_features.csv'),
    (y_test_fraud_df, 'fraud_test_target.csv'),
):
    _frame.to_csv(PROCESSED_DIR / _file_name, index=False)

print("E-commerce fraud data saved to processed folder")


## Part 2: Credit Card Fraud Data Feature Engineering


In [None]:
# Read the raw credit card transactions.
_cc_path = DATA_DIR / 'creditcard.csv'
cc_df = pd.read_csv(_cc_path)

print(f"Credit Card Data Shape: {cc_df.shape}")
print(f"Columns: {cc_df.columns.tolist()}")
cc_df.head()


### 2.1 Data Transformation for Credit Card Data


In [None]:
# The credit card columns are used as they are, so this step only separates
# the target from the features and makes a stratified 80/20 split.
y_cc = cc_df['Class']
X_cc = cc_df.drop(columns='Class')

X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_cc,
    y_cc,
    test_size=0.2,
    random_state=42,
    stratify=y_cc,
)

print(f"Training set shape: {X_train_cc.shape}")
print(f"Test set shape: {X_test_cc.shape}")
print("\nTraining class distribution:")
print(y_train_cc.value_counts())
print("\nTest class distribution:")
print(y_test_cc.value_counts())


In [None]:
# Standardise every credit card feature column. Statistics come from the
# training split only; the test split reuses them.
scaler_cc = StandardScaler()
_cc_train_values = scaler_cc.fit_transform(X_train_cc)
_cc_test_values = scaler_cc.transform(X_test_cc)

# Rewrap the scaled arrays with the original column names and row index.
X_train_cc_scaled = pd.DataFrame(
    _cc_train_values, columns=X_train_cc.columns, index=X_train_cc.index
)
X_test_cc_scaled = pd.DataFrame(
    _cc_test_values, columns=X_test_cc.columns, index=X_test_cc.index
)

print("Credit card features scaled")
print("\nScaled training data sample:")
print(X_train_cc_scaled.head())


### 2.2 Handle Class Imbalance for Credit Card Data


In [None]:
# Class balance of the credit card training split before resampling.
_cc_legit_mask = y_train_cc == 0
_cc_fraud_mask = y_train_cc == 1
_cc_legit_total = _cc_legit_mask.sum()
_cc_fraud_total = _cc_fraud_mask.sum()

print("Class Distribution BEFORE Resampling:")
print("Training set:")
print(f"  Non-fraudulent (0): {_cc_legit_total:,} ({_cc_legit_mask.mean()*100:.4f}%)")
print(f"  Fraudulent (1): {_cc_fraud_total:,} ({_cc_fraud_mask.mean()*100:.4f}%)")
print(f"  Imbalance Ratio: {_cc_legit_total / _cc_fraud_total:.2f}:1")

# Two-panel figure; only the left ("before") panel is drawn in this cell.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
_cc_before_ax = axes[0]

before_counts_cc = y_train_cc.value_counts()
_cc_before_ax.bar(
    ['Non-Fraudulent (0)', 'Fraudulent (1)'],
    before_counts_cc.values,
    color=['skyblue', 'coral'],
)
_cc_before_ax.set_title('Class Distribution BEFORE Resampling')
_cc_before_ax.set_ylabel('Count')
_cc_before_ax.tick_params(axis='x', rotation=45)

# Count label on top of each bar.
for _bar_pos in (0, 1):
    _cc_before_ax.text(
        _bar_pos,
        before_counts_cc[_bar_pos],
        f'{before_counts_cc[_bar_pos]:,}',
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()


In [None]:
# Optional convenience cell: redraw the credit card post-SMOTE chart from the
# balanced labels already held in this session, without running SMOTE again.
try:
    if 'y_train_cc_balanced' not in locals():
        # Nothing to plot yet.
        print("⚠ Balanced data not found. Please run the SMOTE cell first (Cell 28).")
    else:
        print("Using existing balanced data from current session")
        after_counts_cc = pd.Series(y_train_cc_balanced).value_counts()

        # Single-panel chart of the resampled class counts.
        fig_post_cc, ax_post_cc = plt.subplots(figsize=(8, 5))
        ax_post_cc.bar(
            ['Non-Fraudulent (0)', 'Fraudulent (1)'],
            after_counts_cc.values,
            color=['skyblue', 'coral'],
        )
        ax_post_cc.set_title('Class Distribution AFTER SMOTE (Credit Card Data)')
        ax_post_cc.set_ylabel('Count')
        ax_post_cc.tick_params(axis='x', rotation=45)
        for _bar_pos in (0, 1):
            ax_post_cc.text(
                _bar_pos,
                after_counts_cc[_bar_pos],
                f'{after_counts_cc[_bar_pos]:,}',
                ha='center',
                va='bottom',
            )
        plt.tight_layout()
        save_plot(fig_post_cc, 'creditcard_post_smote_distribution.png')
        plt.show()
        print("✓ Post-SMOTE visualization saved!")
except Exception as e:
    # Best-effort cell: report the problem instead of stopping the notebook.
    print(f"Error: {e}")
    print("Please ensure SMOTE has completed running.")


In [None]:
# Apply SMOTE to the credit card training split only.
# Note: For extremely imbalanced data, we use a lower sampling_strategy.
print("\nApplying SMOTE...")

# sampling_strategy=0.1 is the target minority/majority ratio: fraud rows are
# oversampled until they number one tenth of the non-fraud rows (about 9% of
# the resampled training set).
smote_cc = SMOTE(random_state=42, sampling_strategy=0.1)
X_train_cc_balanced, y_train_cc_balanced = smote_cc.fit_resample(X_train_cc_scaled, y_train_cc)

print(f"\nClass Distribution AFTER SMOTE:")
print(f"Training set:")
print(f"  Non-fraudulent (0): {(y_train_cc_balanced == 0).sum():,} ({(y_train_cc_balanced == 0).mean()*100:.2f}%)")
print(f"  Fraudulent (1): {(y_train_cc_balanced == 1).sum():,} ({(y_train_cc_balanced == 1).mean()*100:.2f}%)")
print(f"  New Ratio: {(y_train_cc_balanced == 0).sum() / (y_train_cc_balanced == 1).sum():.2f}:1")

# Counts are re-ordered to [0, 1] explicitly so each bar always matches its
# label, whichever class happens to be more frequent.
class_labels_cc = ['Non-Fraudulent (0)', 'Fraudulent (1)']
bar_colors_cc = ['skyblue', 'coral']
before_counts_cc = pd.Series(y_train_cc).value_counts().reindex([0, 1], fill_value=0)
after_counts_cc = pd.Series(y_train_cc_balanced).value_counts().reindex([0, 1], fill_value=0)

# Build the before/after comparison on a figure created in this cell, so the
# cell does not depend on a figure object left over from an earlier cell
# (which the notebook backend may already have closed).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, counts, title in (
    (axes[0], before_counts_cc, 'Class Distribution BEFORE Resampling'),
    (axes[1], after_counts_cc, 'Class Distribution AFTER SMOTE'),
):
    ax.bar(class_labels_cc, counts.values, color=bar_colors_cc)
    ax.set_title(title)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    for position, class_value in enumerate((0, 1)):
        ax.text(position, counts[class_value], f'{counts[class_value]:,}', ha='center', va='bottom')

fig.tight_layout()
save_plot(fig, 'creditcard_smote_before_after_comparison.png')
plt.show()

# Separate figure showing only the post-SMOTE distribution.
fig_post_cc, ax_post_cc = plt.subplots(figsize=(8, 5))
ax_post_cc.bar(class_labels_cc, after_counts_cc.values, color=bar_colors_cc)
ax_post_cc.set_title('Class Distribution AFTER SMOTE (Credit Card Data)')
ax_post_cc.set_ylabel('Count')
ax_post_cc.tick_params(axis='x', rotation=45)
for position, class_value in enumerate((0, 1)):
    ax_post_cc.text(position, after_counts_cc[class_value], f'{after_counts_cc[class_value]:,}', ha='center', va='bottom')
fig_post_cc.tight_layout()
save_plot(fig_post_cc, 'creditcard_post_smote_distribution.png')
plt.show()

print(f"\nTraining set shape after SMOTE: {X_train_cc_balanced.shape}")


In [None]:
# Persist the processed credit card splits: resampled training data and the
# scaled test data.
X_train_cc_balanced_df = pd.DataFrame(X_train_cc_balanced, columns=X_train_cc_scaled.columns)
y_train_cc_balanced_df = pd.DataFrame(y_train_cc_balanced, columns=['Class'])
X_test_cc_scaled_df = X_test_cc_scaled.copy()
y_test_cc_df = pd.DataFrame(y_test_cc, columns=['Class'])

# Every file is written without the index column.
for _frame, _file_name in (
    (X_train_cc_balanced_df, 'creditcard_train_features.csv'),
    (y_train_cc_balanced_df, 'creditcard_train_target.csv'),
    (X_test_cc_scaled_df, 'creditcard_test_features.csv'),
    (y_test_cc_df, 'creditcard_test_target.csv'),
):
    _frame.to_csv(PROCESSED_DIR / _file_name, index=False)

print("Credit card data saved to processed folder")


## Summary

### E-commerce Fraud Data:
1. **Features Created:**
   - Time-based: hour_of_day, day_of_week, time_since_signup
   - Transaction frequency: total_transactions, transactions_last_24h/7d/30d
   - Transaction velocity: velocity_last_24h
   - Purchase patterns: avg_purchase_value, purchase_value_deviation
   - Geolocation: country, country_fraud_rate

2. **Data Transformation:**
   - Numerical features: StandardScaler
   - Categorical features: One-Hot Encoding

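3. **Class Imbalance Handling:**
   - Method: SMOTE (sampling_strategy=0.5), applied to the training split only. The fraud class is oversampled until it reaches 50% of the non-fraud count.
   - Before: [Document ratio — copy the "Imbalance Ratio" printed in section 1.5]
   - After: approximately 2:1 non-fraud to fraud, as implied by sampling_strategy=0.5 (confirm against the "New Ratio" printed in section 1.5)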

### Credit Card Fraud Data:
1. **Data Transformation:**
   - All features scaled using StandardScaler

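2. **Class Imbalance Handling:**
   - Method: SMOTE (sampling_strategy=0.1), applied to the training split only. The fraud class is oversampled until it reaches 10% of the non-fraud count.
   - Before: [Document ratio — copy the "Imbalance Ratio" printed in section 2.2]
   - After: approximately 10:1 non-fraud to fraud, as implied by sampling_strategy=0.1 (confirm against the "New Ratio" printed in section 2.2)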

### Next Steps:
- Model training and evaluation
- Model selection and comparison
- Model explainability analysis
