# Feature Engineering

## Overview
This notebook creates derived features for customer and product analysis.

## Features Created

### Transaction-Level Features
| Feature | Formula | Purpose |
|---------|---------|--------|
| TotalAmount | Quantity x UnitPrice | Order line value |
| PriceCategory | Tercile (3-quantile) binning of UnitPrice into Low/Medium/High | Price tier analysis |
| IsCancellation | InvoiceNo starts with 'C' | Return identification |

### Time-Based Features
| Feature | Derivation | Purpose |
|---------|------------|--------|
| DayOfWeek | From InvoiceDate | Day patterns |
| Month | From InvoiceDate | Seasonal trends |
| Hour | From InvoiceDate | Time-of-day patterns |
| IsWeekend | Saturday/Sunday flag | Weekend vs weekday |

### Customer-Level Features (RFM-adjacent)
| Feature | Calculation | Purpose |
|---------|-------------|--------|
| CustomerLifetimeValue | Sum of TotalAmount per customer | Customer value ranking |
| AvgOrderValue | Mean TotalAmount per customer (averaged over line items, not invoices) | Spending behavior |
| PurchaseFrequency | Count of unique invoices | Loyalty indicator |
| IsReturningCustomer | >1 purchase flag | Retention analysis |

### Product-Level Features
| Feature | Calculation | Purpose |
|---------|-------------|--------|
| ProductPopularity | Sum of quantity per product | Demand ranking |

In [None]:
import pandas as pd
import numpy as np
import warnings
# NOTE: blanket filter - no category or module is given, so every warning
# (including any raised by pandas) is hidden for the rest of the session.
warnings.filterwarnings('ignore')

## Helper Functions

In [None]:
def create_transaction_features(df):
    """
    Add per-line transaction features to a copy of the input.

    Args:
        df: DataFrame with Quantity, UnitPrice, InvoiceNo columns

    Returns:
        Copy of df with TotalAmount, PriceCategory and IsCancellation added
    """
    enriched = df.copy()

    # Line value: quantity times unit price, rounded to 2 decimals
    line_value = enriched['Quantity'].mul(enriched['UnitPrice'])
    enriched['TotalAmount'] = line_value.round(2)

    # Price tier: prices are floored at 0.01 before binning, so zero or
    # negative prices are binned as 0.01 rather than being excluded
    floored_prices = enriched['UnitPrice'].clip(lower=0.01)
    tier_labels = ['Low', 'Medium', 'High']
    enriched['PriceCategory'] = pd.qcut(floored_prices, q=3, labels=tier_labels)

    # Cancellation flag: 1 when the invoice number (as text) begins with 'C'
    invoice_text = enriched['InvoiceNo'].astype(str)
    enriched['IsCancellation'] = invoice_text.str.startswith('C').astype(int)

    return enriched


def create_time_features(df):
    """
    Derive calendar and clock features from InvoiceDate on a copy of the input.

    Args:
        df: DataFrame with an InvoiceDate column (datetime, or anything
            pd.to_datetime can parse)

    Returns:
        Copy of df with InvoiceDate converted to datetime and DayOfWeek,
        Month, MonthNum, Hour, YearMonth and IsWeekend added
    """
    out = df.copy()

    # Normalise to datetime so the .dt accessor is available
    out['InvoiceDate'] = pd.to_datetime(out['InvoiceDate'])
    stamp = out['InvoiceDate'].dt

    out['DayOfWeek'] = stamp.day_name()
    out['Month'] = stamp.month_name()
    out['MonthNum'] = stamp.month          # numeric month, sorts chronologically
    out['Hour'] = stamp.hour
    out['YearMonth'] = stamp.to_period('M')  # monthly period for trend analysis

    # Weekend flag computed with a vectorized membership test on the day name
    weekend_days = ['Saturday', 'Sunday']
    out['IsWeekend'] = out['DayOfWeek'].isin(weekend_days).astype(int)

    return out


def create_customer_features(df):
    """
    Create customer-level aggregated features.

    Safe to call more than once on the same frame: metric columns left over
    from an earlier call are replaced rather than duplicated with
    ``_x``/``_y`` merge suffixes.

    Args:
        df: DataFrame with CustomerID, TotalAmount, InvoiceNo and
            InvoiceDate columns

    Returns:
        New DataFrame with CustomerLifetimeValue, AvgOrderValue,
        PurchaseFrequency, TotalTransactions, FirstPurchase, LastPurchase
        and IsReturningCustomer merged in on CustomerID. The input frame
        is not modified.
    """
    metric_columns = [
        'CustomerLifetimeValue',
        'AvgOrderValue',
        'PurchaseFrequency',
        'TotalTransactions',
        'FirstPurchase',
        'LastPurchase',
        'IsReturningCustomer',
    ]

    # Remove metrics from a previous run so the merge below cannot produce
    # suffixed duplicates. drop() returns a new frame, so the caller's
    # DataFrame is left untouched (this replaces the explicit copy).
    df = df.drop(columns=metric_columns, errors='ignore')

    # Calculate all customer metrics in a single groupby operation
    # This is more efficient than multiple separate groupby + merge operations
    customer_metrics = df.groupby('CustomerID').agg(
        CustomerLifetimeValue=('TotalAmount', 'sum'),
        AvgOrderValue=('TotalAmount', 'mean'),  # mean over line items
        PurchaseFrequency=('InvoiceNo', 'nunique'),
        TotalTransactions=('InvoiceNo', 'count'),
        FirstPurchase=('InvoiceDate', 'min'),
        LastPurchase=('InvoiceDate', 'max')
    ).round(2).reset_index()

    # Returning customer flag (more than 1 unique invoice)
    customer_metrics['IsReturningCustomer'] = (
        customer_metrics['PurchaseFrequency'] > 1
    ).astype(int)

    # Merge back to transaction data - single merge instead of multiple
    return df.merge(customer_metrics, on='CustomerID', how='left')


def create_product_features(df):
    """
    Add product-level features to a copy of the input.

    Args:
        df: DataFrame with StockCode and Quantity columns

    Returns:
        Copy of df with ProductPopularity added
    """
    enriched = df.copy()

    # Total quantity per StockCode, broadcast back onto every row of that
    # product via transform() so the result lines up with the original index
    quantity_by_product = enriched['Quantity'].groupby(enriched['StockCode'])
    enriched['ProductPopularity'] = quantity_by_product.transform('sum')

    return enriched

## 1. Load Cleaned Data

In [None]:
# Read the identifier columns as text. Left to type inference, pandas parses
# the file in chunks and can leave a column holding a mix of ints and strings
# (e.g. 22423 vs '22423'), which would split groupby('StockCode') and inflate
# nunique() on InvoiceNo.
df = pd.read_csv(
    'clean_data.csv',
    dtype={'InvoiceNo': str, 'StockCode': str},
)

print(f"Loaded {len(df):,} rows")
print(f"Columns: {list(df.columns)}")
df.head()

## 2. Create Features

In [None]:
# Run the feature builders one after another, announcing each step.
# Order matters: the customer step aggregates TotalAmount, which the
# transaction step creates.
feature_steps = [
    ("Creating transaction features...", create_transaction_features),
    ("Creating time features...", create_time_features),
    ("Creating product features...", create_product_features),
    ("Creating customer features...", create_customer_features),
]
for progress_message, build_features in feature_steps:
    print(progress_message)
    df = build_features(df)

print(f"\nFeature engineering complete. Shape: {df.shape}")

In [None]:
# List every column now present on the engineered frame
print("Columns after feature engineering:")
print(list(df.columns))

In [None]:
# Preview the first rows of the engineered frame (rendered as the cell output)
df.head()

## 3. Data Quality Check

Verify feature values make sense before proceeding.

In [None]:
# Descriptive statistics for the raw and derived numeric columns
print("=== Numeric Feature Summary ===")
numeric_cols = [
    'Quantity',
    'UnitPrice',
    'TotalAmount',
    'ProductPopularity',
    'CustomerLifetimeValue',
    'AvgOrderValue',
    'PurchaseFrequency',
]
df.loc[:, numeric_cols].describe().round(2)

In [None]:
# Count rows matching each suspicious condition
anomalies = {
    'Negative Quantity': df['Quantity'].lt(0).sum(),
    'Negative TotalAmount': df['TotalAmount'].lt(0).sum(),
    'Zero UnitPrice': df['UnitPrice'].eq(0).sum(),
    'Negative CLV': df['CustomerLifetimeValue'].lt(0).sum(),
    'Negative ProductPopularity': df['ProductPopularity'].lt(0).sum(),
    'Cancellations': df['IsCancellation'].sum(),
}

# Report each count together with its share of all rows
print("=== Anomaly Detection ===")
row_total = len(df)
for name, count in anomalies.items():
    pct = count / row_total * 100
    print(f"{name}: {count:,} ({pct:.2f}%)")

### Anomaly Explanation

| Anomaly | Cause | Treatment |
|---------|-------|----------|
| Negative Quantity | Returns/cancellations (InvoiceNo starts with 'C') | Keep for analysis, filter for sales-only views |
| Negative TotalAmount | Result of negative quantity x price | Same as above |
| Zero UnitPrice | Free items, samples, or promotional giveaways | Keep - legitimate transactions |
| Negative CLV | Customer with net refunds > purchases | Flag for investigation |
| Negative ProductPopularity | Products with more returns than sales | Clip to 0 for popularity ranking |

In [None]:
# Collect the distinct customers whose lifetime value is below zero
has_negative_clv = df['CustomerLifetimeValue'] < 0
negative_clv_customers = df.loc[has_negative_clv, 'CustomerID'].unique()
print(f"Customers with negative CLV: {len(negative_clv_customers)}")
print(f"Customer IDs: {negative_clv_customers}")

# Print up to ten line items for the first such customer, if there is one
if len(negative_clv_customers) > 0:
    print("\nSample transactions for customer with negative CLV:")
    sample_cust = negative_clv_customers[0]
    history_columns = ['InvoiceNo', 'Description', 'Quantity', 'UnitPrice', 'TotalAmount']
    sample_history = df.loc[df['CustomerID'] == sample_cust, history_columns]
    print(sample_history.head(10))

## 4. Create Analysis-Ready Dataset

For most analyses, we want to exclude:
- Cancellations/returns (negative quantities)
- Zero-price items (can't calculate meaningful averages)

We'll create a "clean" version while preserving the full dataset.

In [None]:
# Keep only genuine sales lines: positive quantity, positive line value and
# positive unit price (drops returns and zero-price items)
positive_quantity = df['Quantity'] > 0
positive_amount = df['TotalAmount'] > 0
positive_price = df['UnitPrice'] > 0
df_sales_only = df.loc[positive_quantity & positive_amount & positive_price].copy()

# ProductPopularity was summed over all rows, returns included, so it can be
# negative; floor it at 0 for ranking
df_sales_only['ProductPopularity'] = df_sales_only['ProductPopularity'].clip(lower=0)

rows_removed = len(df) - len(df_sales_only)
print(f"Full dataset: {len(df):,} rows")
print(f"Sales only (no returns/free items): {len(df_sales_only):,} rows")
print(f"Removed: {rows_removed:,} rows ({rows_removed/len(df)*100:.1f}%)")

In [None]:
# Customer totals and averages recomputed from the sales-only rows, so
# returns no longer pull the figures down
sales_aggregations = {
    'CLV_SalesOnly': ('TotalAmount', 'sum'),
    'AvgOrderValue_SalesOnly': ('TotalAmount', 'mean'),
}
per_customer_sales = df_sales_only.groupby('CustomerID').agg(**sales_aggregations)
sales_customer_metrics = per_customer_sales.round(2).reset_index()

# Attach the recomputed metrics to every sales row of the customer
df_sales_only = df_sales_only.merge(sales_customer_metrics, on='CustomerID', how='left')

print(
    "Added recalculated customer metrics (excluding returns):",
    "- CLV_SalesOnly",
    "- AvgOrderValue_SalesOnly",
    sep="\n",
)

In [None]:
# Print the minimum of each key column so a negative value would stand out
print(
    "=== Sales-Only Dataset Validation ===",
    f"Min Quantity: {df_sales_only['Quantity'].min()}",
    f"Min TotalAmount: {df_sales_only['TotalAmount'].min()}",
    f"Min CLV_SalesOnly: {df_sales_only['CLV_SalesOnly'].min()}",
    f"Min ProductPopularity: {df_sales_only['ProductPopularity'].min()}",
    sep="\n",
)

## 5. Feature Overview

In [None]:
# Shape and per-column dtypes of the sales-only frame
print(
    "=== Final Dataset Schema ===",
    f"\nShape: {df_sales_only.shape}",
    f"\nColumn types:",
    df_sales_only.dtypes,
    sep="\n",
)

In [None]:
# Preview the first rows of the sales-only frame (rendered as the cell output)
df_sales_only.head()

## 6. Export Engineered Data

In [None]:
# Write the sales-only frame (returns and zero-price rows already filtered
# out) rather than the full `df`; the index is not written
df_sales_only.to_csv('data_final.csv', index=False)

exported_rows, exported_columns = df_sales_only.shape
print(
    "Exported: data_final.csv",
    f"Rows: {exported_rows:,}",
    f"Columns: {exported_columns}",
    sep="\n",
)

# Recap of the feature groups produced by this notebook
print("\n=== Feature Engineering Summary ===")
summary_lines = (
    "Transaction features: TotalAmount, PriceCategory, IsCancellation",
    "Time features: DayOfWeek, Month, MonthNum, Hour, YearMonth, IsWeekend",
    "Customer features: CustomerLifetimeValue, AvgOrderValue, PurchaseFrequency, IsReturningCustomer",
    "Product features: ProductPopularity",
    "Clean metrics: CLV_SalesOnly, AvgOrderValue_SalesOnly",
)
for summary_line in summary_lines:
    print(summary_line)