In [1]:
# ============================================
# CELL 1: Setup and Data Loading
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("🛠️ FEATURE ENGINEERING FOR CHURN PREDICTION")
print("=" * 60)

# Load the dataset
df = pd.read_csv('../data/raw/telco_customer_churn.csv')

print(f"📊 Original Dataset: {df.shape}")
print(f"👥 Customers: {df.shape[0]:,}")
print(f"📋 Features: {df.shape[1]}")

# Create a copy for feature engineering
df_features = df.copy()

print(f"\n✅ Data loaded! Starting feature engineering... 🚀")


🛠️ FEATURE ENGINEERING FOR CHURN PREDICTION
📊 Original Dataset: (7043, 21)
👥 Customers: 7,043
📋 Features: 21

✅ Data loaded! Starting feature engineering... 🚀


In [2]:
# ============================================
# CELL 2: Data Quality Check and Cleaning
# ============================================

print("🔍 DATA QUALITY CHECK & CLEANING")
print("=" * 45)

# Check data types
print("📋 Current Data Types:")
print(df_features.dtypes)

# Check for missing values
missing_values = df_features.isnull().sum()
print(f"\n🔍 Missing Values:")
if missing_values.sum() == 0:
    print("✅ No missing values!")
else:
    for col in missing_values[missing_values > 0].index:
        print(f"   {col}: {missing_values[col]}")

# Check for data inconsistencies
print(f"\n🔍 Data Inconsistency Check:")

# TotalCharges should be numeric
if df_features['TotalCharges'].dtype == 'object':
    print("⚠️ TotalCharges is object type - needs conversion")
    
    # Check for non-numeric values
    non_numeric = df_features[df_features['TotalCharges'].str.contains(r'[^0-9.]', na=False)]
    print(f"   Non-numeric TotalCharges entries: {len(non_numeric)}")
    
    # Convert to numeric, replacing errors with NaN
    df_features['TotalCharges'] = pd.to_numeric(df_features['TotalCharges'], errors='coerce')
    
    # Fill NaN values (likely new customers with 0 total charges)
    df_features['TotalCharges'].fillna(0, inplace=True)
    print("✅ TotalCharges converted to numeric")

# Check for any remaining missing values
missing_after = df_features.isnull().sum().sum()
print(f"\n📊 Missing values after cleaning: {missing_after}")

# Basic statistics
print(f"\n📊 Dataset after cleaning: {df_features.shape}")


🔍 DATA QUALITY CHECK & CLEANING
📋 Current Data Types:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

🔍 Missing Values:
✅ No missing values!

🔍 Data Inconsistency Check:
⚠️ TotalCharges is object type - needs conversion
   Non-numeric TotalCharges entries: 11
✅ TotalCharges converted to numeric

📊 Missing values after cleaning: 0

📊 Dataset after cleaning: (7043, 21)


In [3]:
# ============================================
# CELL 3: Categorical Feature Engineering
# ============================================

print("📊 CATEGORICAL FEATURE ENGINEERING")
print("=" * 45)

# Identify categorical columns
categorical_cols = df_features.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('customerID')  # Remove ID
if 'Churn' in categorical_cols:
    categorical_cols.remove('Churn')  # Remove target

print(f"📋 Categorical columns to engineer ({len(categorical_cols)}):")
for col in categorical_cols:
    unique_vals = df_features[col].nunique()
    print(f"   • {col}: {unique_vals} categories")

# 1. Binary Categorical Features (Yes/No)
binary_cols = []
for col in categorical_cols:
    unique_vals = df_features[col].unique()
    if len(unique_vals) == 2 and set(unique_vals) <= {'Yes', 'No'}:
        binary_cols.append(col)

print(f"\n🔄 Converting binary columns to 0/1:")
for col in binary_cols:
    df_features[f'{col}_encoded'] = df_features[col].map({'No': 0, 'Yes': 1})
    print(f"   ✅ {col} → {col}_encoded")

# 2. Multi-category features with domain knowledge
print(f"\n🎯 Engineering multi-category features:")

# Internet Service - create dummy variables and risk score
if 'InternetService' in df_features.columns:
    # Create dummy variables
    internet_dummies = pd.get_dummies(df_features['InternetService'], prefix='Internet')
    df_features = pd.concat([df_features, internet_dummies], axis=1)
    
    # Create internet service risk score based on churn analysis
    internet_risk = {'No': 0, 'DSL': 1, 'Fiber optic': 2}
    df_features['InternetService_risk'] = df_features['InternetService'].map(internet_risk)
    print(f"   ✅ InternetService → dummies + risk score")

# Contract - create dummy variables and stability score
if 'Contract' in df_features.columns:
    contract_dummies = pd.get_dummies(df_features['Contract'], prefix='Contract')
    df_features = pd.concat([df_features, contract_dummies], axis=1)
    
    # Contract stability score (longer = more stable)
    contract_stability = {'Month-to-month': 1, 'One year': 2, 'Two year': 3}
    df_features['Contract_stability'] = df_features['Contract'].map(contract_stability)
    print(f"   ✅ Contract → dummies + stability score")

# Payment Method - create dummy variables and risk score
if 'PaymentMethod' in df_features.columns:
    payment_dummies = pd.get_dummies(df_features['PaymentMethod'], prefix='Payment')
    df_features = pd.concat([df_features, payment_dummies], axis=1)
    
    # Payment method risk (electronic check is typically highest risk)
    payment_risk = {
        'Credit card (automatic)': 1,
        'Bank transfer (automatic)': 1, 
        'Mailed check': 2,
        'Electronic check': 3
    }
    df_features['PaymentMethod_risk'] = df_features['PaymentMethod'].map(payment_risk)
    print(f"   ✅ PaymentMethod → dummies + risk score")

# 3. Handle service columns with "No internet service" or "No phone service"
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']

print(f"\n🔧 Processing service columns:")
for col in service_cols:
    if col in df_features.columns:
        # Create binary: has service (1) vs doesn't have service (0)
        df_features[f'{col}_binary'] = df_features[col].apply(
            lambda x: 1 if x == 'Yes' else 0
        )
        
        # Create availability: service available (1) vs not available (0)
        df_features[f'{col}_available'] = df_features[col].apply(
            lambda x: 0 if 'No internet service' in str(x) or 'No phone service' in str(x) else 1
        )
        print(f"   ✅ {col} → binary + availability features")

print(f"\n📊 Shape after categorical encoding: {df_features.shape}")


📊 CATEGORICAL FEATURE ENGINEERING
📋 Categorical columns to engineer (15):
   • gender: 2 categories
   • Partner: 2 categories
   • Dependents: 2 categories
   • PhoneService: 2 categories
   • MultipleLines: 3 categories
   • InternetService: 3 categories
   • OnlineSecurity: 3 categories
   • OnlineBackup: 3 categories
   • DeviceProtection: 3 categories
   • TechSupport: 3 categories
   • StreamingTV: 3 categories
   • StreamingMovies: 3 categories
   • Contract: 3 categories
   • PaperlessBilling: 2 categories
   • PaymentMethod: 4 categories

🔄 Converting binary columns to 0/1:
   ✅ Partner → Partner_encoded
   ✅ Dependents → Dependents_encoded
   ✅ PhoneService → PhoneService_encoded
   ✅ PaperlessBilling → PaperlessBilling_encoded

🎯 Engineering multi-category features:
   ✅ InternetService → dummies + risk score
   ✅ Contract → dummies + stability score
   ✅ PaymentMethod → dummies + risk score

🔧 Processing service columns:
   ✅ OnlineSecurity → binary + availability features


In [4]:
# ============================================
# CELL 4: Numerical Feature Engineering
# ============================================

print("🔢 NUMERICAL FEATURE ENGINEERING")
print("=" * 40)

# Current numerical columns
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen']

print(f"📋 Base numerical features: {numerical_cols}")

# 1. Tenure-based features
print(f"\n⏰ Engineering tenure-based features:")

# Tenure categories
def categorize_tenure(tenure):
    if tenure <= 12:
        return 'New'
    elif tenure <= 24:
        return 'Medium'
    elif tenure <= 48:
        return 'Long'
    else:
        return 'Loyal'

df_features['tenure_category'] = df_features['tenure'].apply(categorize_tenure)
tenure_dummies = pd.get_dummies(df_features['tenure_category'], prefix='Tenure')
df_features = pd.concat([df_features, tenure_dummies], axis=1)

# Tenure in years
df_features['tenure_years'] = df_features['tenure'] / 12

# Is new customer (high churn risk)
df_features['is_new_customer'] = (df_features['tenure'] <= 6).astype(int)

print(f"   ✅ Tenure categories, years, new customer flag created")

# 2. Charges-based features
print(f"\n💰 Engineering charges-based features:")

# Average monthly charge (total/tenure, but handle tenure=0)
df_features['avg_monthly_charge'] = np.where(
    df_features['tenure'] > 0,
    df_features['TotalCharges'] / df_features['tenure'],
    df_features['MonthlyCharges']
)

# Charge categories
def categorize_charges(charges):
    if charges <= 35:
        return 'Low'
    elif charges <= 65:
        return 'Medium'
    else:
        return 'High'

df_features['charges_category'] = df_features['MonthlyCharges'].apply(categorize_charges)
charges_dummies = pd.get_dummies(df_features['charges_category'], prefix='Charges')
df_features = pd.concat([df_features, charges_dummies], axis=1)

# Price sensitivity indicators
df_features['high_monthly_charges'] = (df_features['MonthlyCharges'] > df_features['MonthlyCharges'].quantile(0.75)).astype(int)
df_features['low_monthly_charges'] = (df_features['MonthlyCharges'] < df_features['MonthlyCharges'].quantile(0.25)).astype(int)

print(f"   ✅ Average charges, charge categories, price sensitivity features created")

# 3. Service count features
print(f"\n📱 Engineering service count features:")

# Count of services
service_binary_cols = [col for col in df_features.columns if col.endswith('_binary')]
df_features['total_services'] = df_features[service_binary_cols].sum(axis=1)

# Service adoption rate (services used / services available)
service_available_cols = [col for col in df_features.columns if col.endswith('_available')]
df_features['services_available'] = df_features[service_available_cols].sum(axis=1)
df_features['service_adoption_rate'] = np.where(
    df_features['services_available'] > 0,
    df_features['total_services'] / df_features['services_available'],
    0
)

# Premium services (streaming)
premium_services = ['StreamingTV_binary', 'StreamingMovies_binary']
if all(col in df_features.columns for col in premium_services):
    df_features['premium_services'] = df_features[premium_services].sum(axis=1)
    df_features['has_premium'] = (df_features['premium_services'] > 0).astype(int)

# Protection services
protection_services = ['OnlineSecurity_binary', 'OnlineBackup_binary', 'DeviceProtection_binary', 'TechSupport_binary']
if all(col in df_features.columns for col in protection_services):
    df_features['protection_services'] = df_features[protection_services].sum(axis=1)
    df_features['has_protection'] = (df_features['protection_services'] > 0).astype(int)

print(f"   ✅ Service counts, adoption rates, premium/protection indicators created")

print(f"\n📊 Shape after numerical engineering: {df_features.shape}")


🔢 NUMERICAL FEATURE ENGINEERING
📋 Base numerical features: ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen']

⏰ Engineering tenure-based features:
   ✅ Tenure categories, years, new customer flag created

💰 Engineering charges-based features:
   ✅ Average charges, charge categories, price sensitivity features created

📱 Engineering service count features:
   ✅ Service counts, adoption rates, premium/protection indicators created

📊 Shape after numerical engineering: (7043, 73)


In [5]:
# ============================================
# CELL 5: Advanced Feature Engineering
# ============================================

print("🎯 ADVANCED FEATURE ENGINEERING")
print("=" * 40)

# 1. Risk score combinations
print(f"\n⚠️ Creating composite risk scores:")

# Overall risk score (combination of known risk factors)
risk_factors = []
if 'Contract_stability' in df_features.columns:
    risk_factors.append('Contract_stability')
if 'PaymentMethod_risk' in df_features.columns:
    risk_factors.append('PaymentMethod_risk')
if 'InternetService_risk' in df_features.columns:
    risk_factors.append('InternetService_risk')

if risk_factors:
    # Normalize risk factors to 0-1 scale for combination
    for factor in risk_factors:
        max_val = df_features[factor].max()
        min_val = df_features[factor].min()
        df_features[f'{factor}_normalized'] = (df_features[factor] - min_val) / (max_val - min_val)
    
    # Composite risk score
    normalized_factors = [f'{factor}_normalized' for factor in risk_factors]
    df_features['composite_risk_score'] = df_features[normalized_factors].mean(axis=1)
    
print(f"   ✅ Composite risk score created from {len(risk_factors)} factors")

# 2. Customer lifecycle features
print(f"\n🔄 Creating customer lifecycle features:")

# Customer lifetime value estimate (tenure * monthly charges)
df_features['estimated_clv'] = df_features['tenure'] * df_features['MonthlyCharges']

# Revenue per month of tenure
df_features['revenue_per_tenure_month'] = df_features['TotalCharges'] / np.maximum(df_features['tenure'], 1)

# Spending trajectory (current vs average)
df_features['spending_above_avg'] = (df_features['MonthlyCharges'] > df_features['avg_monthly_charge']).astype(int)

print(f"   ✅ CLV, revenue metrics, spending patterns created")

# 3. Demographic and behavioral combinations
print(f"\n👥 Creating demographic interaction features:")

# Senior citizen + contract combination
if 'SeniorCitizen' in df_features.columns and 'Contract_stability' in df_features.columns:
    df_features['senior_short_contract'] = ((df_features['SeniorCitizen'] == 1) & 
                                           (df_features['Contract_stability'] == 1)).astype(int)

# Family indicators
family_cols = ['Partner_encoded', 'Dependents_encoded'] if all(col in df_features.columns for col in ['Partner_encoded', 'Dependents_encoded']) else []
if family_cols:
    df_features['family_size'] = df_features[family_cols].sum(axis=1)
    df_features['has_family'] = (df_features['family_size'] > 0).astype(int)

# Phone + Internet service combination
if all(col in df_features.columns for col in ['PhoneService_encoded', 'Internet_DSL', 'Internet_Fiber optic']):
    df_features['full_service_customer'] = ((df_features['PhoneService_encoded'] == 1) & 
                                           ((df_features['Internet_DSL'] == 1) | 
                                            (df_features['Internet_Fiber optic'] == 1))).astype(int)

print(f"   ✅ Demographic interactions created")

print(f"\n📊 Final shape after advanced engineering: {df_features.shape}")


🎯 ADVANCED FEATURE ENGINEERING

⚠️ Creating composite risk scores:
   ✅ Composite risk score created from 3 factors

🔄 Creating customer lifecycle features:
   ✅ CLV, revenue metrics, spending patterns created

👥 Creating demographic interaction features:
   ✅ Demographic interactions created

📊 Final shape after advanced engineering: (7043, 84)


In [6]:
# ============================================
# CELL 6: Feature Selection and Preparation
# ============================================

print("🎯 FEATURE SELECTION & PREPARATION")
print("=" * 45)

# Prepare target variable
target_col = 'Churn'
y = df_features[target_col].map({'No': 0, 'Yes': 1})

print(f"🎯 Target variable: {target_col}")
print(f"   Class distribution: {y.value_counts().to_dict()}")

# Remove original categorical columns and keep engineered features
columns_to_remove = [
    'customerID', 'Churn',  # ID and target
    'gender', 'Partner', 'Dependents',  # Original categorical (we have encoded versions)
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
    'tenure_category', 'charges_category'  # Category columns (we have dummies)
]

# Keep only columns that exist
columns_to_remove = [col for col in columns_to_remove if col in df_features.columns]

# Create feature matrix
X = df_features.drop(columns=columns_to_remove)

print(f"\n📊 Feature Matrix:")
print(f"   Shape: {X.shape}")
print(f"   Features: {X.shape[1]}")

# Show feature importance preview
print(f"\n📋 Engineered Features ({X.shape[1]} total):")
feature_groups = {
    'Original Numerical': ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen'],
    'Binary Encoded': [col for col in X.columns if col.endswith('_encoded')],
    'Service Features': [col for col in X.columns if '_binary' in col or '_available' in col],
    'Risk Scores': [col for col in X.columns if 'risk' in col or 'stability' in col],
    'Advanced Features': [col for col in X.columns if any(keyword in col for keyword in 
                         ['clv', 'adoption', 'composite', 'revenue', 'family', 'premium'])]
}

for group, features in feature_groups.items():
    existing_features = [f for f in features if f in X.columns]
    if existing_features:
        print(f"\n{group} ({len(existing_features)}):")
        for feature in existing_features[:5]:  # Show first 5
            print(f"   • {feature}")
        if len(existing_features) > 5:
            print(f"   ... and {len(existing_features) - 5} more")

# Check for any missing values in feature matrix
missing_in_features = X.isnull().sum().sum()
print(f"\n🔍 Missing values in feature matrix: {missing_in_features}")

if missing_in_features > 0:
    print("⚠️ Handling remaining missing values...")
    # Fill with median for numerical, mode for categorical
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['int64', 'float64']:
                X[col].fillna(X[col].median(), inplace=True)
            else:
                X[col].fillna(X[col].mode()[0], inplace=True)
    print("✅ Missing values handled")

print(f"\n✅ Feature engineering complete!")
print(f"📊 Ready for model training with {X.shape[1]} features")


🎯 FEATURE SELECTION & PREPARATION
🎯 Target variable: Churn
   Class distribution: {0: 5174, 1: 1869}

📊 Feature Matrix:
   Shape: (7043, 65)
   Features: 65

📋 Engineered Features (65 total):

Original Numerical (4):
   • tenure
   • MonthlyCharges
   • TotalCharges
   • SeniorCitizen

Binary Encoded (4):
   • Partner_encoded
   • Dependents_encoded
   • PhoneService_encoded
   • PaperlessBilling_encoded

Service Features (15):
   • OnlineSecurity_binary
   • OnlineSecurity_available
   • OnlineBackup_binary
   • OnlineBackup_available
   • DeviceProtection_binary
   ... and 10 more

Risk Scores (7):
   • InternetService_risk
   • Contract_stability
   • PaymentMethod_risk
   • Contract_stability_normalized
   • PaymentMethod_risk_normalized
   ... and 2 more

Advanced Features (8):
   • service_adoption_rate
   • premium_services
   • has_premium
   • composite_risk_score
   • estimated_clv
   ... and 3 more

🔍 Missing values in feature matrix: 0

✅ Feature engineering complete!
📊 Rea

In [7]:
# ============================================
# CELL 7: Save Processed Data
# ============================================

print("💾 SAVING PROCESSED DATA")
print("=" * 30)

# Create processed data directory
import os
os.makedirs('../data/processed', exist_ok=True)

# Save feature matrix and target
X.to_csv('../data/processed/features.csv', index=False)
y.to_csv('../data/processed/target.csv', index=False)

# Save feature names for later use
feature_names = X.columns.tolist()
with open('../data/processed/feature_names.txt', 'w') as f:
    for name in feature_names:
        f.write(f"{name}\n")

# Save a summary of the feature engineering process
summary = {
    'original_shape': df.shape,
    'final_shape': X.shape,
    'features_created': X.shape[1] - (df.shape[1] - 1),  # -1 for target
    'target_distribution': y.value_counts().to_dict()
}

import json
with open('../data/processed/feature_engineering_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"✅ Processed data saved:")
print(f"   📊 Features: ../data/processed/features.csv ({X.shape})")
print(f"   🎯 Target: ../data/processed/target.csv ({y.shape})")
print(f"   📋 Feature names: ../data/processed/feature_names.txt")
print(f"   📈 Summary: ../data/processed/feature_engineering_summary.json")

# Display final summary
print(f"\n🎉 FEATURE ENGINEERING COMPLETE!")
print(f"=" * 50)
print(f"📊 Original dataset: {df.shape[0]:,} customers, {df.shape[1]} features")
print(f"🛠️ Engineered dataset: {X.shape[0]:,} customers, {X.shape[1]} features")
print(f"⭐ Features created: {X.shape[1] - (df.shape[1] - 1)}")
print(f"🎯 Target classes: {dict(y.value_counts())}")
print(f"\n🚀 Ready for model training phase!")

💾 SAVING PROCESSED DATA
✅ Processed data saved:
   📊 Features: ../data/processed/features.csv ((7043, 65))
   🎯 Target: ../data/processed/target.csv ((7043,))
   📋 Feature names: ../data/processed/feature_names.txt
   📈 Summary: ../data/processed/feature_engineering_summary.json

🎉 FEATURE ENGINEERING COMPLETE!
📊 Original dataset: 7,043 customers, 21 features
🛠️ Engineered dataset: 7,043 customers, 65 features
⭐ Features created: 45
🎯 Target classes: {0: np.int64(5174), 1: np.int64(1869)}

🚀 Ready for model training phase!
