# Imports and Configuration
### Import the libraries used by the churn prediction pipeline, configure plotting, and set the random seed.


In [2]:
"""
COMPREHENSIVE IMPORT STATEMENTS FOR CHURN PREDICTION PIPELINE
Organized by functionality and usage category
"""

# ============================================================================
# 1. SCIKIT-LEARN CORE MODULES
# ============================================================================
from sklearn.model_selection import (
    StratifiedKFold,        # For stratified cross-validation
    GridSearchCV,           # Exhaustive hyperparameter search
    RandomizedSearchCV,     # Random hyperparameter search
    train_test_split        # Train/test splitting
)

from sklearn.feature_selection import (
    mutual_info_classif     # Mutual information for feature selection
)

from sklearn.metrics import (
    # Core classification metrics
    fbeta_score,           # F-beta score (F2 for recall focus)
    f1_score,              # F1 score (harmonic mean)
    make_scorer,           # Custom scoring functions
    recall_score,          # Sensitivity/true positive rate
    precision_score,       # Positive predictive value
    roc_auc_score,         # Area under ROC curve
    accuracy_score,        # Overall accuracy
    confusion_matrix       # TP, FP, TN, FN matrix
)

from sklearn.preprocessing import (
    StandardScaler         # Feature standardization (z-score)
)

from sklearn.linear_model import (
    LogisticRegression     # Primary linear model
)

from sklearn.inspection import (
    permutation_importance # Feature importance via permutation
)

from sklearn.ensemble import (
    RandomForestClassifier # Ensemble tree model for comparison
)

from sklearn.tree import (
    _tree                  # Internal tree utilities
)

# ============================================================================
# 2. SCIENTIFIC COMPUTING & STATISTICS
# ============================================================================
from scipy import stats
from scipy.stats import randint  # Random integer distributions

import numpy as np               # Numerical computing
import pandas as pd              # Data manipulation

# ============================================================================
# 3. VISUALIZATION
# ============================================================================
import matplotlib.pyplot as plt  # Plotting and visualization
import seaborn as sns            # Statistical visualization

# ============================================================================
# 4. INTERPRETABLE ML (OPTIONAL - INSTALL IF MISSING)
# ============================================================================
# Optional dependency: the explainable boosting model is only usable when the
# `interpret` package is importable. Downstream code should check
# INTERPRET_AVAILABLE (or test the class name against None) before using it.
try:
    from interpret.glassbox import ExplainableBoostingClassifier
except ImportError:
    # Bind the placeholder and the flag so later references do not raise NameError.
    ExplainableBoostingClassifier = None
    INTERPRET_AVAILABLE = False
    print("üì¶ interpret library not found. Install for explainable models...")
    print("   Run: pip install interpret")
else:
    INTERPRET_AVAILABLE = True

# ============================================================================
# 5. UTILITIES & SYSTEM
# ============================================================================
import time          # Timing and profiling
import warnings      # Warning control
import sys           # System utilities
import subprocess    # Subprocess management

# ============================================================================
# 6. CONFIGURATION
# ============================================================================
# Suppress warnings for cleaner output
# NOTE(review): this silences every warning category for the whole kernel,
# including deprecation and convergence warnings from the imported libraries.
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Set global random seed for reproducibility.
# This runs BEFORE the status report so the "Random seed set" line below is
# true when it is printed, and the value lives in one named constant instead of
# being repeated as a literal.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ============================================================================
# 7. PRINT IMPORT STATUS
# ============================================================================
print("‚úÖ IMPORTS SUCCESSFUL - CHURN PREDICTION PIPELINE READY")
print("   ‚Ä¢ Scikit-learn modules: Complete")
print(f"   ‚Ä¢ Scientific computing: NumPy {np.__version__}, Pandas {pd.__version__}")
print(f"   ‚Ä¢ Visualization: Matplotlib {plt.matplotlib.__version__}, Seaborn {sns.__version__}")
print(f"   ‚Ä¢ Interpretable ML: {'Available' if INTERPRET_AVAILABLE else 'Not installed'}")
print(f"   ‚Ä¢ Random seed set: {RANDOM_SEED} (for reproducibility)")
print("-" * 60)



üì¶ interpret library not found. Install for explainable models...
   Run: pip install interpret
‚úÖ IMPORTS SUCCESSFUL - CHURN PREDICTION PIPELINE READY
   ‚Ä¢ Scikit-learn modules: Complete
   ‚Ä¢ Scientific computing: NumPy 2.0.2, Pandas 2.2.2
   ‚Ä¢ Visualization: Matplotlib 3.10.0, Seaborn 0.13.2
   ‚Ä¢ Interpretable ML: Not installed
   ‚Ä¢ Random seed set: 42 (for reproducibility)
------------------------------------------------------------


# Dataset Loading
### Load the telecom customer churn dataset from local storage.


In [3]:
# Load dataset
# Path of the raw CSV; kept in one constant so it is easy to repoint.
DATA_PATH = '/content/sample_data/telecom_customer_churn.csv'

print("\nüìÇ Loading telecom dataset...\n")
try:
    # Only the read itself is guarded. A failure in the diagnostics below is
    # therefore not misreported as a load error, and a successfully loaded
    # DataFrame is no longer thrown away because the report crashed.
    data = pd.read_csv(DATA_PATH)
except Exception as e:
    print(f"‚úó Error loading dataset: {e}")
    print(f"Error details: {type(e).__name__}")
    # Later cells must check for None before using `data`.
    data = None
else:
    print(f"‚úì Dataset loaded: {data.shape[0]} rows, {data.shape[1]} columns")

    # CHECK FOR NAN VALUES IN ALL COLUMNS
    print("\n" + "="*80)
    print("üîç CHECKING NaN/EMPTY VALUES IN ALL COLUMNS")
    print("="*80)

    total_nan = 0
    total_empty = 0

    row_format = "{:30} {:12} {:15} {:10}"
    print("\n" + row_format.format(
        "COLUMN NAME", "DATA TYPE", "NaN COUNT", "EMPTY COUNT"))
    print("-" * 70)

    for col in data.columns:
        # Count NaN values
        is_nan = data[col].isnull()
        nan_count = is_nan.sum()
        total_nan += nan_count

        # Count empty / whitespace-only strings (string columns only).
        # The mask is computed once and reused for both the count and the
        # sample indices below.
        if data[col].dtype == 'object':
            is_blank = data[col].astype(str).str.strip() == ''
            empty_count = is_blank.sum()
            total_empty += empty_count
        else:
            is_blank = None
            empty_count = 0

        # Only columns with at least one NaN or blank value are listed
        if nan_count > 0 or empty_count > 0:
            print(row_format.format(
                col,
                str(data[col].dtype),
                str(nan_count),
                str(empty_count)
            ))

            # Show sample row indices for problematic columns
            if nan_count > 0:
                print("     Sample of NaN rows indices:",
                      data[is_nan].index[:5].tolist())
            if empty_count > 0:
                empty_rows = data[is_blank].index[:3]
                if len(empty_rows) > 0:
                    print("     First 3 empty string rows indices:", empty_rows.tolist())

    print("\n" + "="*80)
    print("üìä SUMMARY:")
    print(f"Total NaN values in dataset: {total_nan}")
    print(f"Total empty strings in dataset: {total_empty}")
    # Counts NaN only; blank strings are not included in this figure.
    print(f"Total rows with ANY missing data: {data.isnull().any(axis=1).sum()}")
    print("="*80)

    # Optional: Show first few rows for context
    print("\nüìã FIRST 5 ROWS OF DATA:")
    print(data.head())

    # Optional: Show data types summary
    print("\nüìä DATA TYPES SUMMARY:")
    print(data.dtypes.value_counts())


üìÇ Loading telecom dataset...

‚úì Dataset loaded: 7043 rows, 37 columns

üîç CHECKING NaN/EMPTY VALUES IN ALL COLUMNS

COLUMN NAME                    DATA TYPE    NaN COUNT       EMPTY COUNT
----------------------------------------------------------------------
Offer                          object       3877            0         
     Sample of NaN rows indices: [0, 1, 4, 9, 10]
AvgMonthlyLongDistanceCharges  float64      682             0         
     Sample of NaN rows indices: [10, 14, 16, 19, 25]
MultipleLines                  object       682             0         
     Sample of NaN rows indices: [10, 14, 16, 19, 25]
InternetType                   object       1526            0         
     Sample of NaN rows indices: [20, 23, 24, 27, 28]
AvgMonthlyGBDownload           float64      1526            0         
     Sample of NaN rows indices: [20, 23, 24, 27, 28]
OnlineSecurity                 object       1526            0         
     Sample of NaN rows indices: [20, 23,

# Data Preprocessing Notebook

## Overview
This notebook handles the first stage of our 4-stage cascaded architecture for churn prediction.

### Main Objectives:
- Data cleaning and preprocessing: categorical columns are encoded as integers and missing values are filled with 0 (the "none" / baseline category)
- Comprehensive exploratory data analysis (EDA)
- Churn distribution analysis across all features
- Statistical significance testing

### Output:
- Preprocessed dataset saved as `churn_data_preprocessed.csv`
- Ready for downstream analysis in other notebooks

In [4]:
# =============================================================================
# DATA PREPROCESSING
# =============================================================================
# This cell handles the first stage of our 4-stage cascaded architecture:
# - Data cleaning and preprocessing (integer encoding; missing values filled with 0)
# - Comprehensive exploratory data analysis (EDA)
# - Churn distribution analysis across all features
# - Statistical significance testing
# =============================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set visualization style
# NOTE(review): per the matplotlib docs the 'default' style restores the default
# rcParams, so this presumably undoes the whitegrid style, figure.figsize and
# font.size configured in the imports cell — confirm that this reset is intended.
plt.style.use('default')
# Use seaborn's "husl" colour palette for subsequent plots.
sns.set_palette("husl")

# =============================================================================
# SECTION 1: DATA PREPROCESSING FUNCTION
# =============================================================================
# This function encodes categorical columns as integers and fills missing values.
# Key strategy: missing or unrecognised values become 0, i.e. the baseline category
# (e.g. 'No Offer', 'No Internet'); no mode or median imputation is performed.
# =============================================================================

def _encode_categorical(series, mapping):
    """Encode one categorical column as integers.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.
    mapping : dict
        Label -> integer code.

    Returns
    -------
    tuple
        ``(encoded, n_missing, n_unrecognized)`` where ``encoded`` is an integer
        Series, ``n_missing`` is the number of NaN inputs and ``n_unrecognized``
        is the number of non-null inputs that are not keys of ``mapping``.
        Both kinds of value are encoded as 0.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already encoded (e.g. the cell was re-run on processed data). Mapping
        # string labels over numbers would turn every value into NaN and then 0,
        # so numeric input is passed through unchanged instead.
        numeric = pd.to_numeric(series, errors='coerce')
        n_missing = int(numeric.isnull().sum())
        return numeric.fillna(0).astype(int), n_missing, 0

    n_missing = int(series.isnull().sum())
    mapped = series.map(mapping)
    # Everything that is NaN after mapping but was not NaN before is a label
    # the mapping does not know about.
    n_unrecognized = int(mapped.isnull().sum()) - n_missing
    return mapped.fillna(0).astype(int), n_missing, n_unrecognized


def preprocessing(df):
    """
    Encode the churn dataset as numeric columns and fill missing values.

    The input frame is copied, never modified. Processing steps, in order:

    1. ``Churn``: 'Churned' -> 1, 'Stayed'/'Joined' -> 0 (numeric input is cast
       to int as-is).
    2. Binary Yes/No columns -> 1/0.
    3. Multi-class columns (Gender, Contract, Offer, InternetType,
       PaymentMethod) -> integer codes.
    4. Numeric columns -> coerced with ``pd.to_numeric``.
    5. Identifier / free-text columns are dropped.
    6. Any remaining NaN is filled with 0 and integer-valued float columns are
       cast to int.

    Missing and unrecognised values are always replaced by 0 (the baseline
    category of each mapping). No mode/median imputation is performed, so
    distributions of columns with many missing values are shifted towards 0.
    Every replacement is recorded and printed in the imputation summary.

    Calling the function on its own output returns an equal frame: columns that
    are already numeric are passed through rather than re-mapped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw (or already preprocessed) customer data. Columns that are absent
        are skipped.

    Returns
    -------
    pandas.DataFrame
        Processed copy of ``df``.
    """

    df_processed = df.copy()
    imputation_log = []

    def encode_column(col, mapping, fallback_label):
        # Encode `col` in place and record every value that was forced to 0.
        encoded, n_missing, n_unrecognized = _encode_categorical(
            df_processed[col], mapping)
        df_processed[col] = encoded
        if n_missing:
            imputation_log.append(
                f"{col}: {n_missing} missing filled with 0 ({fallback_label})")
        if n_unrecognized:
            imputation_log.append(
                f"{col}: {n_unrecognized} unrecognized values set to 0 ({fallback_label})")

    # -------------------------------------------------------------------------
    # 1. TARGET VARIABLE: Churn
    # -------------------------------------------------------------------------
    print("üìù Processing target variable: Churn")

    if 'Churn' in df_processed.columns:
        # Check if Churn is already numeric (any numeric dtype, not only 64-bit)
        if pd.api.types.is_numeric_dtype(df_processed['Churn']):
            print("   ‚úì Churn is already numeric")
            df_processed['Churn'] = df_processed['Churn'].astype(int)
        else:
            churn_map = {'Churned': 1, 'Stayed': 0, 'Joined': 0}
            # Missing/unknown labels become 0; this is now logged because it
            # silently relabels those rows as "not churned".
            encode_column('Churn', churn_map, 'not churned')
            print("   ‚úì Churn mapped from strings to numeric")

        print(f"   Churn unique values: {sorted(df_processed['Churn'].unique())}")

    # -------------------------------------------------------------------------
    # 2. BINARY CATEGORICAL VARIABLES - SIMPLIFIED
    # -------------------------------------------------------------------------
    print("\nüìù Processing binary categorical variables")

    binary_columns = [
        'Married', 'PhoneService', 'MultipleLines', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtectionPlan', 'PremiumTechSupport',
        'StreamingTV', 'StreamingMovies', 'StreamingMusic', 'UnlimitedData',
        'PaperlessBilling', 'InternetService'
    ]

    for col in binary_columns:
        if col not in df_processed.columns:
            continue

        # Map only when the column actually holds Yes/No labels; otherwise it
        # is treated as already numeric and just coerced below.
        if df_processed[col].isin(['Yes', 'No']).any():
            df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0})
            print(f"   ‚úì {col}: Yes/No mapped to 1/0")
        else:
            print(f"   ‚ö†Ô∏è {col}: Not a Yes/No column")

        # Convert to numeric and fill NaN with 0
        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

        nan_count = df_processed[col].isnull().sum()
        if nan_count > 0:
            df_processed[col] = df_processed[col].fillna(0)
            imputation_log.append(f"{col}: {nan_count} filled with 0")

        df_processed[col] = df_processed[col].astype(int)
        print(f"   ‚úì {col}: Final values {sorted(df_processed[col].unique())}")

    # -------------------------------------------------------------------------
    # 3. MULTI-CLASS CATEGORICAL VARIABLES
    # -------------------------------------------------------------------------
    print("\nüìù Processing multi-class categorical variables")

    # Gender: Female=0, Male=1
    if 'Gender' in df_processed.columns:
        encode_column('Gender', {'Female': 0, 'Male': 1}, 'Female')
        print(f"   ‚úì Gender: Female=0, Male=1, values: {sorted(df_processed['Gender'].unique())}")

    # Contract: Month-to-Month=0, One Year=1, Two Year=2
    if 'Contract' in df_processed.columns:
        print(f"   Contract sample values: {df_processed['Contract'].head(5).tolist()}")

        contract_map = {
            'Month-to-Month': 0,
            'One Year': 1,
            'Two Year': 2
        }
        # NaN and empty strings end up as 0 (Month-to-Month)
        encode_column('Contract', contract_map, 'Month-to-Month')
        print("   ‚úì Contract: Mapped Month-to-Month=0, One Year=1, Two Year=2")
        print(f"   Contract final values: {sorted(df_processed['Contract'].unique())}")

    # Offer: NaN means no offer -> 0
    if 'Offer' in df_processed.columns:
        offer_map = {
            'No Offer': 0, 'None': 0,
            'Offer A': 1, 'Offer B': 2, 'Offer C': 3,
            'Offer D': 4, 'Offer E': 5
        }
        encode_column('Offer', offer_map, 'No Offer')
        print(f"   ‚úì Offer: Mapped with {df_processed['Offer'].nunique()} categories")

    # InternetType: NaN means no internet -> 0
    if 'InternetType' in df_processed.columns:
        internet_map = {
            'No Internet': 0, 'DSL': 1, 'Cable': 2, 'Fiber Optic': 3
        }
        encode_column('InternetType', internet_map, 'No Internet')
        print(f"   ‚úì InternetType: Mapped with {df_processed['InternetType'].nunique()} categories")

    # PaymentMethod
    if 'PaymentMethod' in df_processed.columns:
        payment_map = {
            'Bank Withdrawal': 0,
            'Credit Card': 1,
            'Mailed Check': 2
        }
        encode_column('PaymentMethod', payment_map, 'Bank Withdrawal')
        print(f"   ‚úì PaymentMethod: Mapped with {df_processed['PaymentMethod'].nunique()} categories")

    # -------------------------------------------------------------------------
    # 4. NUMERIC VARIABLES - SIMPLIFIED
    # -------------------------------------------------------------------------
    print("\nüìù Processing numeric variables")

    numeric_columns = [
        'Age', 'NumberofDependents', 'Population', 'NumberofReferrals',
        'TenureinMonths', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload',
        'MonthlyCharge', 'TotalCharges', 'TotalRefunds', 'TotalExtraDataCharges',
        'TotalLongDistanceCharges', 'TotalRevenue'
    ]

    for col in numeric_columns:
        if col not in df_processed.columns:
            continue

        # Convert to numeric (unparseable entries become NaN), fill NaN with 0
        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

        nan_count = df_processed[col].isnull().sum()
        if nan_count > 0:
            df_processed[col] = df_processed[col].fillna(0)
            imputation_log.append(f"{col}: {nan_count} filled with 0")

        print(f"   ‚úì {col}: Processed as numeric")

    # -------------------------------------------------------------------------
    # 5. TEXT VARIABLES - DROP UNNECESSARY COLUMNS
    # -------------------------------------------------------------------------
    print("\nüìù Processing text variables")

    text_columns = ['CustomerID', 'City', 'ZipCode', 'ChurnCategory', 'ChurnReason']

    for col in text_columns:
        if col in df_processed.columns:
            df_processed = df_processed.drop(col, axis=1)
            print(f"   ‚úì Dropped {col}")

    # -------------------------------------------------------------------------
    # 6. FINAL CLEANUP
    # -------------------------------------------------------------------------
    print("\nüìù Final cleanup: checking for remaining missing values")

    # Fill any remaining NaN (columns not handled above) with 0
    for col in df_processed.columns:
        nan_count = df_processed[col].isnull().sum()
        if nan_count > 0:
            df_processed[col] = df_processed[col].fillna(0)
            imputation_log.append(f"{col}: {nan_count} remaining filled with 0")

    # Convert float columns to int where every value is a whole number.
    # `% 1 == 0` is False for inf/NaN, so such columns are left as float.
    for col in df_processed.columns:
        if df_processed[col].dtype == 'float64':
            if (df_processed[col] % 1 == 0).all():
                df_processed[col] = df_processed[col].astype(int)

    # -------------------------------------------------------------------------
    # IMPUTATION SUMMARY
    # -------------------------------------------------------------------------
    if imputation_log:
        print("\nüìä IMPUTATION SUMMARY:")
        for log_entry in imputation_log:
            print(f"   ‚Ä¢ {log_entry}")
    else:
        print("\n‚úÖ No imputation needed - dataset was clean!")

    return df_processed

# =============================================================================
# APPLY PREPROCESSING
# =============================================================================

# The load cell sets `data = None` when reading the CSV fails. Stop here with an
# actionable message instead of failing inside preprocessing() on None.copy().
if data is None:
    raise RuntimeError(
        "Dataset is not loaded (data is None). Fix the dataset loading cell "
        "and re-run it before preprocessing."
    )

print("=" * 80)
print("üîÑ STAGE 1: DATA PREPROCESSING")
print("=" * 80)

# Apply preprocessing function.
# NOTE: this rebinds `data` from the raw frame to the processed frame, so after
# this cell the raw data is only available by re-running the load cell.
data = preprocessing(data)

# Save the preprocessed data for use in other notebooks
data.to_csv('churn_data_preprocessed.csv', index=False)
print("‚úÖ Preprocessed data saved to 'churn_data_preprocessed.csv'")

n_samples, n_columns = data.shape
print("\n‚úÖ PREPROCESSING COMPLETE!")
print(f"   Final Shape: {data.shape}")
print(f"   Features: {n_columns}")
print(f"   Samples: {n_samples}")
print(f"   Overall Churn Rate: {data['Churn'].mean():.2%}")
print(f"   Missing Values: {data.isnull().sum().sum()}")

üîÑ STAGE 1: DATA PREPROCESSING
üìù Processing target variable: Churn
   ‚úì Churn mapped from strings to numeric
   Churn unique values: [np.int64(0), np.int64(1)]

üìù Processing binary categorical variables
   ‚úì Married: Yes/No mapped to 1/0
   ‚úì Married: Final values [np.int64(0), np.int64(1)]
   ‚úì PhoneService: Yes/No mapped to 1/0
   ‚úì PhoneService: Final values [np.int64(0), np.int64(1)]
   ‚úì MultipleLines: Yes/No mapped to 1/0
   ‚úì MultipleLines: Final values [np.int64(0), np.int64(1)]
   ‚úì OnlineSecurity: Yes/No mapped to 1/0
   ‚úì OnlineSecurity: Final values [np.int64(0), np.int64(1)]
   ‚úì OnlineBackup: Yes/No mapped to 1/0
   ‚úì OnlineBackup: Final values [np.int64(0), np.int64(1)]
   ‚úì DeviceProtectionPlan: Yes/No mapped to 1/0
   ‚úì DeviceProtectionPlan: Final values [np.int64(0), np.int64(1)]
   ‚úì PremiumTechSupport: Yes/No mapped to 1/0
   ‚úì PremiumTechSupport: Final values [np.int64(0), np.int64(1)]
   ‚úì StreamingTV: Yes/No mapped to 1/0
 

*All data preprocessed and ready for churn analysis and downstream model stages.*