In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Step 1: Load the data
# The path is relative to the notebook's working directory.
df = pd.read_csv('./Customer - Telco Data.csv')

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()
print("\nDataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

# Step 2: Define column categories
# Columns mapped with binary_map below (values expected to be 'Yes' / 'No').
binary_columns = ['Churn', 'PhoneService', 'PaperlessBilling', 'Partner', 'Dependents']
# Columns handed to LabelEncoder in the label-encoding step.
other_categorical = ['MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'gender', 
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                    'TechSupport', 'StreamingTV', 'StreamingMovies']

# Step 3: Create a copy for processing
# Work on a copy so the raw frame `df` stays available for before/after prints.
df_encoded = df.copy()

# Step 4: Binary encoding (Yes/No columns)
binary_map = {'No': 0, 'Yes': 1}
print(f"\nApplying binary encoding to: {binary_columns}")
for col in binary_columns:
    if col in df_encoded.columns:
        encoded = df_encoded[col].map(binary_map)
        # Series.map() turns every value that is not a key of binary_map into
        # NaN without complaint. Fail fast instead of letting NaNs slip into
        # the features or the target; values that were already missing in the
        # raw data are left alone.
        unmapped = df_encoded.loc[encoded.isna() & df_encoded[col].notna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column '{col}' contains values outside {list(binary_map)}: {list(unmapped)}"
            )
        df_encoded[col] = encoded
        print(f"✅ {col}: {df[col].unique()} -> {df_encoded[col].unique()}")

# Step 5: Label encoding for other categorical columns
# One fitted LabelEncoder is kept per column, keyed by column name.
label_encoders = {}
print(f"\nApplying label encoding to: {other_categorical}")

# Only columns that actually exist in the frame are encoded.
present_categoricals = [name for name in other_categorical if name in df_encoded.columns]

for col in present_categoricals:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    label_encoders[col] = encoder

    # Print original value -> integer code, in order of first appearance
    # in the raw data.
    raw_values = df[col].unique()
    mapping = {raw: code for raw, code in zip(raw_values, encoder.transform(raw_values))}
    print(f"\n✅ {col} mapping:")
    for raw, code in mapping.items():
        print(f"   {raw} -> {code}")

print(f"\nTotal label encoders created: {len(label_encoders)}")

# Step 6: Handle numeric columns and missing values
print("\nProcessing numeric columns...")
numeric_columns = ['TotalCharges', 'MonthlyCharges', 'tenure']

# Coerce each column to a numeric dtype; unparseable entries become NaN.
for col in numeric_columns:
    df_encoded[col] = pd.to_numeric(df_encoded[col], errors='coerce')

# Report NaN counts before imputation
print("Missing values before filling:")
for col, missing in df_encoded[numeric_columns].isna().sum().items():
    print(f"  {col}: {missing} missing values")

# Replace NaNs with the mean of their own column
numeric_frame = df_encoded[numeric_columns]
df_encoded[numeric_columns] = numeric_frame.fillna(numeric_frame.mean())

# Report NaN counts after imputation
print("Missing values after filling:")
for col, missing in df_encoded[numeric_columns].isna().sum().items():
    print(f"  {col}: {missing} missing values")

# Step 7: Remove unnecessary columns
print(f"\nOriginal columns: {df_encoded.columns.tolist()}")
if 'customerID' in df_encoded.columns:
    df_encoded = df_encoded.drop(columns=['customerID'])
    print("✅ Removed customerID")

# Step 8: Separate features and target
y = df_encoded['Churn']
X = df_encoded.drop(columns=['Churn'])

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature columns: {X.columns.tolist()}")

# Step 9: Check for any remaining issues
print("\nData quality check:")
nan_total = X.isna().sum().sum()
print(f"Features - NaN values: {nan_total}")
# The infinity check only looks at the numeric-dtype feature columns.
numeric_features = X.select_dtypes(include=[np.number])
has_infinite = np.any(np.isinf(numeric_features))
print(f"Features - Infinite values: {has_infinite}")
target_counts = y.value_counts()
print(f"Target distribution:\n{target_counts}")

# Step 10: Split the data (70% train, then the remaining 30% halved below)
# The split is done BEFORE the scaler is fitted so that validation/test rows
# never contribute to the scaling statistics (data leakage). Without
# stratification the row assignment depends only on the number of rows and
# random_state, so this is the same partition as splitting the scaled matrix.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 11: Scale the features
print("\nApplying StandardScaler...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # mean/std learned from training rows only
X_temp = scaler.transform(X_temp_raw)        # held-out rows reuse the training statistics
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Whole feature matrix transformed with the training statistics; kept because
# later code reads X_scaled.shape.
X_scaled = scaler.transform(X)

print(f"Scaling completed:")
print(f"  Original shape: {X.shape}")
print(f"  Scaled shape: {X_scaled.shape}")
# Reported on the training split, which is where mean ~0 / std ~1 holds.
print(f"  Scaled mean (should be ~0): {np.mean(X_train, axis=0)[:5]}...")
print(f"  Scaled std (should be ~1): {np.std(X_train, axis=0)[:5]}...")

print(f"\nData split:")
print(f"  Training set: {X_train.shape}")
print(f"  Validation set: {X_val.shape}")
print(f"  Test set: {X_test.shape}")

# Step 12: Save all preprocessors
# Three joblib files are written to the current working directory.
print("\n" + "="*50)
print("SAVING PREPROCESSORS")
print("="*50)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')
print("✅ Scaler saved as 'scaler.pkl'")

# Save the label encoders
joblib.dump(label_encoders, 'label_encoders.pkl')
print("✅ Label encoders saved as 'label_encoders.pkl'")

# Save comprehensive preprocessing information
preprocessing_info = {
    'binary_columns': binary_columns,
    'other_categorical': other_categorical,
    'binary_map': binary_map,
    'feature_columns': list(X.columns),
    'input_shape': X_scaled.shape[1],
    'target_column': 'Churn',
    'numeric_columns': numeric_columns,
    'expected_feature_order': list(X.columns),
    # Values for imputing missing numeric fields in new data. Step 6 filled
    # NaNs with the column means, and mean-filling leaves a column's mean
    # unchanged, so recomputing the means here recovers those fill values
    # (up to floating-point rounding).
    'numeric_fill_values': df_encoded[numeric_columns].mean().to_dict(),
}

joblib.dump(preprocessing_info, 'preprocessing_info.pkl')
print("✅ Preprocessing info saved as 'preprocessing_info.pkl'")

# Step 13: Verify saved files
# Round-trip check: reload the three joblib files by name and print a few
# facts about each loaded object.
print("\nVerifying saved files...")
try:
    loaded_scaler = joblib.load('scaler.pkl')
    loaded_encoders = joblib.load('label_encoders.pkl')
    loaded_info = joblib.load('preprocessing_info.pkl')
    
    print("✅ All files loaded successfully!")
    print(f"✅ Scaler type: {type(loaded_scaler)}")
    print(f"✅ Number of label encoders: {len(loaded_encoders)}")
    print(f"✅ Input features: {loaded_info['input_shape']}")
    print(f"✅ Feature columns: {len(loaded_info['feature_columns'])}")
    
except Exception as e:
    # Any failure inside the try block (load error or a missing key) is only
    # reported; it is not re-raised, so execution continues afterwards.
    print(f"❌ Error loading files: {e}")

# Step 14: Display final summary
banner = "=" * 50
print("\n" + banner)
print("PREPROCESSING SUMMARY")
print(banner)

# Raw frame dimensions versus the width of the scaled feature matrix
n_rows, n_original_cols = df.shape
print(f"📊 Dataset: {n_rows} rows, {n_original_cols} original columns")
n_features = X_scaled.shape[1]
print(f"🔢 Final features: {n_features} columns")

# One bullet per artifact: (file name, short description)
print("📁 Files saved:")
saved_files = [
    ("scaler.pkl", "StandardScaler"),
    ("label_encoders.pkl", f"{len(label_encoders)} LabelEncoders"),
    ("preprocessing_info.pkl", "metadata"),
]
for file_name, description in saved_files:
    print(f"   • {file_name} ({description})")

class_counts = y.value_counts().to_dict()
print(f"🎯 Target: {class_counts}")
print("✨ Data ready for model training!")

# Step 15: Create a preview function for new data preprocessing
def preprocess_new_data_preview(sample_data):
    """
    Show how new data would be preprocessed.

    Prints the feature columns, in order, of the notebook-level feature frame
    ``X`` and the input width taken from ``X_scaled``. Both names are read
    from the enclosing module scope, so they must exist before the call.

    Args:
        sample_data: Optional new data to check. An object with a ``columns``
            attribute (e.g. a pandas DataFrame) or any iterable of column
            names (e.g. a dict of field -> value). Its columns are compared
            with the expected feature columns, and missing or unexpected
            ones are reported. Pass ``None`` to print only the expected
            schema.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n" + "="*40)
    print("NEW DATA PREPROCESSING PREVIEW")
    print("="*40)
    print("Your new data should have these columns:")
    expected_columns = list(X.columns)
    for i, col in enumerate(expected_columns, 1):
        print(f"{i:2d}. {col}")
    
    print(f"\nExpected input shape for model: (batch_size, {X_scaled.shape[1]})")
    print("Data should be scaled using the saved scaler.")

    # No sample supplied: the schema printed above is all there is to show.
    if sample_data is None:
        return

    # Compare the sample's columns with the expected ones. Identifier or
    # target columns of raw data show up under "unexpected".
    if hasattr(sample_data, "columns"):
        provided_columns = list(sample_data.columns)
    else:
        provided_columns = list(sample_data)
    missing = [c for c in expected_columns if c not in provided_columns]
    unexpected = [c for c in provided_columns if c not in expected_columns]

    print("\nSample data check:")
    print(f"  Missing columns: {missing if missing else 'none'}")
    print(f"  Unexpected columns: {unexpected if unexpected else 'none'}")

# Run the preview once without sample data to print the expected input schema
preprocess_new_data_preview(None)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-n