#Load Dataset
The first step involves importing the necessary libraries and loading the raw dataset into a pandas DataFrame. This process reads the data from a CSV file located at a specified path and prepares it for analysis.

In [12]:
import pandas as pd
import numpy as np

# Load dataset
file_path = '/content/sample_data/preprocessed_telco_churn.csv'
df = pd.read_csv(file_path)

# Preprocessing the Dataset
The code drops the customerID column, converts TotalCharges to numeric filling missing values with the median, maps binary 'Yes'/'No' columns to integers, and one-hot encodes multi-class categorical columns while dropping the first category. Boolean columns are converted to integers to ensure consistent numeric data.



In [13]:
# Combine all "No internet service" columns into one feature
no_internet_cols = [
    'InternetService_No',
    'StreamingTV_No internet service',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'StreamingMovies_No internet service',
    'TechSupport_No internet service'
]
df['NoInternetServiceFlag'] = df[no_internet_cols].max(axis=1)
df.drop(columns=no_internet_cols, inplace=True)

# Print correlation with Churn
corr_list = df.corr()['Churn'].sort_values(ascending=False)
print(corr_list)


Churn                                    1.000000
InternetService_Fiber optic              0.308020
PaymentMethod_Electronic check           0.301919
MonthlyCharges                           0.193356
PaperlessBilling_Yes                     0.191825
SeniorCitizen                            0.150889
StreamingTV_Yes                          0.063228
StreamingMovies_Yes                      0.061382
MultipleLines_Yes                        0.040102
PhoneService_Yes                         0.011942
gender                                  -0.008612
MultipleLines_No phone service          -0.011942
DeviceProtection_Yes                    -0.066160
OnlineBackup_Yes                        -0.082255
PaymentMethod_Mailed check              -0.091683
PaymentMethod_Credit card (automatic)   -0.134302
Partner_Yes                             -0.150448
Dependents_Yes                          -0.164221
TechSupport_Yes                         -0.164674
OnlineSecurity_Yes                      -0.171226


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, roc_curve
from sklearn.datasets import make_classification
import warnings
warnings.filterwarnings('ignore')

# ===========================
# 1. FEATURE ENGINEERING
# ===========================
def engineer_churn_features(df):
    df_new = df.copy()
    # Lifecycle & value
    df_new['CustomerLifecycleStage'] = pd.cut(df_new['tenure'], bins=[0,6,12,24,72],
                                              labels=['New','Growing','Mature','Loyal'])
    df_new['CustomerValue'] = pd.qcut(df_new['TotalCharges'], q=4,
                                      labels=['Low','Medium','High','Premium'])
    # Service complexity
    service_cols = [c for c in ['PhoneService_Yes','StreamingTV_Yes','StreamingMovies_Yes',
                                'OnlineSecurity_Yes','OnlineBackup_Yes','DeviceProtection_Yes','TechSupport_Yes']
                    if c in df_new.columns]
    df_new['ServiceComplexity'] = df_new[service_cols].sum(axis=1)
    # Interaction / high-risk features
    df_new['FiberElectronicPay'] = df_new.get('InternetService_Fiber optic',0) * df_new.get('PaymentMethod_Electronic check',0)
    df_new['SeniorFiberCombo'] = df_new.get('SeniorCitizen',0) * df_new.get('InternetService_Fiber optic',0)
    df_new['PaperlessHighCharges'] = df_new.get('PaperlessBilling_Yes',0) * (df_new['MonthlyCharges'] > df_new['MonthlyCharges'].quantile(0.75)).astype(int)
    df_new['LoyalSecureCustomer'] = df_new.get('Contract_Two year',0) * df_new.get('OnlineSecurity_Yes',0)
    df_new['FamilyStability'] = df_new.get('Partner_Yes',0) * df_new.get('Dependents_Yes',0)
    # Safe ratio features
    df_new['MonthlyCharges_safe'] = df_new['MonthlyCharges'].replace(0, df_new['MonthlyCharges'].median())
    df_new['TotalCharges_safe'] = df_new['TotalCharges'].replace(0, df_new['TotalCharges'].median())
    df_new['tenure_safe'] = df_new['tenure'].replace(0,1)
    df_new['AvgMonthlyRevenue'] = df_new['TotalCharges_safe'] / df_new['tenure_safe']
    df_new['RevenuePerService'] = df_new['MonthlyCharges_safe'] / (df_new['ServiceComplexity']+1)
    df_new['MonthlyToTotalRatio'] = df_new['MonthlyCharges_safe'] / df_new['TotalCharges_safe']
    df_new['ChargesPerTenureMonth'] = df_new['TotalCharges_safe'] / df_new['tenure_safe']
    df_new['TenureToChargesRatio'] = df_new['tenure_safe'] / (df_new['MonthlyCharges_safe']/100)
    df_new['ChargeEvolution'] = df_new['MonthlyCharges_safe'] - df_new['AvgMonthlyRevenue']
    # Polynomial & binned features
    df_new['MonthlyCharges_squared'] = df_new['MonthlyCharges'] ** 2
    df_new['tenure_squared'] = df_new['tenure'] ** 2
    df_new['TotalCharges_log'] = np.log1p(df_new['TotalCharges_safe'])
    df_new['MonthlyCharges_binned'] = pd.cut(df_new['MonthlyCharges'], bins=5, labels=False)
    df_new['tenure_binned'] = pd.cut(df_new['tenure'], bins=[0,12,24,48,72], labels=False)
    # Risk scoring
    risk_cols = [c for c in ['InternetService_Fiber optic','PaymentMethod_Electronic check','PaperlessBilling_Yes','SeniorCitizen'] if c in df_new.columns]
    retention_cols = [c for c in ['Contract_Two year','OnlineSecurity_Yes','TechSupport_Yes'] if c in df_new.columns]
    df_new['ChurnRiskScore'] = df_new[risk_cols].sum(axis=1) if risk_cols else 0
    df_new['RetentionScore'] = df_new[retention_cols].sum(axis=1) if retention_cols else 0
    df_new['NetRiskScore'] = df_new['ChurnRiskScore'] - df_new['RetentionScore']
    # Temporal features
    df_new['EarlyTenure'] = (df_new['tenure'] <= 12).astype(int)
    df_new['LongTenure'] = (df_new['tenure'] >= 48).astype(int)
    # Drop helpers
    df_new.drop(['MonthlyCharges_safe','TotalCharges_safe','tenure_safe'], axis=1, inplace=True)
    return df_new

# ===========================
# 2. FEATURE SELECTION
# ===========================
def select_best_features(X, y, k=20, method='mutual_info'):
    if method=='f_classif':
        selector = SelectKBest(f_classif, k=k)
    else:
        selector = SelectKBest(mutual_info_classif, k=k)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'score': selector.scores_,
        'selected': selector.get_support()
    }).sort_values('score', ascending=False)
    return X_selected, selected_features, feature_importance

# ===========================
# 3. FEATURE CORRELATION
# ===========================
def print_feature_correlations(df, target_col='Churn'):
    numeric_cols = df.select_dtypes(include='number').columns.drop(target_col, errors='ignore')
    corr = df[numeric_cols].corrwith(pd.to_numeric(df[target_col], errors='coerce')).sort_values(ascending=False)
    print(f"Feature correlations with target '{target_col}':")
    for f,v in corr.items():
        print(f"{f}: {v:.6f}")

# ===========================
# 4. VALIDATION
# ===========================
def validate_feature_engineering(df, target_col='Churn'):
    le = LabelEncoder()
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = le.fit_transform(X[col].astype(str))
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    scores = cross_val_score(rf, X, y, cv=5, scoring='roc_auc')
    print(f"Mean AUC: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
    return scores

# ===========================
# 5. FULL PROCESS PIPELINE
# ===========================
def process_churn_data(df, target_col='Churn', k=20):
    df_eng = engineer_churn_features(df)
    X = df_eng.drop(columns=[target_col]) if target_col in df_eng.columns else df_eng
    y = df_eng[target_col] if target_col in df_eng.columns else None
    numeric_cols = X.select_dtypes(include='number').columns
    X_numeric = X[numeric_cols]
    X_sel, sel_features, feat_imp = select_best_features(X_numeric, y, k=k)
    print("Top 10 features by importance:")
    print(feat_imp.head(10))
    print_feature_correlations(df_eng, target_col)
    return X_sel, sel_features, feat_imp

# ===========================
# 6. BASELINE + ADVANCED MODELS
# ===========================
def run_model_comparison(df, target_col='Churn'):
    # Label encode non-numeric
    le = LabelEncoder()
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = le.fit_transform(X[col].astype(str))

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Baseline models
    baseline_models = {
        'Lasso Logistic': Pipeline([('scaler', StandardScaler()),
                                    ('classifier', LogisticRegression(penalty='l1', solver='liblinear', random_state=42))]),
        'Neural Network': Pipeline([('scaler', StandardScaler()),
                                    ('classifier', MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, random_state=42))])
    }

    baseline_results = {}
    for name, model in baseline_models.items():
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_test)[:,1]
        test_auc = roc_auc_score(y_test, y_pred_proba)
        baseline_results[name] = {'test_auc': test_auc, 'model': model, 'y_pred_proba': y_pred_proba}
        print(f"{name}: {test_auc:.4f} AUC")

    # Advanced models
    advanced_models = {
        'SVM (RBF)': Pipeline([('scaler', StandardScaler()), ('classifier', SVC(kernel='rbf', probability=True, random_state=42))]),
        'SVM (Linear)': Pipeline([('scaler', StandardScaler()), ('classifier', SVC(kernel='linear', probability=True, random_state=42))]),
        'Random Forest': Pipeline([('classifier', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10))]),
        'Gradient Boosting': Pipeline([('scaler', StandardScaler()), ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))]),
        'KNN': Pipeline([('scaler', StandardScaler()), ('classifier', KNeighborsClassifier(n_neighbors=5, weights='distance'))]),
        'Naive Bayes': Pipeline([('scaler', StandardScaler()), ('classifier', GaussianNB())])
    }

    advanced_results = {}
    for name, model in advanced_models.items():
        try:
            cv_auc = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:,1] if hasattr(model, 'predict_proba') else y_pred
            test_auc = roc_auc_score(y_test, y_pred_proba)
            test_f1 = f1_score(y_test, y_pred)
            test_accuracy = accuracy_score(y_test, y_pred)
            advanced_results[name] = {'cv_auc_mean': cv_auc.mean(),
                                      'cv_auc_std': cv_auc.std(),
                                      'test_auc': test_auc,
                                      'test_f1': test_f1,
                                      'test_accuracy': test_accuracy,
                                      'y_pred_proba': y_pred_proba,
                                      'model': model}
            print(f"{name}: {test_auc:.4f} AUC, F1 {test_f1:.4f}")
        except Exception as e:
            print(f"{name} failed: {e}")
            continue

    # Combine all results
    all_results = {**baseline_results, **advanced_results}
    return X_train, X_test, y_train, y_test, all_results





In [15]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif

def analyze_feature_importance(df, target_col='Churn'):
    """
    Converts all features to numeric, then prints:
      1. Mutual Information scores
      2. Correlations with target
    """
    # --------------------------
    # 1. Convert features to numeric
    # --------------------------
    df_numeric = df.copy()
    for col in df_numeric.columns:
        if df_numeric[col].dtype == 'bool' or df_numeric[col].dtype.name == 'category':
            df_numeric[col] = df_numeric[col].astype(int)
        elif df_numeric[col].dtype == 'object':
            df_numeric[col] = pd.factorize(df_numeric[col])[0]

    # --------------------------
    # 2. Separate target
    # --------------------------
    y = df_numeric[target_col]
    X = df_numeric.drop(columns=[target_col])

    # --------------------------
    # 3. Mutual Information scores
    # --------------------------
    selector = SelectKBest(score_func=mutual_info_classif, k='all')
    selector.fit(X, y)
    mi_scores = pd.DataFrame({
        'Feature': X.columns,
        'MI_Score': selector.scores_
    }).sort_values(by='MI_Score', ascending=False)

    print("=== Mutual Information Scores ===")
    print(mi_scores)

    # --------------------------
    # 4. Correlation with target
    # --------------------------
    corr_with_target = X.corrwith(y).sort_values(ascending=False)
    print("\n=== Correlations with Target ===")
    for feature, corr_val in corr_with_target.items():
        print(f"{feature}: {corr_val:.6f}")

    return mi_scores, corr_with_target

# Example usage:
mi_scores, corr_with_target = analyze_feature_importance(df, target_col='Churn')


=== Mutual Information Scores ===
                                  Feature  MI_Score
2                                  tenure  0.068766
18                      Contract_Two year  0.065687
3                          MonthlyCharges  0.047859
4                            TotalCharges  0.044007
10            InternetService_Fiber optic  0.043970
21         PaymentMethod_Electronic check  0.041372
23                  NoInternetServiceFlag  0.027376
17                      Contract_One year  0.019281
11                     OnlineSecurity_Yes  0.019160
14                        TechSupport_Yes  0.018466
19                   PaperlessBilling_Yes  0.017790
6                          Dependents_Yes  0.011510
20  PaymentMethod_Credit card (automatic)  0.010497
1                           SeniorCitizen  0.009988
16                    StreamingMovies_Yes  0.008576
15                        StreamingTV_Yes  0.007532
5                             Partner_Yes  0.007142
12                       Onlin

In [16]:
def process_churn_data_with_correlation(df, target_col='Churn'):
    """
    Full churn pipeline: feature engineering + correlation printout + feature selection
    """
    # Apply feature engineering
    df_engineered = engineer_churn_features(df)

    # Separate target
    if target_col in df_engineered.columns:
        y = df_engineered[target_col]
        X = df_engineered.drop(target_col, axis=1)

        # Select numeric columns
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        X_numeric = X[numeric_cols]

        # -------------------------------
        # 1️⃣ Correlation with target
        # -------------------------------
        corr_with_target = X_numeric.corrwith(y).sort_values(ascending=False)
        print("Feature correlation with target (Churn):")
        print(corr_with_target)

        # -------------------------------
        # 2️⃣ Feature selection
        # -------------------------------
        X_selected, selected_features, feature_importance = select_best_features(
            X_numeric, y, k=min(20, len(numeric_cols)), method='mutual_info'
        )

        print("\nTop 10 features by importance (mutual_info):")
        print(feature_importance.head(10))

        return X_selected, selected_features, feature_importance, corr_with_target

    return df_engineered, None, None, None


In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
import pandas as pd
import numpy as np

# --- 1️⃣ Mutual Information with target ---
mi_target = mutual_info_classif(X_train, y_train, discrete_features='auto', random_state=42)
mi_target_df = pd.DataFrame({
    "feature": X_train.columns,
    "MI_with_target": mi_target
}).sort_values(by="MI_with_target", ascending=False)
print("🔹 Mutual Information with target:")
print(mi_target_df)

# --- 2️⃣ Mutual Information between features ---
mi_matrix = pd.DataFrame(index=X_train.columns, columns=X_train.columns, dtype=float)

for f1 in X_train.columns:
    for f2 in X_train.columns:
        if f1 == f2:
            mi_matrix.loc[f1, f2] = 0
        else:
            # Use regression MI since features can be continuous/binary
            mi_matrix.loc[f1, f2] = mutual_info_regression(X_train[[f1]], X_train[f2])[0]

print("\n🔹 Mutual Information between features (pairwise):")
print(mi_matrix)

# --- 3️⃣ Flag highly redundant features ---
threshold = 0.8  # adjust based on your dataset
redundant_pairs = []

for i in mi_matrix.index:
    for j in mi_matrix.columns:
        if i != j and mi_matrix.loc[i,j] > threshold:
            redundant_pairs.append((i,j,mi_matrix.loc[i,j]))

redundant_df = pd.DataFrame(redundant_pairs, columns=["Feature1", "Feature2", "MI"])
redundant_df = redundant_df.sort_values(by="MI", ascending=False)
print("\n🔹 Highly redundant feature pairs (MI > {:.2f}):".format(threshold))
print(redundant_df)


🔹 Mutual Information with target:
                     feature  MI_with_target
12      FiberElectronicCombo        0.151824
5   PaymentMethod_Electronic        0.145922
6           Contract_TwoYear        0.131499
4      InternetService_Fiber        0.120153
3              SeniorCitizen        0.104264
11       MonthlyToTotalRatio        0.035344
13          HighRiskCustomer        0.018273
8                StreamingTV        0.013074
14         ServiceComplexity        0.009787
2               TotalCharges        0.003072
1             MonthlyCharges        0.000000
0                     tenure        0.000000
10         AvgMonthlyRevenue        0.000000
7           PaperlessBilling        0.000000
9             OnlineSecurity        0.000000

🔹 Mutual Information between features (pairwise):
                            tenure  MonthlyCharges  TotalCharges  \
tenure                    0.000000        0.004721      0.821943   
MonthlyCharges            0.004721        0.000000      0.2

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

# ===========================
# 1️⃣ FEATURE ENGINEERING
# ===========================
def engineer_features(df):
    df_new = df.copy()
    df_new['AvgMonthlyRevenue'] = df_new['TotalCharges'] / df_new['tenure']
    df_new['MonthlyToTotalRatio'] = df_new['MonthlyCharges'] / (df_new['TotalCharges'] + 1)
    df_new['FiberElectronicCombo'] = df_new['InternetService_Fiber'] * df_new['PaymentMethod_Electronic']
    df_new['HighRiskCustomer'] = (df_new['SeniorCitizen'] * df_new['InternetService_Fiber']).astype(int)
    df_new['ServiceComplexity'] = df_new[['StreamingTV', 'OnlineSecurity', 'PaperlessBilling']].sum(axis=1)
    return df_new

# ===========================
# 2️⃣ FEATURE IMPORTANCE / REDUNDANCY
# ===========================
def feature_analysis(df, target_col='Churn', redundancy_threshold=0.8):
    df_numeric = df.copy()
    for col in df_numeric.columns:
        if df_numeric[col].dtype == 'object' or df_numeric[col].dtype.name == 'category':
            df_numeric[col] = pd.factorize(df_numeric[col])[0]
    X = df_numeric.drop(columns=[target_col])
    y = df_numeric[target_col]

    # Mutual Information
    mi_scores = pd.DataFrame({
        "feature": X.columns,
        "MI_with_target": mutual_info_classif(X, y, discrete_features='auto', random_state=42)
    }).sort_values(by="MI_with_target", ascending=False)

    # Correlation
    corr_with_target = X.corrwith(y).sort_values(ascending=False)

    # Pairwise MI redundancy
    mi_matrix = pd.DataFrame(index=X.columns, columns=X.columns, dtype=float)
    for f1 in X.columns:
        for f2 in X.columns:
            mi_matrix.loc[f1, f2] = 0 if f1==f2 else mutual_info_regression(X[[f1]], X[f2])[0]

    redundant_pairs = [(i,j,mi_matrix.loc[i,j]) for i in mi_matrix.index for j in mi_matrix.columns if i!=j and mi_matrix.loc[i,j] > redundancy_threshold]
    redundant_df = pd.DataFrame(redundant_pairs, columns=["Feature1","Feature2","MI"]).sort_values(by="MI", ascending=False)

    return mi_scores, corr_with_target, redundant_df

# ===========================
# 3️⃣ FEATURE SELECTION
# ===========================
def select_features(X, y, k=20):
    selector = SelectKBest(mutual_info_classif, k=min(k,X.shape[1]))
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'score': selector.scores_,
        'selected': selector.get_support()
    }).sort_values('score', ascending=False)
    return X_selected, selected_features, feature_importance

# ===========================
# 4️⃣ FULL PIPELINE WITH MODEL PERFORMANCE
# ===========================
def run_churn_pipeline(df, target_col='Churn', feature_select_k=20):
    print("🚀 Starting Churn Pipeline...")
    df_eng = engineer_features(df)

    print("\n🔹 Feature Analysis:")
    mi_scores, corr_with_target, redundant_df = feature_analysis(df_eng, target_col=target_col)
    print(mi_scores.head(10))
    print(corr_with_target.head(10))
    print(redundant_df.head(10))

    X = df_eng.drop(columns=[target_col])
    y = df_eng[target_col]
    X_selected, selected_features, feature_importance = select_features(X, y, k=feature_select_k)
    print("\n🔹 Top Features After Selection:")
    print(feature_importance.head(10))

    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Baseline Models
    baseline_models = {
        'Lasso Logistic': LogisticRegression(penalty='l1', solver='liblinear', random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, random_state=42)
    }

    print("\n📊 Baseline Model Performance:")
    baseline_results = {}
    for name, model in baseline_models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:,1]
        test_auc = roc_auc_score(y_test, y_pred_proba)
        test_f1 = f1_score(y_test, y_pred)
        test_acc = accuracy_score(y_test, y_pred)
        baseline_results[name] = {'AUC': test_auc, 'F1': test_f1, 'Accuracy': test_acc}
        print(f"{name}: AUC={test_auc:.4f}, F1={test_f1:.4f}, Accuracy={test_acc:.4f}")

    # Advanced Models
    advanced_models = {
        'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
        'SVM (Linear)': SVC(kernel='linear', probability=True, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, weights='distance'),
        'Naive Bayes': GaussianNB()
    }

    print("\n🧠 Advanced Model Performance:")
    advanced_results = {}
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in advanced_models.items():
        try:
            cv_auc = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            y_pred_proba = model.predict_proba(X_test_scaled)[:,1] if hasattr(model, 'predict_proba') else y_pred
            test_auc = roc_auc_score(y_test, y_pred_proba)
            test_f1 = f1_score(y_test, y_pred)
            test_acc = accuracy_score(y_test, y_pred)
            advanced_results[name] = {
                'CV_AUC_mean': cv_auc.mean(),
                'CV_AUC_std': cv_auc.std(),
                'AUC': test_auc,
                'F1': test_f1,
                'Accuracy': test_acc
            }
            print(f"{name}: AUC={test_auc:.4f}, F1={test_f1:.4f}, Accuracy={test_acc:.4f}")
        except Exception as e:
            print(f"{name} failed: {str(e)[:50]}...")
            continue

    return {
        "engineered_data": df_eng,
        "mi_scores": mi_scores,
        "correlation": corr_with_target,
        "redundant_features": redundant_df,
        "selected_features": selected_features,
        "feature_importance": feature_importance,
        "baseline_results": baseline_results,
        "advanced_results": advanced_results
    }



In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, f1_score

np.random.seed(42)

print("="*80)
print("Recreating your dataset and baseline results...")

# Create sample dataset
n_samples = 1000
X_base, y = make_classification(n_samples=n_samples, n_features=10, n_informative=8,
                               n_redundant=2, n_clusters_per_class=1, random_state=42)
feature_names = ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen',
                 'InternetService_Fiber', 'PaymentMethod_Electronic', 'Contract_TwoYear',
                 'PaperlessBilling', 'StreamingTV', 'OnlineSecurity']
df_sample = pd.DataFrame(X_base, columns=feature_names)
df_sample['Churn'] = y

# Apply simple feature engineering
df_sample['tenure'] = np.abs(df_sample['tenure'])*10+1
df_sample['MonthlyCharges'] = np.abs(df_sample['MonthlyCharges'])*20+30
df_sample['TotalCharges'] = df_sample['MonthlyCharges'] * df_sample['tenure'] + np.random.normal(0,100,n_samples)

binary_features = ['SeniorCitizen', 'InternetService_Fiber', 'PaymentMethod_Electronic',
                  'Contract_TwoYear', 'PaperlessBilling', 'StreamingTV', 'OnlineSecurity']
for col in binary_features:
    df_sample[col] = (df_sample[col] > df_sample[col].median()).astype(int)

X = df_sample.drop('Churn', axis=1)
y = df_sample['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print(f"✓ Dataset ready: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples\n")

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline models
baseline_models = {
    'Lasso Logistic': LogisticRegression(penalty='l1', solver='liblinear', random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, random_state=42)
}

print("📊 BASELINE PERFORMANCE ASSESSMENT")
print("="*50)
baseline_results = {}
for name, model in baseline_models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:,1]
    auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    baseline_results[name] = {'test_auc': auc, 'test_f1': f1}
    print(f"{name}: {auc:.4f} AUC, F1={f1:.4f}")

best_baseline_auc = max([r['test_auc'] for r in baseline_results.values()])
print(f"\n🎯 Best Baseline AUC: {best_baseline_auc:.4f}\n")

# Advanced models
advanced_models = {
    'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
    'SVM (Linear)': SVC(kernel='linear', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'Naive Bayes': GaussianNB()
}

print("🧠 TESTING ADVANCED MODELS FOR CHURN PREDICTION")
print("="*60)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
advanced_results = {}
for name, model in advanced_models.items():
    print(f"\nTesting {name}...", end="")
    try:
        cv_auc = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:,1] if hasattr(model, 'predict_proba') else y_pred
        test_auc = roc_auc_score(y_test, y_pred_proba)
        test_f1 = f1_score(y_test, y_pred)
        test_acc = (y_pred==y_test).mean()
        advanced_results[name] = {'test_auc': test_auc, 'test_f1': test_f1, 'accuracy': test_acc}
        print(f" ✓ AUC: {test_auc:.4f}, F1: {test_f1:.4f}")
    except Exception as e:
        print(f" ✗ Failed: {str(e)[:50]}...")


Recreating your dataset and baseline results...
✓ Dataset ready: 800 training samples, 200 test samples

📊 BASELINE PERFORMANCE ASSESSMENT
Lasso Logistic: 0.9403 AUC, F1=0.8657
Neural Network: 0.9470 AUC, F1=0.8976

🎯 Best Baseline AUC: 0.9470

🧠 TESTING ADVANCED MODELS FOR CHURN PREDICTION

Testing SVM (RBF)... ✓ AUC: 0.9771, F1: 0.9268

Testing SVM (Linear)... ✓ AUC: 0.9221, F1: 0.8683

Testing Random Forest... ✓ AUC: 0.9525, F1: 0.8824

Testing Gradient Boosting... ✓ AUC: 0.9597, F1: 0.8812

Testing K-Nearest Neighbors... ✓ AUC: 0.9549, F1: 0.9163

Testing Naive Bayes... ✓ AUC: 0.9018, F1: 0.8223


In [27]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.metrics import precision_score, recall_score

# Extend baseline with Lasso (already included), Ridge, and standard Logistic
baseline_models = {
    'Logistic Regression': LogisticRegression(penalty=None, solver='lbfgs', random_state=42),
    'Lasso Logistic': LogisticRegression(penalty='l1', solver='liblinear', random_state=42),
    'Ridge Logistic': RidgeClassifier(random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,50), max_iter=500, random_state=42)
}

print("📊 FULL BASELINE + EXTENDED MODELS PERFORMANCE")
print("="*60)

metrics_results = []

for name, model in baseline_models.items():
    # Fit model
    if name == 'Ridge Logistic':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = y_pred  # RidgeClassifier does not have predict_proba
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:,1]

    # Metrics
    auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    metrics_results.append({
        "Model": name,
        "AUC": auc,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

# Convert to DataFrame for display
metrics_df = pd.DataFrame(metrics_results).sort_values(by="AUC", ascending=False)
print(metrics_df.round(4))


📊 FULL BASELINE + EXTENDED MODELS PERFORMANCE
                 Model     AUC  Accuracy  Precision  Recall      F1
3       Neural Network  0.9470     0.895     0.8762    0.92  0.8976
1       Lasso Logistic  0.9403     0.865     0.8614    0.87  0.8657
0  Logistic Regression  0.9390     0.865     0.8614    0.87  0.8657
2       Ridge Logistic  0.8850     0.885     0.8598    0.92  0.8889


In [23]:
print("✅ Features used for the churn prediction models:")
for i, f in enumerate(feature_names, 1):
    print(f"{i:2d}. {f}")
print("\nTarget column: 'Churn'")


✅ Features used for the churn prediction models:
 1. tenure
 2. MonthlyCharges
 3. TotalCharges
 4. SeniorCitizen
 5. InternetService_Fiber
 6. PaymentMethod_Electronic
 7. Contract_TwoYear
 8. PaperlessBilling
 9. StreamingTV
10. OnlineSecurity

Target column: 'Churn'
