# In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib
import warnings
# Suppress every warning for the remainder of the run to keep console output
# compact. NOTE(review): this also hides library deprecation notices; consider
# narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')

# Plot appearance: reset matplotlib to its default style first, then apply
# seaborn's "husl" palette on top of it (the order matters).
plt.style.use('default')
sns.set_palette(palette="husl")

# Title banner framed by two 60-character rules.
banner_rule = "=" * 60
print(banner_rule, "CUSTOMER CHURN PREDICTION - ML PIPELINE", banner_rule, sep="\n")


print("\n1. DATA LOADING")
print("-" * 30)

# The upload widget comes from google.colab, so this section only runs inside
# a Colab runtime.
from google.colab import files
import io

print("Please upload your Telco Churn dataset (CSV file):")
uploaded = files.upload()

# The result is used below as a mapping of filename -> raw file bytes. If the
# upload dialog was dismissed the mapping is empty; fail with a clear message
# instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; a CSV dataset is required to continue.")

# Only the first uploaded file is read; any additional files are ignored.
filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[filename]))

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

print("\n2. EXPLORATORY DATA ANALYSIS")
print("-" * 30)

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\nFirst few rows:")
print(df.head())

print("\nMissing values:")
print(df.isnull().sum())

print("\nTarget variable distribution:")
# Resolve the target column in priority order:
#   1. a column named exactly 'Churn'
#   2. a column named exactly 'churn'
#   3. the first column whose name contains 'churn' (case-insensitive)
#   4. the last column of the frame as a last resort
if 'Churn' in df.columns:
    target_col = 'Churn'
elif 'churn' in df.columns:
    target_col = 'churn'
else:
    # str() keeps the lookup safe when a column label is not a string.
    target_col = next(
        (col for col in df.columns if 'churn' in str(col).lower()),
        df.columns[-1],
    )

print(f"Target column identified: {target_col}")

# Count each class once and reuse the result for the printout and both plots.
churn_counts = df[target_col].value_counts()
print(churn_counts)

# Side-by-side view of the class balance: absolute counts and percentages.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
churn_counts.plot(kind='bar')
plt.title('Churn Distribution')
plt.xlabel('Churn')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
churn_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Churn Distribution (%)')

plt.tight_layout()
plt.show()


print("\n3. DATA PREPROCESSING SETUP")
print("-" * 30)

# Work on a copy so the raw upload in `df` stays untouched.
df_processed = df.copy()

# 'TotalCharges' is converted to numbers; values that cannot be parsed become
# NaN (errors='coerce') and are then replaced by the column median.
if 'TotalCharges' in df_processed.columns:
    total_charges = pd.to_numeric(df_processed['TotalCharges'], errors='coerce')
    # Assign the filled Series back rather than calling fillna(inplace=True)
    # on a column selection: that chained in-place form is deprecated and has
    # no effect on the frame once pandas Copy-on-Write is active.
    df_processed['TotalCharges'] = total_charges.fillna(total_charges.median())

# Split the feature columns by dtype; the target is excluded from both lists.
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numeric_cols:
    numeric_cols.remove(target_col)

categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

# Drop identifier columns. Each one must be removed from BOTH column lists:
# a numeric ID left behind in numeric_cols would make the ColumnTransformer
# reference a column that no longer exists in the feature matrix.
id_columns = ['customerID', 'customer_id', 'id', 'Customer_ID']
for col in id_columns:
    # Never drop the target, even if the fallback detection picked an
    # ID-like column name for it.
    if col in df_processed.columns and col != target_col:
        df_processed = df_processed.drop(columns=col)
        for column_list in (numeric_cols, categorical_cols):
            if col in column_list:
                column_list.remove(col)
        print(f"Removed ID column: {col}")

# Feature matrix and target vector.
X = df_processed.drop(columns=target_col)
y = df_processed[target_col]

# A text target is mapped to integer codes; the mapping is printed so the
# meaning of each code stays visible.
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)
    print(f"Target encoded: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

print(f"\nFinal feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")


print("\n4. PIPELINE CONSTRUCTION")
print("-" * 30)

# Numeric features are standardised; categorical features are one-hot encoded
# with the first level of each feature dropped, dense output, and categories
# unseen during fit ignored.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    drop='first',
)

# (name, transformer, columns) triples; columns in neither list are passed
# through unchanged by remainder='passthrough'.
column_steps = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

for summary_line in (
    "Preprocessing pipeline created:",
    "- Numeric features: StandardScaler",
    "- Categorical features: OneHotEncoder",
):
    print(summary_line)


print("\n5. MODEL PIPELINE CREATION")
print("-" * 30)

# Two candidate classifiers, both seeded for reproducibility.
logistic_classifier = LogisticRegression(max_iter=1000, random_state=42)
forest_classifier = RandomForestClassifier(n_jobs=-1, random_state=42)

# Each pipeline chains the preprocessing step with one classifier. Both
# pipelines reference the same `preprocessor` object.
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logistic_classifier),
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest_classifier),
])

for summary_line in (
    "Model pipelines created:",
    "1. Logistic Regression Pipeline",
    "2. Random Forest Pipeline",
):
    print(summary_line)


print("\n6. TRAIN-TEST SPLIT")
print("-" * 30)

# 80/20 split, stratified on the target so both parts keep the same class
# balance; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
print(f"Training set size: {train_rows} samples")
print(f"Test set size: {test_rows} samples")

# The mean of the target is reported as the churn rate of each part.
print(f"Training set churn rate: {y_train.mean():.3f}")
print(f"Test set churn rate: {y_test.mean():.3f}")


print("\n7. HYPERPARAMETER TUNING")
print("-" * 30)

# Search spaces; the 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'.
logistic_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],
}

rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}


def _run_grid_search(pipeline, param_grid):
    """Fit a 5-fold, ROC-AUC-scored grid search on the training split and return it."""
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    return search


print("Performing GridSearchCV for Logistic Regression...")
logistic_grid = _run_grid_search(logistic_pipeline, logistic_param_grid)

print("\nPerforming GridSearchCV for Random Forest...")
rf_grid = _run_grid_search(rf_pipeline, rf_param_grid)

# Report the winning parameter combination and its mean CV score per model.
print("\nBest parameters found:")
for heading, fitted_search in (
    ("Logistic Regression:", logistic_grid),
    ("\nRandom Forest:", rf_grid),
):
    print(heading)
    print(fitted_search.best_params_)
    print(f"Best CV Score: {fitted_search.best_score_:.4f}")


print("\n8. MODEL EVALUATION")
print("-" * 30)

# Best pipeline found by each grid search.
best_logistic = logistic_grid.best_estimator_
best_rf = rf_grid.best_estimator_

models = dict([
    ('Logistic Regression', best_logistic),
    ('Random Forest', best_rf),
])

# Per-model test-set outputs, keyed by display name.
results = {}

for model_label, fitted_pipeline in models.items():
    hard_labels = fitted_pipeline.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC/AUC; presumably
    # this is the positive (churn) class - verify against the label encoding.
    positive_scores = fitted_pipeline.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, positive_scores)

    results[model_label] = dict(
        model=fitted_pipeline,
        predictions=hard_labels,
        probabilities=positive_scores,
        auc_score=test_auc,
    )

    print(f"\n{model_label} Results:")
    print(f"AUC Score: {test_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, hard_labels))


print("\n9. RESULTS VISUALIZATION")
print("-" * 30)

# One row of three panels: ROC curves, then one confusion matrix per model.
plt.figure(figsize=(15, 5))

# Panel 1: a ROC curve per model plus the chance diagonal for reference.
roc_ax = plt.subplot(1, 3, 1)
for model_label, outcome in results.items():
    fpr, tpr, _ = roc_curve(y_test, outcome['probabilities'])
    roc_ax.plot(fpr, tpr, label=f"{model_label} (AUC = {outcome['auc_score']:.3f})")

roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)

# Panels 2 and 3: confusion matrices, in the iteration order of `results`.
for panel_index, (model_label, outcome) in enumerate(results.items(), start=2):
    cm_ax = plt.subplot(1, 3, panel_index)
    cm = confusion_matrix(y_test, outcome['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_title(f'{model_label}\nConfusion Matrix')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()


print("\n10. FINAL MODEL SELECTION")
print("-" * 30)

# The winner is the model with the highest test-set AUC.
# NOTE(review): selecting on the test set makes the reported test AUC an
# optimistic estimate; consider selecting on the grid-search CV score instead.
best_model_name = max(results, key=lambda label: results[label]['auc_score'])
winner = results[best_model_name]
best_model = winner['model']

print(f"Best model selected: {best_model_name}")
print(f"Best AUC Score: {winner['auc_score']:.4f}")


print("\n11. CROSS-VALIDATION")
print("-" * 30)

# 5-fold ROC-AUC of the selected pipeline on the training split only.
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)
print(f"Cross-validation AUC scores: {cv_scores}")

# The spread is reported as two standard deviations around the mean.
mean_auc = cv_scores.mean()
auc_spread = cv_scores.std() * 2
print(f"Mean CV AUC: {mean_auc:.4f} (+/- {auc_spread:.4f})")


print("\n12. FEATURE IMPORTANCE")
print("-" * 30)

# Importances are only reported when the Random Forest was selected.
if best_model_name == 'Random Forest':
    fitted_preprocessor = best_model.named_steps['preprocessor']

    # Ask the fitted ColumnTransformer for its output column names instead of
    # rebuilding them by hand: this also covers columns forwarded by
    # remainder='passthrough' and transformers that received no columns, so
    # the name list always matches the classifier's input width.
    # The names come back as "<transformer>__<feature>"; the prefix is removed
    # to show the plain feature name.
    feature_names = [
        qualified_name.split('__', 1)[-1]
        for qualified_name in fitted_preprocessor.get_feature_names_out()
    ]

    importances = best_model.named_steps['classifier'].feature_importances_
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("Top 10 Most Important Features:")
    print(feature_importance_df.head(10))

    # Horizontal bar chart of the 15 highest-ranked features.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance_df.head(15), x='importance', y='feature')
    plt.title('Top 15 Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

print("\n13. PIPELINE EXPORT")
print("-" * 30)

# Persist the selected, fitted pipeline (preprocessing + classifier).
pipeline_filename = 'churn_prediction_pipeline.pkl'
joblib.dump(best_model, pipeline_filename)
print(f"Pipeline saved as: {pipeline_filename}")

# Companion metadata describing the model and the columns it was built from.
selected_entry = results[best_model_name]
metadata = dict(
    model_name=best_model_name,
    auc_score=selected_entry['auc_score'],
    feature_columns=list(X.columns),
    target_column=target_col,
    numeric_columns=numeric_cols,
    categorical_columns=categorical_cols,
    model_params=best_model.get_params(),
)

metadata_filename = 'pipeline_metadata.pkl'
joblib.dump(metadata, metadata_filename)
print(f"Metadata saved as: {metadata_filename}")


print("\n14. PIPELINE USAGE EXAMPLE")
print("-" * 30)

# Snippet shown to the user; it is printed, not executed.
usage_snippet = """
# Load the pipeline
loaded_pipeline = joblib.load('churn_prediction_pipeline.pkl')
loaded_metadata = joblib.load('pipeline_metadata.pkl')

# Make predictions on new data
# new_data should have the same structure as training data
# predictions = loaded_pipeline.predict(new_data)
# probabilities = loaded_pipeline.predict_proba(new_data)[:, 1]
"""
print("Loading and using the saved pipeline:")
print(usage_snippet)

# Demonstrate the selected pipeline on the first three test rows.
sample_data = X_test.head(3)
sample_predictions = best_model.predict(sample_data)
sample_probabilities = best_model.predict_proba(sample_data)[:, 1]

print("Sample predictions on test data:")
for sample_number, (predicted_label, churn_probability) in enumerate(
    zip(sample_predictions, sample_probabilities), start=1
):
    print(f"Sample {sample_number}: Prediction = {predicted_label}, Probability = {churn_probability:.3f}")

# Closing summary.
closing_rule = "=" * 60
print("\n" + closing_rule)
print("PIPELINE CONSTRUCTION COMPLETE!")
print(closing_rule)
print(f"Best Model: {best_model_name}")
print(f"AUC Score: {results[best_model_name]['auc_score']:.4f}")
print("Files saved:")
print("- churn_prediction_pipeline.pkl")
print("- pipeline_metadata.pkl")
print(closing_rule)