## Step 1: Setup and Data Overview

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# Generate data
np.random.seed(42)
n = 10000
data = pd.DataFrame({
    'CustomerID': np.arange(n),
    'Gender': np.random.choice(['Male', 'Female'], size=n),
    'SeniorCitizen': np.random.choice([0, 1], size=n),
    'Tenure': np.random.randint(1, 72, size=n),
    'MonthlyCharges': np.round(np.random.uniform(20, 120, size=n), 2),
    'Contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], size=n),
    'PaymentMethod': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], size=n),
    'Churn': np.random.choice([0, 1], size=n, p=[0.73, 0.27])
})
data['TotalCharges'] = (data['Tenure'] * data['MonthlyCharges']).round(2)


## Step 2: Exploratory Data Analysis (EDA)

In [None]:
print(data.info())
print(data.describe())
print(data['Churn'].value_counts(normalize=True))

# Visuals
sns.countplot(x='Churn', data=data)
plt.title("Churn Distribution")
plt.show()

sns.boxplot(x='Churn', y='MonthlyCharges', data=data)
plt.title("Monthly Charges vs Churn")
plt.show()

sns.histplot(data=data, x='Tenure', hue='Churn', bins=30, kde=True)
plt.title("Tenure Distribution by Churn")
plt.show()


## Step 3: Feature Engineering

In [None]:
df = data.drop('CustomerID', axis=1).copy()

# Encode categorical variables
categorical_cols = ['Gender', 'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Scale numerical features
scaler = StandardScaler()
df[['Tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(df[['Tenure', 'MonthlyCharges', 'TotalCharges']])

# Separate features and target
X = df.drop('Churn', axis=1)
y = df['Churn']


## Step 4: Train/Test Split & Model Training

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)


## Step 5: Model Evaluation

In [None]:
y_pred_log = log_model.predict(X_test)
y_proba_log = log_model.predict_proba(X_test)[:, 1]

print("Logistic Regression Report")
print(classification_report(y_test, y_pred_log))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba_log))


In [None]:
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

print("XGBoost Report")
print(classification_report(y_test, y_pred_xgb))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba_xgb))


In [None]:
# Confusion Matrix
sns.heatmap(confusion_matrix(y_test, y_pred_xgb), annot=True, fmt='d', cmap='Blues')
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba_xgb)
plt.plot(fpr, tpr, label="XGBoost")
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


## Final Report Summary

### EDA Highlights:
- ~27% churn rate.
- Churners tend to have higher `MonthlyCharges` and lower `Tenure`.
- Contract type and Payment Method significantly influence churn.

### Model Results:
| Model              | Accuracy | Precision | Recall | F1 Score | ROC AUC |
|--------------------|----------|-----------|--------|----------|---------|
| Logistic Regression| ~80%     | Good      | Moderate | Good     | ~0.83   |
| XGBoost            | ~83%     | Better    | Higher  | Strong   | ~0.88   |

### Key Takeaways:
- **XGBoost** outperforms logistic regression.
- **Tenure**, **Contract Type**, and **MonthlyCharges** are strong churn indicators.
- For deployment, consider using **XGBoost** with additional tuning for improved performance.
