In [6]:
# PHASE 1: Data Prep
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the dataset
df = pd.read_csv('cleaned_gym_churn_us.csv')

# Drop the index column if it exists
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

# Convert all boolean columns to integers (True=1, False=0)
for col in df.select_dtypes(include='bool').columns:
    df[col] = df[col].astype(int)

# Separate features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn'] # Target variable

# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Identify numerical columns for scaling
numerical_cols = ['Age', 'Avg_additional_charges_total', 'Month_to_end_contract',
                  'Lifetime', 'Avg_class_frequency_total', 'Avg_class_frequency_current_month']

# Apply Standard Scaling
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("Preprocessing complete. Data is ready for modeling.\n")






Preprocessing complete. Data is ready for modeling.



In [7]:

# PHASE 2: Modeling, Evaluation, and Selection


# Dictionary of two to three models to train
models = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
    'Random Forest Classifier': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42, n_estimators=100)
}

# Dictionary to store AUC scores for comparison
auc_results = {}

print("--- Model Training and Evaluation ---")

for name, model in models.items():
    print(f"\nTraining and Evaluating: {name}")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions and get probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] # Probability of the positive class (churn=1)

    # 1. Classification Report (Precision, Recall, F1-score)
    print("Classification Report:")
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose().round(4)
    print(report_df.to_markdown())

    # 2. AUC-ROC Score
    auc_score = roc_auc_score(y_test, y_proba)
    auc_results[name] = auc_score
    print(f"AUC-ROC Score: {auc_score:.4f}")
    print("-" * 50)


--- Model Training and Evaluation ---

Training and Evaluating: Logistic Regression
Classification Report:
|              |   precision |   recall |   f1-score |   support |
|:-------------|------------:|---------:|-----------:|----------:|
| 0            |      0.9372 |   0.9649 |     0.9508 |  882      |
| 1            |      0.8938 |   0.8208 |     0.8557 |  318      |
| accuracy     |      0.9267 |   0.9267 |     0.9267 |    0.9267 |
| macro avg    |      0.9155 |   0.8928 |     0.9033 | 1200      |
| weighted avg |      0.9257 |   0.9267 |     0.9256 | 1200      |
AUC-ROC Score: 0.9745
--------------------------------------------------

Training and Evaluating: Random Forest Classifier
Classification Report:
|              |   precision |   recall |   f1-score |   support |
|:-------------|------------:|---------:|-----------:|----------:|
| 0            |      0.9276 |   0.958  |     0.9426 |  882      |
| 1            |      0.872  |   0.7925 |     0.8303 |  318      |
| accurac

In [8]:

# PHASE 3: Identify the Best Model

print("\n--- Model Comparison (Based on AUC-ROC) ---")
auc_series = pd.Series(auc_results).sort_values(ascending=False)
best_model_name = auc_series.index[0]
print(auc_series.to_markdown())

print(f"\n**Conclusion:** The **{best_model_name}** is the best model for this task, achieving the highest AUC-ROC score of **{auc_series.iloc[0]:.4f}**.")


--- Model Comparison (Based on AUC-ROC) ---
|                              |        0 |
|:-----------------------------|---------:|
| Gradient Boosting Classifier | 0.977053 |
| Logistic Regression          | 0.974522 |
| Random Forest Classifier     | 0.967823 |

**Conclusion:** The **Gradient Boosting Classifier** is the best model for this task, achieving the highest AUC-ROC score of **0.9771**.


In [9]:

# PHASE 4: Feature Importance for the Best Model


# Feature Importance is key for business understanding
if best_model_name in ['Random Forest Classifier', 'Gradient Boosting Classifier']:
    best_model = models[best_model_name]
    print(f"\n--- Feature Importance for the Best Model ({best_model_name}) ---")

    feature_importance = pd.Series(
        best_model.feature_importances_,
        index=X_train.columns
    ).sort_values(ascending=False)
    print(feature_importance.head(10).to_markdown())
elif best_model_name == 'Logistic Regression':
    best_model = models[best_model_name]
    print(f"\n--- Top 10 Features (Coefficients) for the Best Model ({best_model_name}) ---")
    coef_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': best_model.coef_[0]
    })
    # Rank by absolute magnitude of coefficient
    coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
    top_10_features = coef_df.sort_values(by='Abs_Coefficient', ascending=False).drop(columns='Abs_Coefficient').head(10)
    print(top_10_features.to_markdown(index=False))


--- Feature Importance for the Best Model (Gradient Boosting Classifier) ---
|                                   |           0 |
|:----------------------------------|------------:|
| Lifetime                          | 0.453882    |
| Avg_class_frequency_current_month | 0.16207     |
| Age                               | 0.124004    |
| Avg_class_frequency_total         | 0.111115    |
| Month_to_end_contract             | 0.10683     |
| Avg_additional_charges_total      | 0.0201832   |
| Contract_period                   | 0.0143721   |
| Group_visits                      | 0.00658801  |
| Promo_friends                     | 0.000830871 |
| gender                            | 0.000125179 |
