In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data with selected columns
df = pd.read_csv('gym_churn_us.csv', usecols=[
    'gender', 'Near_Location', 'Partner', 'Promo_friends', 'Phone',
    'Contract_period', 'Group_visits', 'Age', 'Avg_additional_charges_total',
    'Month_to_end_contract', 'Lifetime', 'Avg_class_frequency_total',
    'Avg_class_frequency_current_month', 'Churn'
])

# Handle missing values if necessary (here filling with mean for numerical columns)
df.fillna(df.mean(), inplace=True)

# Convert categorical columns to dummy variables
df = pd.get_dummies(df, drop_first=True)

# Split data into features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Train a random forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print(classification_report(y_test, y_pred))


Accuracy: 0.9141666666666667
ROC AUC: 0.871557557093502
              precision    recall  f1-score   support

           0       0.93      0.96      0.94       897
           1       0.86      0.79      0.82       303

    accuracy                           0.91      1200
   macro avg       0.90      0.87      0.88      1200
weighted avg       0.91      0.91      0.91      1200



In [3]:
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameters
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

# Set up grid search
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=3)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.910 total time=   2.0s
[CV 2/3] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.925 total time=   1.3s
[CV 3/3] END max_depth=None, min_samples_split=2, n_estimators=100;, score=0.909 total time=   1.3s
[CV 1/3] END max_depth=None, min_samples_split=2, n_estimators=200;, score=0.914 total time=   2.8s
[CV 2/3] END max_depth=None, min_samples_split=2, n_estimators=200;, score=0.930 total time=   2.8s
[CV 3/3] END max_depth=None, min_samples_split=2, n_estimators=200;, score=0.908 total time=   2.7s
[CV 1/3] END max_depth=None, min_samples_split=5, n_estimators=100;, score=0.911 total time=   1.3s
[CV 2/3] END max_depth=None, min_samples_split=5, n_estimators=100;, score=0.925 total time=   1.3s
[CV 3/3] END max_depth=None, min_samples_split=5, n_estimators=100;, score=0.910 total time=   1.8s
[CV 1/3] END max_depth=None, min_samples

In [4]:
# Feature importance in Random Forest
feature_importance = model.feature_importances_
features = X.columns

# Print the importance of each feature
for feature, importance in zip(features, feature_importance):
    print(f"{feature}: {importance}")

gender: 0.011145441891131631
Near_Location: 0.010750289176833572
Partner: 0.012039458047032204
Promo_friends: 0.011125006841781377
Phone: 0.006728057257887894
Contract_period: 0.06653511937954595
Group_visits: 0.015828532982270906
Age: 0.13518884872414122
Avg_additional_charges_total: 0.08254894733618195
Month_to_end_contract: 0.08080831168655343
Lifetime: 0.2766897073752608
Avg_class_frequency_total: 0.12398645170522567
Avg_class_frequency_current_month: 0.16662582759615344


In [5]:
import joblib

# Save the model to a file
joblib.dump(model, 'gym_churn_model.pkl')

['gym_churn_model.pkl']

In [7]:
import pandas as pd

# Assuming you have these variables from the previous analysis
# - X_test: the test dataset features
# - y_test: the actual labels (ground truth)
# - y_pred: the predicted values
# - feature_importance: the feature importance values
# - features: the feature names

# Convert predictions to DataFrame
df_predictions = pd.DataFrame(X_test, columns=features)
df_predictions['Actual_Churn'] = y_test
df_predictions['Predicted_Churn'] = y_pred

# If you want to include feature importance (optional)
df_feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Now you can combine all relevant information into a final DataFrame
final_df = df_predictions.copy()

# Save the final DataFrame to a CSV for Tableau
final_df.to_csv('final_gym_churn_analysis.csv', index=False)