In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


df = pd.read_csv("cleaned_gym_churn_us.csv")

# ====================================================================
# Data Cleaning (Removing unnecessary index column)
# ====================================================================
df_processed = df.drop('Unnamed: 0', axis=1)

# Note on "Creating dummy features":
# The boolean columns in this dataset (gender, Near_Location, Partner, etc.)
# are already in a dummy/encoded format (True/False) and thus satisfy the rubric requirement.
# No pd.get_dummies() is required.








In [5]:
# 1. Split data into training and testing subsets

# Define target variable (y) and features (X)
y = df_processed['Churn']
X = df_processed.drop('Churn', axis=1)

# Split the data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,         # 30% for testing
    random_state=42,       # For reproducibility
    stratify=y             # Maintains the Churn class balance in both sets
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (2800, 13)
X_test shape: (1200, 13)


In [6]:
# 2. Scale standardization

# Identify numerical columns for standardization
numerical_cols = [
    'Contract_period',
    'Age',
    'Avg_additional_charges_total',
    'Month_to_end_contract',
    'Lifetime',
    'Avg_class_frequency_total',
    'Avg_class_frequency_current_month'
]

# Separate numerical data for scaling
X_train_num = X_train[numerical_cols]
X_test_num = X_test[numerical_cols]

# Initialize the StandardScaler
scaler = StandardScaler()


In [7]:
# 1. Fit the scaler ONLY on the training data 
scaler.fit(X_train_num)

# 2. Transform both the training and testing data
X_train_scaled_array = scaler.transform(X_train_num)
X_test_scaled_array = scaler.transform(X_test_num)

# Convert the scaled arrays back to DataFrames
X_train_scaled = pd.DataFrame(
    X_train_scaled_array,
    columns=numerical_cols,
    index=X_train_num.index # Preserve original index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled_array,
    columns=numerical_cols,
    index=X_test_num.index
)


In [8]:
# Re-assemble the final training and testing datasets
# Drop the original numerical columns, then concatenate the scaled ones
X_train_final = X_train.drop(columns=numerical_cols).copy()
X_test_final = X_test.drop(columns=numerical_cols).copy()

X_train_final = pd.concat([X_train_final, X_train_scaled], axis=1)
X_test_final = pd.concat([X_test_final, X_test_scaled], axis=1)

print("\nStandardization Complete. Data is ready for model training.")
print(f"Final X_train columns: {X_train_final.columns.tolist()}")


Standardization Complete. Data is ready for model training.
Final X_train columns: ['gender', 'Near_Location', 'Partner', 'Promo_friends', 'Phone', 'Group_visits', 'Contract_period', 'Age', 'Avg_additional_charges_total', 'Month_to_end_contract', 'Lifetime', 'Avg_class_frequency_total', 'Avg_class_frequency_current_month']
