In [8]:
# Import pandas for data loading and wrangling
import pandas as pd

# Load the gym churn dataset from a CSV file located in the working directory.
df = pd.read_csv('gym_churn_us.csv')

# Preview the first rows as a left-aligned markdown table.
print("Original DataFrame head:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nOriginal DataFrame info:")
df.info()

Original DataFrame head:
| gender   | Near_Location   | Partner   | Promo_friends   | Phone   | Contract_period   | Group_visits   | Age   | Avg_additional_charges_total   | Month_to_end_contract   | Lifetime   | Avg_class_frequency_total   | Avg_class_frequency_current_month   | Churn   |
|:---------|:----------------|:----------|:----------------|:--------|:------------------|:---------------|:------|:-------------------------------|:------------------------|:-----------|:----------------------------|:------------------------------------|:--------|
| 1        | 1               | 1         | 1               | 0       | 6                 | 1              | 29    | 14.2275                        | 5                       | 3          | 0.0203979                   | 0                                   | 0       |
| 0        | 1               | 0         | 0               | 1       | 12                | 1              | 31    | 113.203                        | 12                      | 7 

In [9]:
# Count the null entries in every column and show the tally as a markdown table.
print("\nMissing values per column:")
null_counts = df.isnull().sum()
print(null_counts.to_markdown(numalign="left", stralign="left"))


Missing values per column:
|                                   | 0   |
|:----------------------------------|:----|
| gender                            | 0   |
| Near_Location                     | 0   |
| Partner                           | 0   |
| Promo_friends                     | 0   |
| Phone                             | 0   |
| Contract_period                   | 0   |
| Group_visits                      | 0   |
| Age                               | 0   |
| Avg_additional_charges_total      | 0   |
| Month_to_end_contract             | 0   |
| Lifetime                          | 0   |
| Avg_class_frequency_total         | 0   |
| Avg_class_frequency_current_month | 0   |
| Churn                             | 0   |


In [10]:
# Columns stored as 0/1 flags in the raw data; they are cast to boolean dtype below.
boolean_columns = [
    'gender',
    'Near_Location',
    'Partner',
    'Promo_friends',
    'Phone',
    'Group_visits',
    'Churn'
]

# astype(bool) turns every non-zero value (NaN included) into True, so verify the
# columns really contain only 0/1 before casting. Columns that are already boolean
# (e.g. when this cell is re-run) are accepted as-is.
non_binary_columns = [
    col for col in boolean_columns
    if df[col].dtype != bool and not df[col].isin([0, 1]).all()
]
if non_binary_columns:
    raise ValueError(f"Columns are not strictly binary (0/1): {non_binary_columns}")

# Cast all flag columns in one vectorised assignment instead of a per-column loop.
df[boolean_columns] = df[boolean_columns].astype(bool)

print("\nDataFrame head after boolean conversion:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) would emit.
print("\nDataFrame info after boolean conversion:")
df.info()


DataFrame head after boolean conversion:
| gender   | Near_Location   | Partner   | Promo_friends   | Phone   | Contract_period   | Group_visits   | Age   | Avg_additional_charges_total   | Month_to_end_contract   | Lifetime   | Avg_class_frequency_total   | Avg_class_frequency_current_month   | Churn   |
|:---------|:----------------|:----------|:----------------|:--------|:------------------|:---------------|:------|:-------------------------------|:------------------------|:-----------|:----------------------------|:------------------------------------|:--------|
| True     | True            | True      | True            | False   | 6                 | True           | 29    | 14.2275                        | 5                       | 3          | 0.0203979                   | 0                                   | False   |
| False    | True            | False     | False           | True    | 12                | True           | 31    | 113.203                        | 12         

In [11]:
# Plain-text preview of the first five rows after the dtype conversion.
print(df.head(n=5))

   gender  Near_Location  Partner  Promo_friends  Phone  Contract_period  \
0    True           True     True           True  False                6   
1   False           True    False          False   True               12   
2   False           True     True          False   True                1   
3   False           True     True           True   True               12   
4    True           True     True           True   True                1   

   Group_visits  Age  Avg_additional_charges_total  Month_to_end_contract  \
0          True   29                     14.227470                    5.0   
1          True   31                    113.202938                   12.0   
2         False   28                    129.448479                    1.0   
3          True   33                     62.669863                   12.0   
4         False   26                    198.362265                    1.0   

   Lifetime  Avg_class_frequency_total  Avg_class_frequency_current_month  \
0  

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define features (X) and target (y)
X = df.drop(columns='Churn')
y = df['Churn']

# Features to be one-hot encoded (each takes a small set of discrete values)
categorical_features_to_encode = ['Contract_period', 'Month_to_end_contract']

# One-hot encode the columns above; every other column is passed through unchanged.
preprocessor_encoding = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_to_encode)
    ],
    remainder='passthrough',
    # Always return a dense array so it can be wrapped in a DataFrame below,
    # regardless of how sparse the encoded output happens to be.
    sparse_threshold=0,
    # Keep bare column names (no "cat__" / "remainder__" prefixes).
    verbose_feature_names_out=False,
)

# Apply the encoding
X_encoded = preprocessor_encoding.fit_transform(X)

# Ask the fitted transformer for its output column names rather than rebuilding
# them by hand from an assumed column order.
all_feature_names_after_encoding = list(preprocessor_encoding.get_feature_names_out())

# Kept for later cells: the one-hot column names and the passed-through column names.
ohe_feature_names = preprocessor_encoding.named_transformers_['cat'].get_feature_names_out(categorical_features_to_encode)
remaining_features = [col for col in X.columns if col not in categorical_features_to_encode]

# Guard against a silent mismatch between the array and its column labels.
assert len(all_feature_names_after_encoding) == X_encoded.shape[1], (
    "Feature-name count does not match the number of encoded columns"
)

# Convert the processed array back to a DataFrame for better readability.
# NOTE(review): passing boolean and numeric columns through together likely yields an
# object-dtype array; the cast makes every column a proper float (flags become 0.0/1.0).
X_encoded_df = pd.DataFrame(
    X_encoded, columns=all_feature_names_after_encoding, index=X.index
).astype(float)

print("\nFirst 5 rows of X after one-hot encoding of Contract_period and Month_to_end_contract:")
print(X_encoded_df.head().to_markdown(numalign="left", stralign="left"))


First 5 rows of X after one-hot encoding of Contract_period and Month_to_end_contract:
|    | Contract_period_1   | Contract_period_6   | Contract_period_12   | Month_to_end_contract_1.0   | Month_to_end_contract_2.0   | Month_to_end_contract_3.0   | Month_to_end_contract_4.0   | Month_to_end_contract_5.0   | Month_to_end_contract_6.0   | Month_to_end_contract_7.0   | Month_to_end_contract_8.0   | Month_to_end_contract_9.0   | Month_to_end_contract_10.0   | Month_to_end_contract_11.0   | Month_to_end_contract_12.0   | gender   | Near_Location   | Partner   | Promo_friends   | Phone   | Group_visits   | Age   | Avg_additional_charges_total   | Lifetime   | Avg_class_frequency_total   | Avg_class_frequency_current_month   |
|:---|:--------------------|:--------------------|:---------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:-------------

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the churned /
# non-churned ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shape of each partition in a single print call.
shape_report = [
    f"\nShape of training features (X_train): {X_train.shape}",
    f"Shape of testing features (X_test): {X_test.shape}",
    f"Shape of training target (y_train): {y_train.shape}",
    f"Shape of testing target (y_test): {y_test.shape}",
]
print("\n".join(shape_report))


Shape of training features (X_train): (3200, 26)
Shape of testing features (X_test): (800, 26)
Shape of training target (y_train): (3200,)
Shape of testing target (y_test): (800,)


In [17]:
from sklearn.preprocessing import StandardScaler

# Identify the features that need scaling: everything that is neither a one-hot
# indicator column nor one of the 0/1 flag columns listed in `boolean_columns`.
# (Filtering on the one-hot names alone would also standardise the flags, turning
# gender, Phone, etc. into non-binary values.)
columns_not_to_scale = set(ohe_feature_names) | set(boolean_columns)
numerical_features_after_encoding = [
    col for col in all_feature_names_after_encoding if col not in columns_not_to_scale
]

# Initialize the StandardScaler
scaler = StandardScaler()

# Work on explicit copies so the column assignments below modify independent frames
# and do not raise pandas' SettingWithCopyWarning on the split results.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler ONLY on the training data, then apply the same fitted
# transformation to the test set so no test-set statistics leak into training.
X_train[numerical_features_after_encoding] = scaler.fit_transform(X_train[numerical_features_after_encoding])
X_test[numerical_features_after_encoding] = scaler.transform(X_test[numerical_features_after_encoding])

print("\nFirst 5 rows of X_train after scaling (numerical columns only):")
print(X_train.head().to_markdown(numalign="left", stralign="left"))

print("\nDescriptive statistics of X_train after scaling (numerical columns only):")
print(X_train[numerical_features_after_encoding].describe().to_markdown(numalign="left", stralign="left"))


First 5 rows of X_train after scaling (numerical columns only):
|      | Contract_period_1   | Contract_period_6   | Contract_period_12   | Month_to_end_contract_1.0   | Month_to_end_contract_2.0   | Month_to_end_contract_3.0   | Month_to_end_contract_4.0   | Month_to_end_contract_5.0   | Month_to_end_contract_6.0   | Month_to_end_contract_7.0   | Month_to_end_contract_8.0   | Month_to_end_contract_9.0   | Month_to_end_contract_10.0   | Month_to_end_contract_11.0   | Month_to_end_contract_12.0   | gender   | Near_Location   | Partner   | Promo_friends   | Phone    | Group_visits   | Age       | Avg_additional_charges_total   | Lifetime   | Avg_class_frequency_total   | Avg_class_frequency_current_month   |
|:-----|:--------------------|:--------------------|:---------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:----------------------------|:---------------------------

In [19]:
# Summary: the data has been loaded, checked for missing values, type-converted, one-hot encoded, split into stratified train/test sets, and scaled, so it is ready for training a churn-prediction model.