NameError: name 'X' is not defined

In [2]:
# %% [markdown]
# # Feature Engineering for Customer Churn Prediction

# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import sys

# Make the project's utils directory importable. The relative '../utils' is
# turned into an absolute path by os.path.abspath, i.e. it is resolved against
# the current working directory at the moment this cell runs.
sys.path.append(os.path.abspath('../utils'))
# Project-local helper; used below to turn the raw frame into (X, y).
from data_preprocessing import preprocess_data

# %%
# Load raw data
# The path is relative, so it is resolved against the working directory the
# notebook is run from. The CSV is read with pandas' default parsing options.
data_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(data_path)

# %%
# Preprocess data using our utility function.
# preprocess_data (imported from the utils directory) returns two objects that
# are unpacked into X and y. The cells below index X by the columns 'tenure',
# 'MonthlyCharges' and 'TotalCharges' and pass y as the stratification target.
# NOTE(review): the FutureWarning captured in this notebook's output quotes
# `df['TotalCharges'].fillna(0, inplace=True)`, a line that does not appear in
# this notebook -- presumably it lives inside preprocess_data. Confirm, and
# switch it to the assignment form (df[col] = df[col].fillna(0)) there.
X, y = preprocess_data(df)

# %%
# Derived spend features, added to X as two new columns.
#   TotalSpend      -- the monthly charge multiplied by tenure.
#   AvgMonthlySpend -- the billed total divided by (tenure + 1). The extra 1
#                      keeps the denominator non-zero when tenure is 0; it
#                      also makes the denominator one larger than tenure for every row.
X['TotalSpend'] = X['tenure'].mul(X['MonthlyCharges'])
X['AvgMonthlySpend'] = X['TotalCharges'].div(X['tenure'].add(1))

# %%
# Feature Engineering - Binning numerical features
# Bucket tenure into five labelled ranges (the labels imply tenure is counted
# in months): [0, 12], (12, 24], (24, 48], (48, 72] and (72, inf).
# include_lowest=True closes the first interval on the left. Without it pd.cut
# builds (0, 12], so a tenure of exactly 0 matches no bin and is silently
# turned into NaN instead of being labelled '0-1yr'.
X['tenure_group'] = pd.cut(
    X['tenure'],
    bins=[0, 12, 24, 48, 72, np.inf],
    labels=['0-1yr', '1-2yr', '2-4yr', '4-6yr', '6+yr'],
    include_lowest=True,
)

# %%
# One-hot encode the tenure buckets. drop_first=True omits the indicator for
# the first bucket, so that bucket is represented by every remaining indicator
# being zero. The indicators are appended to X and the helper 'tenure_group'
# column is removed in the same step.
tenure_dummies = pd.get_dummies(X['tenure_group'], prefix='tenure', drop_first=True)
X = pd.concat([X, tenure_dummies], axis='columns').drop(columns='tenure_group')

# %%
# Hold out 20% of the rows as a test set. Stratifying on y keeps the target's
# class proportions the same in both partitions, and the fixed random_state
# makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# %%
# Standardise the continuous columns. The scaler's statistics are learned from
# the training split only and then applied unchanged to the test split. Columns
# not listed in num_cols (e.g. the dummy indicators) are left as they are.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'TotalSpend', 'AvgMonthlySpend']
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# %%
# Persist the four splits as CSV files under ../data/processed, creating the
# directory first if it does not exist yet. index=False leaves the pandas
# index out of every file, so rows are identified only by their order.
os.makedirs('../data/processed', exist_ok=True)
processed_outputs = {
    '../data/processed/X_train.csv': X_train,
    '../data/processed/X_test.csv': X_test,
    '../data/processed/y_train.csv': y_train,
    '../data/processed/y_test.csv': y_test,
}
for csv_path, split_data in processed_outputs.items():
    split_data.to_csv(csv_path, index=False)

# %%
# Report the shape of each in-memory split as a quick sanity check.
# Note: this prints the objects held in memory; it does not read the CSV
# files back from disk.
print("Processed data shapes:")
for split_label, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_label}: {split_data.shape}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


Processed data shapes:
X_train: (5634, 36)
X_test: (1409, 36)
y_train: (5634,)
y_test: (1409,)
