In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:

# Read the cleaned dataset (path is relative to the notebook) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/Cleaned-dataset.csv')
df.head()

In [None]:
# Pull out the target column, then keep every other column as a feature.
y = df['Churn']
x = df.drop(columns=['Churn'])

In [None]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numeric.
cat_feature = x.select_dtypes(include=['object']).columns
num_features = x.select_dtypes(exclude=['object']).columns
cat_feature

In [None]:
# Sort the categorical columns into two buckets by how many distinct values
# they hold: exactly two -> binary, anything else -> multi-category.
# (`unique()` counts NaN as a value, so missing data affects the bucket.)
binary_category = []
multi_category = []

for feature in cat_feature:
    n_levels = len(x[feature].unique())
    bucket = binary_category if n_levels == 2 else multi_category
    bucket.append(feature)

print('Binary:', binary_category)
print('Multi: ', multi_category)

In [None]:

# Encode the two-valued Yes/No columns as 1/0.
# 'gender' is also two-valued but is skipped here because it is one-hot
# encoded by pd.get_dummies in a later cell.
yes_no_map = {'Yes': 1, 'No': 0}

for col in binary_category:
    if col == 'gender':
        continue  # handled by get_dummies below

    observed = set(x[col].dropna().unique())

    if observed <= {0, 1}:
        # Already encoded (e.g. this cell was run a second time); mapping again
        # would turn every value into NaN.
        continue

    unexpected = observed - set(yes_no_map)
    if unexpected:
        # Series.map silently converts unmapped values to NaN, so fail loudly
        # instead of corrupting the column.
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': "
            f"{sorted(map(str, unexpected))}"
        )

    x[col] = x[col].map(yes_no_map)  # write to x[col], not df[col]

x.head()

In [None]:

# One-hot encode the nominal columns; the first level of each is dropped.
one_hot_columns = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
x = pd.get_dummies(x, columns=one_hot_columns, drop_first=True)

# Cast any boolean columns to integers.
bool_cols = x.select_dtypes(include='bool').columns
x = x.astype({name: int for name in bool_cols})

# List the final feature columns, numbered from 1.
print('Features after encoding:')
for position, name in enumerate(x.columns, start=1):
    print(f'  {position}. {name}')
print(f'\nTotal: {len(x.columns)} features')

In [None]:
# Persist the encoded column names (in order) for the app.
feature_names = x.columns.tolist()
with open('../features.pkl', 'wb') as handle:
    pickle.dump(feature_names, handle)
print(' features.pkl saved:', feature_names)

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
for label, part in (('x_train:', x_train), ('x_test: ', x_test)):
    print(label, part.shape)

In [None]:
# Standardise the numeric columns. The scaler is fit on the training rows only
# and then applied to both splits.
#
# The raw values are read from `x` (which is never scaled) by row label rather
# than from x_train/x_test themselves, so re-running this cell does not scale
# already-scaled data a second time.
# NOTE: this relies on `x` having a unique index (true for the default
# RangeIndex produced by pd.read_csv).
train_raw = x.loc[x_train.index, num_features]
test_raw = x.loc[x_test.index, num_features]

scaler = StandardScaler().fit(train_raw)

# Work on explicit copies so the column assignments below can never write
# through to (or warn about) the frame the splits were taken from.
x_train = x_train.copy()
x_test = x_test.copy()
x_train[num_features] = scaler.transform(train_raw)
x_test[num_features] = scaler.transform(test_raw)

# Save the fitted scaler so the app can scale new inputs the same way.
with open('../scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print(' scaler.pkl saved')

In [None]:
# Sanity check: total count of missing cells and the shape of each split.
train_missing = x_train.isna().sum().sum()
test_missing = x_test.isna().sum().sum()

print('Train NaN:', train_missing)
print('Test NaN: ', test_missing)
print('x_train shape:', x_train.shape)
print('x_test shape: ', x_test.shape)

In [None]:
# Write the encoded feature frame `x` (not the original `df`) to a relative
# path, leaving out the row index.
output_path = '../data/feature-engineering.csv'
x.to_csv(output_path, index=False)

print(' feature-engineering.csv saved')
print('\n Feature Engineering complete!')