In [2]:
# 03_data_preprocessing_and_feature_engineering.ipynb

import pandas as pd
import numpy as np

# Load the raw splits from the working directory.
# Only the training split has rows with missing values removed; the test
# split keeps every row. The CustomerID column is discarded from both.
train = pd.read_csv("train.csv").dropna().drop(columns='CustomerID')
test = pd.read_csv("test.csv").drop(columns='CustomerID')

# ----------------- Encode Basic Columns -----------------
# Label of the level that `drop_first=True` removes from each one-hot encoded
# column and that is restored afterwards as an explicit indicator.
# NOTE(review): assumes 'Basic' and 'Annual' are the first (dropped) levels of
# their columns -- confirm against the raw data.
_BASELINE_LABELS = {'Subscription Type': 'Basic', 'Contract Length': 'Annual'}


def _encode_categoricals(df):
    """Encode Gender, Subscription Type and Contract Length as integers.

    Gender is mapped in place on ``df`` ('Male' -> 0, 'Female' -> 1; any
    other value becomes NaN). The two multi-level columns are one-hot encoded
    with ``drop_first=True`` and the dropped level is then appended as its
    own 0/1 column, named via ``_BASELINE_LABELS``.

    Args:
        df: Frame holding 'Gender', 'Subscription Type' and
            'Contract Length' columns.

    Returns:
        A new frame in which the two multi-level columns are replaced by 0/1
        indicator columns.
    """
    df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

    # Remember which rows actually carry a category. A missing value encodes
    # as all zeros and must not be mistaken for the dropped baseline level.
    present = {column: df[column].notna() for column in _BASELINE_LABELS}

    # dtype=int keeps every indicator as 0/1; the default became bool in
    # pandas 2.0, which would mix True/False with the 0/1 baseline columns.
    df = pd.get_dummies(
        df,
        columns=list(_BASELINE_LABELS),
        drop_first=True,
        dtype=int,
    )

    # ----------------- Add missing indicator dummies -----------------
    for column, label in _BASELINE_LABELS.items():
        siblings = df.filter(like=column + '_')
        is_baseline = (siblings.sum(axis=1) == 0) & present[column]
        df[column + '_' + label] = is_baseline.astype(int)

    return df


train = _encode_categoricals(train)
test = _encode_categoricals(test)

## ----------------- Feature Engineering -----------------
# Ratio features and an age band are added in place to both splits, so the
# loop can work on the module-level frames directly.
for df in (train, test):
    # Zero denominators are replaced by 1 so the ratios stay finite.
    tenure = df['Tenure'].replace(0, 1)
    usage = df['Usage Frequency'].replace(0, 1)

    df['Avg_Monthly_Spend'] = df['Total Spend'] / tenure
    df['Support_Intensity'] = df['Support Calls'] / usage
    df['Recency_Tenure_Ratio'] = df['Last Interaction'] / tenure

    # Create Age_Group. Bins are right-closed: (17, 25], (25, 35], ...
    # The last bin is open-ended so that ages above 100 still fall into
    # '55+' instead of becoming NaN.
    # NOTE(review): ages of 17 or below (and missing ages) still yield NaN,
    # which encodes as all zeros below -- the same as the dropped '18-25'
    # baseline. Confirm such rows cannot occur.
    df['Age_Group'] = pd.cut(
        df['Age'],
        bins=[17, 25, 35, 45, 55, float('inf')],
        labels=['18-25', '26-35', '36-45', '46-55', '55+']
    )

# get_dummies returns a new frame, so the result has to be rebound to the
# module-level names rather than to the loop variable.
# dtype=int keeps the indicators as 0/1 (the default is bool since pandas 2.0).
train = pd.get_dummies(train, columns=['Age_Group'], drop_first=True, dtype=int)
test = pd.get_dummies(test, columns=['Age_Group'], drop_first=True, dtype=int)

# Write both processed splits to CSV in the working directory, without the
# index column.
for _frame, _path in ((train, "train_processed.csv"), (test, "test_processed.csv")):
    _frame.to_csv(_path, index=False)
