In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(
    df,
    *,
    target_col="class",
    numeric_cols=("purchase_value", "age", "time_since_signup", "time_diff_seconds"),
    categorical_cols=("source", "browser", "sex"),
    test_size=0.2,
    random_state=42,
):
    """Split, scale, one-hot encode and SMOTE-balance a labelled DataFrame.

    Steps, in order:

    1. Stratified train/test split (done first so no test information leaks
       into any fitted transformer).
    2. Standard-scale ``numeric_cols`` with statistics fit on the training
       split only.
    3. One-hot encode ``categorical_cols`` (first level dropped) and align the
       test columns to the training columns.
    4. Oversample the minority class with SMOTE on the training split only.

    Encoding and scaling happen *before* SMOTE because SMOTE interpolates
    between feature vectors and therefore needs purely numeric input on a
    comparable scale.

    Args:
        df: DataFrame containing the feature columns and ``target_col``.
        target_col: Name of the label column.
        numeric_cols: Columns to standard-scale.
        categorical_cols: Columns to one-hot encode.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed used for both the split and SMOTE.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the training pair
        has been resampled by SMOTE and the test pair is left at its original
        class distribution.
    """
    # DataFrame column selection needs a list; a tuple would be read as a
    # single (multi-level) key.
    numeric_cols = list(numeric_cols)
    categorical_cols = list(categorical_cols)

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split into train/test first (important to avoid leakage)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Work on explicit copies so the column assignments below neither touch
    # the caller's frame nor trigger SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale numeric columns; statistics come from the real training rows only.
    scaler = StandardScaler()
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # One-hot encode categorical features. dtype=float keeps every column the
    # same numeric dtype for the resampler.
    X_train_encoded = pd.get_dummies(
        X_train, columns=categorical_cols, drop_first=True, dtype=float
    )
    X_test_encoded = pd.get_dummies(
        X_test, columns=categorical_cols, drop_first=True, dtype=float
    )

    # Align columns of train and test (some categories might be missing in test)
    X_test_encoded = X_test_encoded.reindex(
        columns=X_train_encoded.columns, fill_value=0
    )

    # Handle class imbalance ONLY on training data, now that it is all numeric.
    # NOTE(review): SMOTE will produce fractional values in the one-hot
    # columns of synthetic rows; consider imblearn's SMOTENC if strictly
    # binary dummies are required.
    print("Original class distribution in train:", y_train.value_counts())
    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(
        X_train_encoded, y_train
    )
    print("Resampled class distribution in train:", y_train_resampled.value_counts())

    return X_train_resampled, X_test_encoded, y_train_resampled, y_test

# Example usage after feature engineering:
# X_train, X_test, y_train, y_test = preprocess_data(fraud_df)


ModuleNotFoundError: No module named 'imblearn'