#### Preprocess the data for modelling

In [4]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Add project root to sys.path so that the `src` package can be imported below.
# NOTE: "__file__" is deliberately a quoted string literal, not the __file__
# variable. os.path.dirname() of that literal is '' (it has no directory part),
# so the expression resolves to the parent of the current working directory.
# This therefore assumes the working directory is one level below the project
# root -- TODO confirm for how this notebook is launched.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

## Import the preprocessing function
from src.data_preprocessing import preprocess_data

In [5]:
# Step 1: Run the project preprocessing function on the training CSV.
# It hands back six values: the train/test feature and label splits, the
# feature names, and the preprocessor object that is reused on the test file.
train_data_path = '../data/training.csv'
X_train, y_train, X_test, y_test, feature_names, preprocessor = preprocess_data(
    train_data_path,
    is_training=True,
)

In [6]:
# Step 2: Load the test CSV (used for prediction), rebuild the engineered
# features by hand, and push the result through the preprocessor fitted in Step 1.
# NOTE(review): these steps are meant to mirror what the training preprocessing
# function does internally (not visible from this notebook) -- keep them in sync.
test_data = pd.read_csv('../data/test.csv')

# Discard 'Value' together with the identifier columns in one pass.
drop_cols = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']
test_data = test_data.drop(columns=['Value', *drop_cols])

# Feature Engineering: calendar features derived from the transaction timestamp.
# The raw timestamp column is removed first; the derived columns are appended after.
start_times = pd.to_datetime(test_data['TransactionStartTime'])
test_data = test_data.drop(columns=['TransactionStartTime'])
test_data['hour'] = start_times.dt.hour
test_data['day_of_week'] = start_times.dt.dayofweek
test_data['month'] = start_times.dt.month

# Amount-based features.
# NOTE(review): log1p already adds 1, so this evaluates log(|Amount| + 2);
# confirm the training pipeline uses the identical expression before changing it.
amounts = test_data['Amount']
test_data['log_amount'] = np.log1p(amounts.abs() + 1)
# NOTE(review): the 90th-percentile cut-off is taken from the test file itself;
# verify whether the threshold computed on the training data should be reused.
amount_threshold = amounts.quantile(0.9)
test_data['high_amount_flag'] = (amounts > amount_threshold).astype(int)

# Binary membership flags: 1 when the source column's value is in the listed set.
high_fraud_categories = ['transport', 'utility_bill', 'financial_services']
flag_definitions = {
    'high_fraud_provider': ('ProviderId', ['ProviderId_1', 'ProviderId_3', 'ProviderId_5']),
    'high_fraud_channel': ('ChannelId', ['ChannelId_1', 'ChannelId_3', 'ChannelId_2']),
    'high_fraud_category': ('ProductCategory', high_fraud_categories),
}
for flag_column, (source_column, flagged_values) in flag_definitions.items():
    test_data[flag_column] = test_data[source_column].isin(flagged_values).astype(int)

# Transform test data using the fitted preprocessor (transform only, no refit).
X_test_processed = preprocessor.transform(test_data)



In [7]:
# Sanity-check the shapes of the training features/labels and of X_test.
# X_test here is the split returned by preprocess_data in Step 1, not the
# X_test_processed array built from test.csv in Step 2.
train_shapes = (X_train.shape, y_train.shape)
print(*train_shapes)
print(X_test.shape)

(152750, 50) (152750,)
(19133, 50)
