#### Preprocess the data for modelling

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

# Make the project root (the parent of the current working directory) importable
# so that the `src` package resolves. `__file__` is not defined in a notebook, and the
# previous expression took dirname of the *string* "__file__", which is '' - so the
# effective path has always been abspath('..').
sys.path.append(os.path.abspath(os.pardir))

## Import the preprocessing function
from src.data_preprocessing import preprocess_data

In [2]:
# Step 1: Load the training data and make sure the FraudResult target is clean
train_data_path = '../data/training.csv'
train_data = pd.read_csv(train_data_path)
print("Number of NaN values in FraudResult before dropping:", train_data['FraudResult'].isna().sum())

# Drop rows with NaN in FraudResult
train_data = train_data.dropna(subset=['FraudResult'])
print("Number of rows after dropping NaN in FraudResult:", len(train_data))

# Ensure FraudResult is numeric. errors='coerce' silently turns any unparseable label
# into NaN, so the count printed below reports how many labels were lost that way.
# .assign builds a new frame instead of writing into the result of dropna().
train_data = train_data.assign(
    FraudResult=pd.to_numeric(train_data['FraudResult'], errors='coerce')
)
print("Number of NaN values in FraudResult after converting to numeric:", train_data['FraudResult'].isna().sum())

# Coercion can introduce NaN labels that the earlier dropna never saw; remove them
# so they cannot reach the saved file or the later steps that reuse train_data.
train_data = train_data.dropna(subset=['FraudResult'])

# Save the cleaned training data
train_data.to_csv('../data/cleaned_training.csv', index=False)

Number of NaN values in FraudResult before dropping: 0
Number of rows after dropping NaN in FraudResult: 95662
Number of NaN values in FraudResult after converting to numeric: 0


In [3]:
# Step 2: Run the project preprocessing pipeline on the cleaned training file.
# The call returns six values: the train/validation features and labels, the
# feature names, and the preprocessor object that is reused on the test set below.
X_train, y_train, X_val, y_val, feature_names, preprocessor = preprocess_data(
    '../data/cleaned_training.csv',
    is_training=True,
)

In [4]:
# Step 3: Compute the amount threshold - the 90th percentile of Amount over the
# cleaned training frame. It drives the high_amount_flag feature on the test data.
# NOTE(review): this uses all of train_data (train + validation rows) - confirm that
# matches the threshold preprocess_data used when the preprocessor was fitted.
amount_threshold = train_data.loc[:, 'Amount'].quantile(q=0.9)

In [5]:
# Step 4: Preprocess the test data
# Rebuild the engineered features by hand (nothing is fitted here), then push the
# frame through the preprocessor obtained from the training step.
test_data_path = '../data/test.csv'
drop_cols = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']
high_fraud_categories = ['transport', 'utility_bill', 'financial_services']

test_data = pd.read_csv(test_data_path)

# Pull out the source columns once; every derived feature is computed from these.
start_time = pd.to_datetime(test_data['TransactionStartTime'])
amount = test_data['Amount']
provider = test_data['ProviderId']
channel = test_data['ChannelId']
category = test_data['ProductCategory']

test_data = (
    test_data
    # Remove Value, the identifier columns and the raw timestamp
    .drop(columns=['Value', *drop_cols, 'TransactionStartTime'])
    # New columns are appended in this order: calendar features first, then amount/flag features
    .assign(
        hour=start_time.dt.hour,
        day_of_week=start_time.dt.dayofweek,
        month=start_time.dt.month,
        # NOTE(review): log1p already adds 1, so this is log(|Amount| + 2). Left as-is
        # because it has to mirror whatever preprocess_data did at fit time - confirm there.
        log_amount=np.log1p(amount.abs() + 1),
        high_amount_flag=(amount > amount_threshold).astype(int),
        high_fraud_provider=provider.isin(['ProviderId_1', 'ProviderId_3', 'ProviderId_5']).astype(int),
        high_fraud_channel=channel.isin(['ChannelId_1', 'ChannelId_3', 'ChannelId_2']).astype(int),
        high_fraud_category=category.isin(high_fraud_categories).astype(int),
    )
)

# Transform test data using the fitted preprocessor (transform only, no refit)
X_test_processed = preprocessor.transform(test_data)



In [6]:
# Step 5: Save the preprocessed data and preprocessing objects
# Save training and validation splits as DataFrames.
#
# Labels are attached positionally via np.asarray(...). Assigning a pandas Series to a
# DataFrame column aligns on index labels, not on row position; the feature frames
# built here get a fresh 0..n-1 index, so a label Series that still carries the
# original (shuffled) row labels from the split would be misaligned or filled with NaN.
# Assumes row i of X_* corresponds to element i of y_* - TODO confirm in preprocess_data.
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_train_df['FraudResult'] = np.asarray(y_train)
X_train_df.to_csv('../data/preprocessed_train.csv', index=False)

# Debug: Check for NaN in y_val before saving
print("Number of NaN values in y_val before saving:", y_val.isna().sum())
X_val_df = pd.DataFrame(X_val, columns=feature_names)
X_val_df['FraudResult'] = np.asarray(y_val)
X_val_df.to_csv('../data/preprocessed_val.csv', index=False)

# Save preprocessed test data (no FraudResult column since test data doesn't have it)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)
X_test_df.to_csv('../data/preprocessed_test.csv', index=False)

Number of NaN values in y_val before saving: 0


In [8]:
# Re-attach the validation labels after putting both objects on the same 0..n-1
# index, then write the file again and read it back as a sanity check.
X_val_df = X_val_df.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Debug: the two must have the same number of rows before the labels are assigned
n_feature_rows = len(X_val_df)
n_label_rows = len(y_val)
print("Length of X_val_df:", n_feature_rows)
print("Length of y_val:", n_label_rows)
if n_feature_rows != n_label_rows:
    raise ValueError("Length mismatch between X_val_df and y_val!")

X_val_df['FraudResult'] = y_val

# Debug: inspect the label column as it sits in the frame
assigned_labels = X_val_df['FraudResult']
print("Number of NaN values in X_val_df['FraudResult'] after assignment:", assigned_labels.isna().sum())
print("Unique values in X_val_df['FraudResult'] after assignment:", assigned_labels.unique())
X_val_df.to_csv('../data/preprocessed_val.csv', index=False)

# Debug: round-trip through disk; dtype=int makes the read fail loudly if any label is missing
val_data_check = pd.read_csv('../data/preprocessed_val.csv', dtype={'FraudResult': int})
reloaded_labels = val_data_check['FraudResult']
print("Number of NaN values in preprocessed_val.csv after saving:", reloaded_labels.isna().sum())
print("Unique values in preprocessed_val.csv['FraudResult'] after loading:", reloaded_labels.unique())

Length of X_val_df: 19133
Length of y_val: 19133
Number of NaN values in X_val_df['FraudResult'] after assignment: 0
Unique values in X_val_df['FraudResult'] after assignment: [0 1]
Number of NaN values in preprocessed_val.csv after saving: 0
Unique values in preprocessed_val.csv['FraudResult'] after loading: [0 1]


In [9]:
# Write the transformed test features to disk. There is no FraudResult column here
# because the test data has no labels.
X_test_df = pd.DataFrame(data=X_test_processed, columns=feature_names)
X_test_df.to_csv(path_or_buf='../data/preprocessed_test.csv', index=False)

In [10]:
# Save the amount_threshold, preprocessor, and feature_names so that later steps
# can reload exactly the objects used here.
# Create the output directory first: the dumps below write straight to these paths
# and would fail with FileNotFoundError if ../models is missing (e.g. on a fresh clone).
os.makedirs('../models', exist_ok=True)

joblib.dump(amount_threshold, '../models/amount_threshold.joblib')
joblib.dump(preprocessor, '../models/preprocessor.joblib')
joblib.dump(feature_names, '../models/feature_names.joblib')

print("Preprocessing completed. Saved preprocessed_train.csv, preprocessed_val.csv, preprocessed_test.csv, amount_threshold, preprocessor, and feature_names.")

Preprocessing completed. Saved preprocessed_train.csv, preprocessed_val.csv, preprocessed_test.csv, amount_threshold, preprocessor, and feature_names.
