In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import joblib
import sys
import os

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

# Import the evaluation function
from src.model_evaluation import evaluate_model

In [2]:
# Step 1: Load the saved model, amount_threshold, preprocessor, and feature_names
model = joblib.load('../models/random_forest_model.joblib')
amount_threshold = joblib.load('../models/amount_threshold.joblib')
preprocessor = joblib.load('../models/preprocessor.joblib')
feature_names = joblib.load('../models/feature_names.joblib')

In [3]:
# Step 2: Load the preprocessed validation data
val_data = pd.read_csv('../data/preprocessed_val.csv')
X_val = val_data.drop('FraudResult', axis=1)
y_val = val_data['FraudResult']

In [6]:
print(val_data['FraudResult'].unique())
print(val_data['FraudResult'].value_counts(dropna=False))
print(val_data.dtypes)

[nan  0.  1.]
FraudResult
NaN    15224
0.0     3904
1.0        5
Name: count, dtype: int64
Amount                                float64
log_amount                            float64
hour                                  float64
day_of_week                           float64
month                                 float64
ProviderId_ProviderId_2               float64
ProviderId_ProviderId_3               float64
ProviderId_ProviderId_4               float64
ProviderId_ProviderId_5               float64
ProviderId_ProviderId_6               float64
ProductId_ProductId_10                float64
ProductId_ProductId_11                float64
ProductId_ProductId_12                float64
ProductId_ProductId_13                float64
ProductId_ProductId_14                float64
ProductId_ProductId_15                float64
ProductId_ProductId_16                float64
ProductId_ProductId_19                float64
ProductId_ProductId_2                 float64
ProductId_ProductId_20             

In [5]:
print(f"X_val.shape: {X_val.shape}")
print(f"y_val.shape: {y_val.shape}")
print(f"y_val has NaNs: {y_val.isnull().sum()}")
print(f"X_val and y_val aligned: {X_val.index.equals(y_val.index)}")

X_val.shape: (19133, 50)
y_val.shape: (19133,)
y_val has NaNs: 15224
X_val and y_val aligned: True


In [4]:
# Step 3: Evaluate the model on the validation split
evaluate_model(model, X_val, y_val, feature_names, output_prefix='validation')

Classification Report (validation):


  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.

In [None]:
# Step 4: Load and preprocess the test data (for prediction)
test_data = pd.read_csv('../data/test.csv')

# Apply the same preprocessing steps as in preprocess_fraud_data (without fitting)
test_data = test_data.drop(columns=['Value'])
drop_cols = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']
test_data = test_data.drop(columns=drop_cols)

# Feature Engineering
test_data['TransactionStartTime'] = pd.to_datetime(test_data['TransactionStartTime'])
test_data['hour'] = test_data['TransactionStartTime'].dt.hour
test_data['day_of_week'] = test_data['TransactionStartTime'].dt.dayofweek
test_data['month'] = test_data['TransactionStartTime'].dt.month
test_data = test_data.drop(columns=['TransactionStartTime'])

test_data['log_amount'] = np.log1p(test_data['Amount'].abs() + 1)
test_data['high_amount_flag'] = (test_data['Amount'] > amount_threshold).astype(int)
test_data['high_fraud_provider'] = test_data['ProviderId'].isin(['ProviderId_1', 'ProviderId_3', 'ProviderId_5']).astype(int)
test_data['high_fraud_channel'] = test_data['ChannelId'].isin(['ChannelId_1', 'ChannelId_3', 'ChannelId_2']).astype(int)
high_fraud_categories = ['transport', 'utility_bill', 'financial_services']
test_data['high_fraud_category'] = test_data['ProductCategory'].isin(high_fraud_categories).astype(int)

# Transform test data using the fitted preprocessor
X_test_processed = preprocessor.transform(test_data)

In [None]:
# Step 5: Generate predictions for the test data (test.csv)
test_predictions = model.predict(X_test_processed)
test_pred_proba = model.predict_proba(X_test_processed)[:, 1]

In [None]:
# Save test predictions
test_results = pd.DataFrame({
    'predicted': test_predictions,
    'fraud_probability': test_pred_proba
})
test_results.to_csv('test_predictions.csv', index=False)
print("Test predictions saved to test_predictions.csv")