In [10]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix

In [19]:

# Load model, scaler, and preprocessed test data
model = joblib.load('logistic_model.pkl')
scaler = joblib.load('scaler.pkl')
X_test = joblib.load('X_test.pkl')
y_test = joblib.load('y_test.pkl')

# Predict probabilities
probas = model.predict_proba(X_test)[:, 1]

# Set threshold
threshold = 0.65
predicted_classes = (probas > threshold).astype(int)



In [21]:

# Create a DataFrame for display
output = pd.DataFrame({
    'Actual_Fraud': y_test,
    'Predicted_Fraud': predicted_classes,
    'Fraud_Probability': probas
})

# Print predictions
print("\n📄 Sample Predictions:")
print(output.head())




📄 Sample Predictions:
   Actual_Fraud  Predicted_Fraud  Fraud_Probability
0             0                0           0.247768
1             0                0           0.405724
2             0                0           0.141565
3             0                0           0.502077
4             0                0           0.202949


In [22]:
# Save predictions to CSV
output.to_csv("predicted_test.csv", index=False)
print("\n✅ Predictions saved to 'predicted_test_set.csv'")




✅ Predictions saved to 'predicted_test_set.csv'


In [23]:
# Evaluate model performance on test set
acc = accuracy_score(y_test, predicted_classes)
prec = precision_score(y_test, predicted_classes)
rec = recall_score(y_test, predicted_classes)
f1 = f1_score(y_test, predicted_classes)

print("\n📊 Model Evaluation:")
print(f"  Accuracy : {acc:.4f}")
print(f"  Precision: {prec:.4f}")
print(f"  Recall   : {rec:.4f}")
print(f"  F1 Score : {f1:.4f}")


📊 Model Evaluation:
  Accuracy : 0.9535
  Precision: 0.0412
  Recall   : 0.4965
  F1 Score : 0.0761


In [1]:
# === 3. predict.ipynb ===

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score




In [2]:
# Load scaler and preprocessed test data
scaler = joblib.load('scaler.pkl')
X_test = joblib.load('X_test.pkl')
y_test = joblib.load('y_test.pkl')

In [3]:
# Load individual models
logistic = joblib.load('logistic_model.pkl')
xgb = joblib.load('xgb_model.pkl')

# Combine models into a soft voting ensemble
ensemble_model = VotingClassifier(estimators=[
    ('lr', logistic),
    ('xgb', xgb)
], voting='soft')

# Fit ensemble model on training-like data (if needed)
# Since VotingClassifier needs to fit the meta-model, we re-use X_test just for demo purposes
# Ideally, use a validation set here instead
ensemble_model.fit(X_test, y_test)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [4]:
# Save the ensemble model
joblib.dump(ensemble_model, 'ensemble_model.pkl')

['ensemble_model.pkl']

In [5]:
# Predict probabilities and apply threshold
probas = ensemble_model.predict_proba(X_test)[:, 1]
threshold = 0.6
predicted_classes = (probas > threshold).astype(int)

# Create a DataFrame for display
output = pd.DataFrame({
    'Actual_Fraud': y_test,
    'Predicted_Fraud': predicted_classes,
    'Fraud_Probability': probas
})


In [6]:

# Print predictions
print("\n📄 Sample Predictions:")
print(output.head())

# Save predictions to CSV
output.to_csv("predicted_test_set.csv", index=False)
print("\n✅ Predictions saved to 'predicted_test_set.csv'")



📄 Sample Predictions:
   Actual_Fraud  Predicted_Fraud  Fraud_Probability
0             0                0           0.000173
1             0                0           0.000202
2             0                0           0.294046
3             0                0           0.347567
4             0                0           0.001147

✅ Predictions saved to 'predicted_test_set.csv'


In [7]:

# Evaluate model performance on test set
acc = accuracy_score(y_test, predicted_classes)
prec = precision_score(y_test, predicted_classes)
rec = recall_score(y_test, predicted_classes)
f1 = f1_score(y_test, predicted_classes)

print("\n📊 Model Evaluation:")
print(f"  Accuracy : {acc:.4f}")
print(f"  Precision: {prec:.4f}")
print(f"  Recall   : {rec:.4f}")
print(f"  F1 Score : {f1:.4f}")


📊 Model Evaluation:
  Accuracy : 0.9971
  Precision: 0.5765
  Recall   : 0.9077
  F1 Score : 0.7052


In [11]:
# Confusion matrix
cm = confusion_matrix(y_test, predicted_classes)
print("\n📉 Confusion Matrix:")
print(cm)


📉 Confusion Matrix:
[[552144   1430]
 [   198   1947]]


In [14]:
# Filter for True Positives: Actual = 1 and Predicted = 1
true_frauds = output[(output['Actual_Fraud'] == 1) & (output['Predicted_Fraud'] == 1)]

# Print all matching rows
print("\n🔥 All Confirmed Fraud Cases (Actual = Predicted = 1):")
print(true_frauds)



🔥 All Confirmed Fraud Cases (Actual = Predicted = 1):
        Actual_Fraud  Predicted_Fraud  Fraud_Probability
1767               1                1           0.952024
1781               1                1           0.933400
1784               1                1           0.960001
1857               1                1           0.880628
1906               1                1           0.962429
...              ...              ...                ...
517197             1                1           0.944411
517274             1                1           0.939008
517341             1                1           0.974178
517529             1                1           0.983722
517571             1                1           0.965712

[1947 rows x 3 columns]


In [17]:
# Manually access the specific row
print("\n Fraud Case :")
print(output.loc[1767])



 Fraud Case :
Actual_Fraud         1.000000
Predicted_Fraud      1.000000
Fraud_Probability    0.952024
Name: 1767, dtype: float64


In [18]:
# Manually access the specific row
print("\n Fraud Case :")
print(output.loc[1095])



 Fraud Case :
Actual_Fraud         0.000000
Predicted_Fraud      0.000000
Fraud_Probability    0.211552
Name: 1095, dtype: float64
