In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

In [11]:
# Load the engineered dataset
df = pd.read_csv('mobile_money_features.csv')
print("Dataset loaded. Shape:", df.shape)

Dataset loaded. Shape: (1000, 18)


In [12]:
# Define features and target
features = [
    'Amount', 'Rolling_Count_1h', 'Time_Delta', 'Rapid_Transaction',
    'Avg_Amount', 'Max_Amount', 'Min_Amount', 'Location_Change',
    'Unique_Devices', 'Send_Money_Ratio', 'Hour_of_Day'
]
X = df[features]
y = df['Fraud_Label']

In [13]:
# Encode categorical features (none in this subset, but included for completeness)
# If you want to use 'Location' or 'Transaction_Type', uncomment and encode:
# le = LabelEncoder()
# X['Location'] = le.fit_transform(df['Location'])
# X['Transaction_Type'] = le.fit_transform(df['Transaction_Type'])

In [14]:
# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
# Debug: Check class distribution
print("\nClass distribution in y_train:", np.bincount(y_train))
print("Class distribution in y_test:", np.bincount(y_test))


Class distribution in y_train: [800]
Class distribution in y_test: [200]


In [16]:
# Calculate class weights to handle imbalance (fraud is rare)
scale_pos_weight = (y == 0).sum() / (y == 1).sum()  # Ratio of negative to positive samples

  scale_pos_weight = (y == 0).sum() / (y == 1).sum()  # Ratio of negative to positive samples


In [17]:
# Initialize and train XGBoost model
model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,  # Adjust for imbalance
    use_label_encoder=False,            # Avoid deprecation warning
    eval_metric='logloss',              # Loss function
    random_state=42
)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [18]:
# Predict on test set
y_pred = model.predict(X_test)

In [19]:
# Evaluate the model
unique_classes = np.unique(y_test)
if len(unique_classes) == 1:
    print("\nWarning: Only one class present in y_test. Classification report limited.")
    print(f"Predicted all as class {unique_classes[0]}: {np.mean(y_pred == unique_classes[0]) * 100:.2f}% accuracy")
else:
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Legit', 'Fraud']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Predicted all as class 0: 100.00% accuracy

Confusion Matrix:
[[200]]




In [20]:
# Feature importance
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
              Feature  Importance
0              Amount         0.0
1    Rolling_Count_1h         0.0
2          Time_Delta         0.0
3   Rapid_Transaction         0.0
4          Avg_Amount         0.0
5          Max_Amount         0.0
6          Min_Amount         0.0
7     Location_Change         0.0
8      Unique_Devices         0.0
9    Send_Money_Ratio         0.0
10        Hour_of_Day         0.0


In [21]:
# Save the predictions
df_test = X_test.copy()
df_test['Actual_Label'] = y_test
df_test['Predicted_Label'] = y_pred
df_test.to_csv('fraud_predictions.csv', index=False)
print("Predictions saved as 'fraud_predictions.csv'")

Predictions saved as 'fraud_predictions.csv'


In [22]:
pip install imbalanced-learn




In [23]:
pip install pandas numpy scikit-learn xgboost imbalanced-learn streamlit matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.
