In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset from 'Fraud.csv' (relative path, resolved against the
# kernel's working directory) into the frame used by the rest of the notebook.
df = pd.read_csv('Fraud.csv')


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics of the frame (pandas `describe()` defaults).
df.describe()


In [None]:
# Identify outliers: horizontal boxplots drawn from the frame in wide form.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, orient='h', ax=ax)
ax.set_title('Boxplot of Features')
plt.show()

In [None]:
# Visualize outliers: transaction amount against the origin account's old
# balance, coloured by the isFraud label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='amount',
    y='oldbalanceOrg',
    hue='isFraud',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Scatter Plot of Amount vs Old Balance (Origin) with Fraud Labels')
ax.set_xlabel('Amount')
ax.set_ylabel('Old Balance (Origin)')
ax.legend(title='isFraud', loc='upper right')
plt.show()

In [None]:
# Visualize outliers with a histogram (plus KDE overlay) of every numeric feature.
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

# Size the subplot grid from the number of numeric columns instead of a fixed
# 3x3 layout: a fixed grid raises ValueError as soon as there are more than
# nine numeric columns (e.g. when this cell is re-run after new features have
# been added to df).
n_cols = 3
n_rows = max(1, -(-len(numeric_features) // n_cols))  # ceiling division

# Keep the original 15x10 size for three rows and scale the height otherwise.
plt.figure(figsize=(15, 10 * n_rows / 3))
for i, feature in enumerate(numeric_features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(df[feature], bins=30, kde=True)
    plt.title(f'Histogram of {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Ratio of the transaction amount to the origin account's balance before the
# transaction. The 1e-6 added to the denominator keeps it non-zero when
# oldbalanceOrg is 0.
origin_balance_eps = df['oldbalanceOrg'] + 1e-6
df['ratio_amount_to_oldbalance'] = df['amount'] / origin_balance_eps

In [None]:
# Visualize the new ratio feature, split by the isFraud label.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='isFraud', y='ratio_amount_to_oldbalance', ax=ax)
ax.set_title('Ratio of Amount to Old Balance by Fraud Status')
ax.set_xlabel('isFraud')
ax.set_ylabel('Ratio of Amount to Old Balance')
plt.show()

# Overall distribution of the ratio feature (histogram with KDE overlay).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['ratio_amount_to_oldbalance'], bins=30, kde=True, ax=ax)
ax.set_title('Histogram of Ratio of Amount to Old Balance')
ax.set_xlabel('Ratio of Amount to Old Balance')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# One-hot encode the 'type' column into indicator columns prefixed with 'type_'.
# NOTE(review): the earlier note expected five new columns (CASH-IN, CASH-OUT,
# DEBIT, PAYMENT, TRANSFER), but drop_first=True drops the indicator for the
# first category, so one fewer column than the number of distinct types is
# created. Confirm the exact category labels against the data.
# NOTE: this cell replaces df and removes 'type', so it cannot be re-run
# without reloading the data first.
df = pd.get_dummies(df, columns=['type'], prefix='type', drop_first=True)



In [None]:
# Balance deltas (new balance minus old balance) for each side of the transaction:
#   balance_change_orig = newbalanceOrig - oldbalanceOrg
#   balance_change_dest = newbalanceDest - oldbalanceDest
balance_columns = {
    'balance_change_orig': ('newbalanceOrig', 'oldbalanceOrg'),
    'balance_change_dest': ('newbalanceDest', 'oldbalanceDest'),
}
for delta_col, (new_col, old_col) in balance_columns.items():
    df[delta_col] = df[new_col] - df[old_col]

In [None]:
# Split the 'step' counter into two features: hour_of_day and day_of_week.
# NOTE(review): this treats one step as one hour — confirm against the data dictionary.
elapsed_days, hour_in_day = divmod(df['step'], 24)
df['hour_of_day'] = hour_in_day
df['day_of_week'] = elapsed_days % 7


In [None]:
# Cyclical encoding of hour_of_day (period 24) and day_of_week (period 7):
# each value is mapped to an angle on a circle and stored as a sine/cosine pair.
cyclical_specs = (
    ('hour_of_day', 24, 'hour'),
    ('day_of_week', 7, 'day'),
)
for source_col, period, suffix in cyclical_specs:
    angle = 2 * np.pi * df[source_col] / period
    df[f'sin_{suffix}'] = np.sin(angle)
    df[f'cos_{suffix}'] = np.cos(angle)

In [None]:
# The data dictionary notes that there is no balance information for customers
# who are merchants (nameDest starts with 'M').
# is_merchant_dest marks, as an integer flag, the rows whose nameDest starts
# with 'M', i.e. transactions whose recipient is a merchant.
df['is_merchant_dest'] = df['nameDest'].str.startswith('M').astype(int)


In [None]:
# Remove columns that the engineered features replace, in a single drop call:
#   - oldbalanceOrg / newbalanceOrig: balance_change_orig is kept instead.
#   - nameDest: is_merchant_dest is kept instead.
#   - step, hour_of_day, day_of_week: only the sine/cosine encodings are kept.
redundant_columns = [
    'oldbalanceOrg', 'newbalanceOrig',
    'nameDest',
    'step', 'hour_of_day', 'day_of_week',
]
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# Log transformation (log1p) of the skewed features, overwriting them in place.
# NOTE: running this cell more than once applies the transform repeatedly.
skewed_features = ['amount', 'ratio_amount_to_oldbalance']
df[skewed_features] = np.log1p(df[skewed_features])
    


In [None]:
# Skewness of the two balance-change features before any transformation.
balance_change_orig_values = df['balance_change_orig']
balance_change_dest_values = df['balance_change_dest']

skewness_orig = balance_change_orig_values.skew()
skewness_dest = balance_change_dest_values.skew()

print(f"Skewness of balance_change_orig: {skewness_orig}")
print(f"Skewness of balance_change_dest: {skewness_dest}")

In [None]:
from scipy.stats import boxcox, yeojohnson

# balance_change_dest is a difference of two balances and can be negative.
# A plain np.log1p returns NaN for values below -1 and -inf at exactly -1
# (with RuntimeWarnings), which would put missing/infinite values into the
# feature matrix. Use a signed log instead: sign(x) * log1p(|x|) is defined
# for every real x and is identical to log1p(x) for x >= 0.
dest_change = df['balance_change_dest']
df['balance_change_dest_log'] = np.sign(dest_change) * np.log1p(dest_change.abs())

# Applying Yeo-Johnson transformation to balance_change_orig; the fitted
# lambda returned alongside the transformed values is discarded.
df['balance_change_orig_yeo'], _ = yeojohnson(df['balance_change_orig'])

In [None]:
# Skewness of the transformed balance-change features.
skewness_dest_log = df['balance_change_dest_log'].skew()
skewness_orig_yeo = df['balance_change_orig_yeo'].skew()

print(f"Skewness of balance_change_orig after Yeo-Johnson: {skewness_orig_yeo}")
print(f"Skewness of balance_change_dest after log transformation: {skewness_dest_log}")

In [None]:
# Count how many balance_change_dest values are negative.
print("Checking for negative values in 'balance_change_dest' before transformation:")
is_negative = df['balance_change_dest'] < 0
negative_values = int(is_negative.sum())
print(f"Number of negative values: {negative_values}")

In [None]:
# Yeo-Johnson transform of balance_change_dest. Only the transformed values
# are kept; the fitted lambda (second element of the result) is not stored.
# `yeojohnson` is imported from scipy.stats in an earlier cell.
dest_yeo_result = yeojohnson(df['balance_change_dest'])
df['balance_change_dest_yeo'] = dest_yeo_result[0]

# Report the skewness of the newly transformed feature.
print("Skewness of balance_change_dest after Yeo-Johnson transformation:")
print(df['balance_change_dest_yeo'].skew())

In [None]:
# List the columns currently in the frame after the feature engineering above.
df.columns

In [None]:
# Drop the origin account name, the raw destination balances and the
# isFlaggedFraud column in place, so they are not part of the feature matrix
# built in the next cell.
# NOTE: in-place drop — re-running this cell raises because the columns are gone.
df.drop(columns=['nameOrig', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud'], inplace=True)

In [None]:
# Separate the target (isFraud) from the feature columns.
target_column = 'isFraud'
y = df[target_column]
X = df.drop(columns=[target_column])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:

from imblearn.over_sampling import SMOTE

# Prepare the training features for SMOTE: turn +/-inf into NaN, then fill
# every NaN with the mean of its column (computed on the training split).
X_train_clean = X_train.replace(to_replace=[np.inf, -np.inf], value=np.nan)
train_column_means = X_train_clean.mean()
X_train_imputed = X_train_clean.fillna(train_column_means)

# Oversample the training split only, with a fixed seed.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_imputed,
    y_train,
)



In [None]:

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc



In [None]:

# Step 1: Model Training
# Build the XGBoost classifier. Per the original author's note,
# use_label_encoder=False and eval_metric="logloss" are passed to avoid a
# library warning.
# NOTE(review): confirm that the installed XGBoost version still accepts
# use_label_encoder.
xgb_params = {
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_params)

# Train on the SMOTE-oversampled training data.
print("Training the XGBoost model...")
xgb_model.fit(X_train_resampled, y_train_resampled)
print("Model training complete.")


In [None]:

# Step 2: Model Evaluation on the original (non-resampled) test set
# The training features were cleaned before fitting (+/-inf -> NaN, NaN ->
# training-column mean). Apply the same cleaning to X_test, using the
# training means so no test-set statistics are used, so the model is scored
# on inputs prepared the same way as the ones it was trained on.
train_feature_means = X_train.replace([np.inf, -np.inf], np.nan).mean()
X_test_imputed = X_test.replace([np.inf, -np.inf], np.nan).fillna(train_feature_means)

# Hard class predictions and the predicted probability of the positive class.
y_pred = xgb_model.predict(X_test_imputed)
y_pred_proba = xgb_model.predict_proba(X_test_imputed)[:, 1]

# Calculate and print evaluation metrics
print("\nModel Performance on Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))


In [None]:

# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.
class_labels = ['Not Fraud', 'Fraud']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:

# ROC curve and AUC computed from the predicted positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()


In [None]:

# Step 3: Feature Importance
# Read the fitted model's importances and pair them with the training column names.
feature_names = X_train_resampled.columns
feature_importance = xgb_model.feature_importances_

# Ascending order, so the largest importance is the last bar drawn by barh.
sorted_idx = feature_importance.argsort()

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_names[sorted_idx], feature_importance[sorted_idx])
ax.set_xlabel("Feature Importance")
ax.set_title("XGBoost Feature Importance")
plt.show()

# The same information as a table, most important feature first.
importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance,
})
important_features = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop 10 most important features:")
print(important_features.head(10))