In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
from geopy.distance import geodesic


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/hrishikeshgawde/Desktop/Project to push on Github/Project_Credit_Card_Fraud_Transactions_Detection_Data_Pipeline/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <636BF463-1886-392D-B8B3-6011C44DCEE9> /Users/hrishikeshgawde/Desktop/Project to push on Github/Project_Credit_Card_Fraud_Transactions_Detection_Data_Pipeline/venv/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [None]:
# Read the training and test splits from CSV files in the working directory
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)


In [None]:
def preprocess_data(df):
    """Derive the model features and the label from a transactions frame.

    Note: ``df`` is modified **in place**. ``trans_date_trans_time`` is
    converted to datetime and the columns ``hour``, ``day``, ``month``,
    ``dayofweek`` and ``distance`` are added to it, so they remain available
    on the caller's frame after this function returns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``trans_date_trans_time``, ``lat``, ``long``,
        ``merch_lat``, ``merch_long``, ``amt``, ``city_pop`` and ``is_fraud``.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``amt``, ``hour``, ``day``, ``month``, ``dayofweek``,
        ``distance`` (kilometres) and ``city_pop``.
    y : pandas.Series
        The ``is_fraud`` column.
    """
    # Convert transaction time to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Extract time-based features
    timestamps = df['trans_date_trans_time'].dt
    df['hour'] = timestamps.hour
    df['day'] = timestamps.day
    df['month'] = timestamps.month
    df['dayofweek'] = timestamps.dayofweek

    # Calculate distance between customer and merchant.
    # Walk the four coordinate columns in lockstep instead of using
    # DataFrame.apply(axis=1): the row-wise apply materialises a Series for
    # every row only to read four scalars, which is needlessly slow on large
    # frames. The computed values are the same.
    coordinate_rows = zip(df['lat'], df['long'], df['merch_lat'], df['merch_long'])
    df['distance'] = pd.Series(
        [
            geodesic((cust_lat, cust_long), (merch_lat, merch_long)).km
            for cust_lat, cust_long, merch_lat, merch_long in coordinate_rows
        ],
        index=df.index,
        dtype='float64',  # keeps a numeric dtype even when df has no rows
    )

    # Select relevant features
    features = ['amt', 'hour', 'day', 'month', 'dayofweek', 'distance', 'city_pop']
    X = df[features]
    y = df['is_fraud']
    return X, y

# Build (features, labels) for each split; the training frame is processed first
(X_train, y_train), (X_test, y_test) = (
    preprocess_data(train_df),
    preprocess_data(test_df),
)

In [2]:
# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

NameError: name 'StandardScaler' is not defined

In [1]:
# Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Hard class labels feed the confusion matrix and the classification report.
y_pred_lr = log_reg.predict(X_test)
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be given a continuous score (probability of the positive class).
# Passing the 0/1 predictions would evaluate a single threshold only.
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]

print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba_lr))

NameError: name 'LogisticRegression' is not defined

In [None]:
# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Hard class labels feed the confusion matrix and the classification report.
y_pred_rf = rf_model.predict(X_test)
# ROC AUC needs a continuous score per sample (probability of the positive
# class); scoring the 0/1 predictions would evaluate a single threshold only.
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest Results:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba_rf))


In [None]:
# Fit a gradient-boosted tree classifier on the training split
xgb_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.1,
    "random_state": 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)


In [None]:
# Predictions
# Predicted class labels from the fitted XGBoost classifier for the test features;
# the evaluation cell compares them against y_test.
y_pred = xgb_model.predict(X_test)


In [None]:
# Model Evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ROC AUC needs a continuous score per sample (probability of the positive
# class); scoring the 0/1 predictions would evaluate a single threshold only.
y_proba = xgb_model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


In [None]:
# Class balance: number of training transactions per fraud label
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, x='is_fraud', ax=ax)
ax.set_title('Fraud vs Non-Fraud Transaction Count')
ax.set_xlabel('Fraud Status')
ax.set_ylabel('Count')
plt.show()



In [None]:
# Transaction amount per fraud label; the y-axis is capped at 5000 so the
# boxes stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df, x='is_fraud', y='amt', ax=ax)
ax.set_ylim(0, 5000)
ax.set_title('Transaction Amount Distribution by Fraud Status')
ax.set_xlabel('Fraud Status')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Share of fraudulent transactions for each hour of the day.
# The 'hour' column is added to train_df by preprocess_data, so this cell must
# run after the preprocessing cell.
plt.figure(figsize=(10, 6))
sns.barplot(x=train_df['hour'], y=train_df['is_fraud'])
# barplot aggregates y with the mean by default, so each bar is the mean of
# is_fraud for that hour (a rate), not a count; label it accordingly.
plt.title('Fraud Rate by Hour of the Day')
plt.xlabel('Hour')
plt.ylabel('Fraud Rate')
plt.show()
