DecisionTree With the Numerical dataset.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def preprocess_data(data):
    # Define categorical columns
    categorical_cols = ["type"]
    numerical_cols = [col for col in data.columns if col not in categorical_cols + ["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"]]
    
    # Preprocessing for numerical data
    numerical_transformer = StandardScaler()

    # Preprocessing for categorical data
    categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
    
    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Preprocessing pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    data_preprocessed = pipeline.fit_transform(data.drop(['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1))
    return data_preprocessed, pipeline

def train_model(X_train, y_train):
    param_grid = {
        'max_depth': [4, 8, 12],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)  # Use all threads
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

def main():
    data = pd.read_csv("archive/PS_20174392719_1491204439457_log.csv")

    # Check for missing values (if applicable)
    print(data.isnull().sum())

    data_preprocessed, pipeline = preprocess_data(data.copy())  # Avoid modifying original data

    # Handle class imbalance
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(data_preprocessed, data["isFraud"])

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    # Train the model with hyperparameter tuning
    model = train_model(X_train, y_train)

    # Evaluate the model
    accuracy, precision, recall, f1 = evaluate_model(model, X_test, y_test)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    # Confusion matrix for isFraud
    y_pred = model.predict(X_test)
    isfraud_cm = confusion_matrix(y_test, y_pred)
    print("\nisFraud Confusion Matrix:")
    print(isfraud_cm)

    # Count number of fraud activities
    fraud_count = data["isFraud"].sum()
    print(f"\nNumber of Fraud Activities: {fraud_count}")

    # Confusion matrix for isFlaggedFraud (assuming a threshold of 200,000 for flagging)
    data["isFlaggedFraud"] = (data["amount"] > 200000).astype(int)
    
    # Preprocess the data for isFlaggedFraud using the same pipeline
    data_flagged_preprocessed = pipeline.transform(data.drop(["isFlaggedFraud", "nameOrig", "nameDest"], axis=1))
    
    flaggedfraud_cm = confusion_matrix(data["isFlaggedFraud"], model.predict(data_flagged_preprocessed))
    print("\nisFlaggedFraud Confusion Matrix:")
    print(flaggedfraud_cm)

if __name__ == "__main__":
    main()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
Accuracy: 0.9970
Precision: 0.9960
Recall: 0.9981
F1-score: 0.9971

isFraud Confusion Matrix:
[[1265735    5102]
 [   2403 1268523]]

Number of Fraud Activities: 8213

isFlaggedFraud Confusion Matrix:
[[4668498   20552]
 [1661560   12010]]
