In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
try:
    data = pd.read_csv('fraudData.csv')
except FileNotFoundError:
    print("Error: 'fraudData.csv' not found. Please ensure the file exists in the current directory.")
    exit()

# Inspect missing values
print(data.isnull().sum())

print(data.columns)

# Preprocess the data
le = LabelEncoder()
if 'Type' in data.columns:
    data['Type'] = le.fit_transform(data['Type'])
else:
    print("Warning: 'Type' column not found in the dataset. Skipping label encoding.")

if 'Amount' not in data.columns:
    print("Error: 'Amount' column not found in the dataset. Please check your CSV file.")
    exit()
else:
    data['Amount_Category'] = pd.cut(data['Amount'], bins=[0, 100, 500, 1000, float('inf')],
                                     labels=['Low', 'Medium', 'High', 'Very High'], right=False)
    data['Amount_Category'] = le.fit_transform(data['Amount_Category'])

# Prepare features (X) and target (y)

target_column = 'isFraud'
if target_column not in data.columns:
    target_column = 'Is Fraud'
    if target_column not in data.columns:
        target_column = 'fraud'
        if target_column not in data.columns:
            print(f"Error: Neither 'isFraud', 'Is Fraud' nor 'fraud' columns found in the dataset. Please check your CSV file and ensure the target variable column is present.")
            exit()
        else:
            print(f"Warning: Using column '{target_column}' as target variable. Please update the code to use 'isFraud' if that's the intended target column.")
    else:
        print(f"Warning: Using column '{target_column}' as target variable. Please update the code to use 'isFraud' if that's the intended target column.")

X = data.drop(target_column, axis=1)
y = data[target_column]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply Label Encoding to all categorical columns
for col in categorical_cols:
    if col in X_train.columns:

        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
    else:
        print(f"Warning: Categorical column '{col}' not found in X_train. Skipping label encoding for this column.")

# Train a Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


print("\nRecommendations for improvement:")
print("- Explore other classification models like Random Forest or Gradient Boosting.")
print("- Perform more advanced feature engineering, considering transaction time, location, and user behavior.")
print("- Use hyperparameter tuning to optimize the Decision Tree model.")
print("- Address class imbalance issues if present in the dataset.")
print("- Consider incorporating external data sources (e.g., fraud databases) for improved accuracy.")

step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64
Index(['step', 'customer', 'age', 'gender', 'zipcodeOri', 'merchant',
       'zipMerchant', 'category', 'amount', 'fraud'],
      dtype='object')
Error: 'Amount' column not found in the dataset. Please check your CSV file.
Precision: 0.7554157931516422
Recall: 0.76287932251235
F1-score: 0.7591292134831461

Recommendations for improvement:
- Explore other classification models like Random Forest or Gradient Boosting.
- Perform more advanced feature engineering, considering transaction time, location, and user behavior.
- Use hyperparameter tuning to optimize the Decision Tree model.
- Address class imbalance issues if present in the dataset.
- Consider incorporating external data sources (e.g., fraud databases) for improved accuracy.
