In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Location of the transactions CSV.
# TODO: this is a hardcoded absolute local path; point it at a project-relative
# data directory so the notebook can run on another machine.
DATA_PATH = r'C:\Users\haumr\OneDrive\Desktop\creditcard.csv'

# Load the dataset
raw_data = pd.read_csv(DATA_PATH)

# Split the data into training and testing sets BEFORE scaling.
# The original code fitted StandardScaler on every row and only then split,
# which lets the mean/variance of the test rows leak into the training
# features. The partition itself is the same as before: it is determined by
# the number of rows, the stratification labels and random_state.
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_data,
    raw_data['Class'],
    test_size=0.2,
    random_state=42,
    stratify=raw_data['Class'],
)

# Learn the 'Amount' scaling statistics from the training rows only.
amount_scaler = StandardScaler().fit(raw_train['Amount'].values.reshape(-1, 1))


def normalize_amount(frame):
    """Return a copy of ``frame`` with 'Amount' replaced by 'NormalizedAmount'.

    'NormalizedAmount' is 'Amount' transformed with ``amount_scaler`` (fitted
    on the training rows above). The 'Time' and 'Amount' columns are dropped.
    The input frame is not modified.
    """
    scaled = amount_scaler.transform(frame['Amount'].values.reshape(-1, 1)).ravel()
    return frame.assign(NormalizedAmount=scaled).drop(['Time', 'Amount'], axis=1)


# Drop the 'Time' column and scale the 'Amount' column (train-fitted scaler)
data = normalize_amount(raw_data)

# Separate the features and the target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Feature matrices for the two partitions, scaled with the same scaler
X_train = normalize_amount(raw_train).drop('Class', axis=1)
X_test = normalize_amount(raw_test).drop('Class', axis=1)
from sklearn.utils import resample

# Re-attach the labels to the training features so rows can be filtered by class
train_data = pd.concat((X_train, y_train), axis='columns')
class_labels = train_data['Class']

# Split the training rows into the two classes (1 = fraudulent, 0 = non-fraudulent)
fraudulent = train_data.loc[class_labels == 1]
non_fraudulent = train_data.loc[class_labels == 0]

# Randomly keep only as many non-fraudulent rows as there are fraudulent ones.
# Sampling is without replacement and seeded, so the draw is reproducible.
n_minority = len(fraudulent)
non_fraudulent_undersampled = resample(
    non_fraudulent,
    replace=False,
    n_samples=n_minority,
    random_state=42,
)

# Stack the fraudulent rows on top of the reduced non-fraudulent rows
undersampled_data = pd.concat([fraudulent, non_fraudulent_undersampled], axis=0)

# Balanced training features and labels
X_train_res = undersampled_data.drop(columns='Class')
y_train_res = undersampled_data.loc[:, 'Class']
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Both estimators are trained on the balanced (undersampled) training set.
# `.fit` returns the estimator itself, so construction and training are chained.

# Logistic Regression
log_reg = LogisticRegression(random_state=42).fit(X_train_res, y_train_res)

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)
from sklearn.metrics import precision_score, recall_score, f1_score

def _precision_recall_f1(y_true, y_pred):
    """Return (precision, recall, F1) for the given true and predicted labels."""
    return (
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


# Predict on the (untouched, imbalanced) test set
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf_clf = rf_clf.predict(X_test)

# Metrics for Logistic Regression
precision_log_reg, recall_log_reg, f1_log_reg = _precision_recall_f1(y_test, y_pred_log_reg)

# Metrics for Random Forest Classifier
precision_rf_clf, recall_rf_clf, f1_rf_clf = _precision_recall_f1(y_test, y_pred_rf_clf)

# Report the metrics, one line per model
print(f"Logistic Regression - Precision: {precision_log_reg}, Recall: {recall_log_reg}, F1-Score: {f1_log_reg}")
print(f"Random Forest Classifier - Precision: {precision_rf_clf}, Recall: {recall_rf_clf}, F1-Score: {f1_rf_clf}")


Logistic Regression - Precision: 0.03754693366708386, Recall: 0.9183673469387755, F1-Score: 0.07214428857715433
Random Forest Classifier - Precision: 0.040976058931860036, Recall: 0.9081632653061225, F1-Score: 0.07841409691629955
