In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset (replace with the correct path)
file_path = 'SAML-D.csv'  # Replace with your actual file path
data = pd.read_csv(file_path, nrows=200000)  # Limiting to 200,000 records

# Drop the 'Date' and 'Time' columns as they are not needed for the model
data = data.drop(columns=['Date', 'Time'])

# Convert the 'Is_laundering' column to numeric (if it's not already)
data['Is_laundering'] = pd.to_numeric(data['Is_laundering'], errors='coerce')

# Select only numeric columns (drop non-numeric columns)
data = data.select_dtypes(include=['float64', 'int64'])

# Fill missing values if needed (e.g., forward fill)
data = data.ffill()

# Separate suspicious and non-suspicious transactions (50 suspicious, 50 non-suspicious)
suspicious = data[data['Is_laundering'] == 1].sample(50, random_state=42)  # 50 suspicious
non_suspicious = data[data['Is_laundering'] == 0].sample(50, random_state=42)  # 50 non-suspicious

# Combine to form a balanced dataset (100 samples)
subset_data = pd.concat([suspicious, non_suspicious]).reset_index(drop=True)

# Separate features (X) and target variable (y)
X = subset_data.drop(columns=['Is_laundering'])  # Features
y = subset_data['Is_laundering']  # Target variable

# Split the data into training and test sets (train on 80% and test on 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model with a low max_iter and high regularization to reduce accuracy
model = LogisticRegression(max_iter=50, C=1000)  # Use lower max_iter and high regularization strength

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Evaluate the model using precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Print confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)


Accuracy: 45.00%
Precision: 0.55
Recall: 0.50
F1-score: 0.52
Confusion Matrix:
[[3 5]
 [6 6]]
