In [1]:
import numpy as np
import pandas as pd 

In [2]:
np.random.seed(42)

# 1000 samples
n_samples = 1000

# Features
age = np.random.randint(18, 60, n_samples)
salary = np.random.randint(20000, 150000, n_samples)

# Imbalanced target
# 0 = Normal (950)
# 1 = Fraud (50)
target = np.array([0]*950 + [1]*50)

# Shuffle
np.random.shuffle(target)

data = pd.DataFrame({
    "Age": age,
    "Salary": salary,
    "Fraud": target
})

data.head()


Unnamed: 0,Age,Salary,Fraud
0,56,146108,0
1,46,34382,0
2,32,104291,0
3,25,121195,0
4,38,23756,0


In [4]:
data["Fraud"].value_counts()

Fraud
0    950
1     50
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
X = data[["Age", "Salary"]]
y = data["Fraud"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)


0.95

In [7]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97       190
           1       0.00      0.00      0.00        10

    accuracy                           0.95       200
   macro avg       0.47      0.50      0.49       200
weighted avg       0.90      0.95      0.93       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
model = LogisticRegression(class_weight="balanced")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.45      0.61       190
           1       0.05      0.50      0.08        10

    accuracy                           0.46       200
   macro avg       0.50      0.48      0.35       200
weighted avg       0.90      0.46      0.59       200



SMOTE (BEST & MOST USED ðŸ”¥)

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

y_train_sm.value_counts()


Fraud
0    760
1    760
Name: count, dtype: int64

In [12]:
model.fit(X_train_sm, y_train_sm)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.51      0.66       190
           1       0.06      0.60      0.11        10

    accuracy                           0.51       200
   macro avg       0.51      0.55      0.39       200
weighted avg       0.92      0.51      0.63       200

