In [8]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ------------------------------------------------------
# 1. Load Data
# ------------------------------------------------------
# Read the whole table into memory. The path is relative to the current
# working directory, so point it at wherever the CSV actually lives.
df = pd.read_csv("creditcard.csv")
print("Initial shape:", df.shape)

# ------------------------------------------------------
# 2. Split features/target
# ------------------------------------------------------
# "Class" is the label column; every other column is used as a model input.
y = df["Class"]
X = df.drop(columns="Class")

# ------------------------------------------------------
# 3. Identify numeric + categorical columns
# ------------------------------------------------------
# Pick columns by dtype *family* instead of by exact dtype. Matching only
# int64/float64 and object would leave e.g. int32, float32 or pandas
# "category" columns in neither list, and a column in neither list is not
# routed to any transformer of the ColumnTransformer built from these lists
# (its default `remainder` is "drop"), so it would silently vanish from the
# model inputs.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(
    include=["object", "category"]
).columns.tolist()

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)

# ------------------------------------------------------
# 4. Preprocessing pipelines
# ------------------------------------------------------
# Numeric branch: fill missing values with the column median, then apply
# StandardScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own branch.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# ------------------------------------------------------
# 5. Full pipeline (Preprocess → SMOTE → LR)
# ------------------------------------------------------
# Step order is preprocess -> SMOTE -> classifier, so SMOTE works on the
# imputed and scaled feature matrix. imblearn's Pipeline is used because the
# step list contains a sampler (SMOTE).
pipeline = ImbPipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        # random_state is fixed because the grid search runs this classifier
        # with solver="liblinear", which shuffles the data internally; without
        # a seed, repeated runs are not guaranteed to reproduce the same fit.
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# ------------------------------------------------------
# 6. Hyperparameter Optimization (GridSearchCV)
# ------------------------------------------------------
# Search space for the "classifier" step of the pipeline
# (2 penalties x 4 C values x 1 solver x 2 class weights = 16 candidates).
param_grid = dict(
    classifier__penalty=["l1", "l2"],
    classifier__C=[0.01, 0.1, 1, 10],
    # liblinear handles both the L1 and the L2 penalty searched above.
    classifier__solver=["liblinear"],
    classifier__class_weight=[None, "balanced"],
)

# Exhaustive 3-fold search ranked by F1, parallelised over all available
# cores, with per-fit progress logging.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# ------------------------------------------------------
# 7. Train-test split
# ------------------------------------------------------
# Hold out 20% of the rows for final evaluation. Stratifying on y keeps the
# class proportions the same in both partitions; the seed makes the split
# repeatable.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

# ------------------------------------------------------
# 8. Fit GridSearchCV
# ------------------------------------------------------
# Only the training partition is used for the search; the held-out rows are
# not touched until evaluation.
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# ------------------------------------------------------
# 9. Predict using best model
# ------------------------------------------------------
# Pipeline refit on the full training partition with the winning parameters.
best_model = grid_search.best_estimator_

# Hard labels at the classifier's default decision threshold.
y_pred = best_model.predict(X_test)
# Probability of the positive class (second column of predict_proba), used
# for the threshold-independent metric printed last.
y_score = best_model.predict_proba(X_test)[:, 1]

# ------------------------------------------------------
# 10. Evaluation
# ------------------------------------------------------
# Accuracy is dominated by the majority class when positives are rare, so it
# is reported together with the confusion matrix, per-class precision/recall,
# and average precision (area under the precision-recall curve), which does
# not depend on a single decision threshold.
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAverage Precision (PR AUC):", average_precision_score(y_test, y_score))


Initial shape: (284807, 31)
Numeric: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
Categorical: []
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}

Accuracy: 0.9743864330606369

Confusion Matrix:
 [[55413  1451]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

