# **Preprocessing**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import kagglehub

# Fetch the credit-card fraud dataset through kagglehub; the call returns the
# local directory containing the dataset files, which is kept in `path`.
dataset_handle = "mlg-ulb/creditcardfraud"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/creditcardfraud


In [None]:
!ls /kaggle/input/creditcardfraud

creditcard.csv


In [None]:
# Load the CSV from the directory returned by kagglehub rather than from a
# hardcoded absolute path, so the notebook follows wherever the dataset was
# actually downloaded to.
data = pd.read_csv(f"{path}/creditcard.csv")

In [None]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
data = data.drop_duplicates(keep="first")

In [None]:
# Split by position: every column except the last is a feature, and the last
# column (the "Class" label in this dataset) becomes a NumPy target array.
label_position = data.shape[1] - 1
x = data.iloc[:, :label_position]
y = data.iloc[:, label_position].to_numpy()

In [None]:
# Count missing values per feature column.
print(x.isnull().sum()) # -> there are no nulls

# If there were nulls, they could be replaced with the column mean.
# (Kept as comments rather than a bare string literal, which the notebook
# would echo as the cell's output.)
#
#   from sklearn.impute import SimpleImputer
#   imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#   imputer.fit(data)
#   data = imputer.transform(data)

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
dtype: int64


"\n# if there were nulls use\nfrom sklearn.impute import SimpleImputer\nimputer = SimpleImuter(missing_value=mp.nan, startegy='mean')\nimputer.fit(data)\ndata = imputer.transform(data)\n"

In [None]:
# Collect the object-dtype (string) feature columns, i.e. the ones that would
# need encoding before modeling.
categorical_columns = x.select_dtypes(include="object").columns
print(categorical_columns) # -> no str columns

Index([], dtype='object')


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The positive (fraud) class is a tiny
# fraction of the data, so stratify on y: both splits then keep the same class
# proportions instead of leaving the number of fraud rows in each to chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
import statsmodels.api as sm

def stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.05, verbose=True):
    """Greedy forward feature selection based on OLS p-values.

    In every round each not-yet-selected column is fitted, together with an
    intercept and the columns selected so far, in an ordinary least squares
    model of ``y``. The column with the smallest p-value is added when that
    p-value is below ``threshold_in`` (the null hypothesis "this feature does
    not matter" is rejected); otherwise the search stops.

    Parameters
    ----------
    X : DataFrame-like
        Candidate features; must expose ``.columns`` and support selection by
        a list of column labels.
    y : array-like
        Target values passed to ``sm.OLS`` as the dependent variable.
    threshold_in : float, default 0.05
        A feature is added only if its p-value is strictly below this value.
    threshold_out : float, default 0.05
        Accepted for interface compatibility only; this function performs no
        backward elimination, so the value is never used.
    verbose : bool, default True
        When true, print one line for every feature that is added.

    Returns
    -------
    list
        Selected column labels, in the order they were added.
    """
    chosen = []
    remaining = list(X.columns)

    def _pvalue_if_added(candidate):
        # p-value of `candidate` in a model holding an intercept, the columns
        # chosen so far, and the candidate itself.
        design = sm.add_constant(X[chosen + [candidate]])
        return sm.OLS(y, design).fit().pvalues[candidate]

    while remaining:
        # Start from p = 1 so only a strictly smaller p-value can win; the
        # first candidate reaching the minimum is kept on ties.
        winner, winner_pval = None, 1
        for candidate in remaining:
            candidate_pval = _pvalue_if_added(candidate)
            if candidate_pval < winner_pval:
                winner, winner_pval = candidate, candidate_pval

        # Stop as soon as the best remaining candidate is not significant.
        if not winner_pval < threshold_in:
            break

        chosen.append(winner)
        remaining.remove(winner)
        if verbose:
            print(f"Forward: Added '{winner}' (p={winner_pval:.4f})")

    return chosen

# Select features using the training split only, with progress messages
# switched off; the chosen column names are printed once at the end.
selection_options = dict(threshold_in=0.05, threshold_out=0.05, verbose=False)
selected_features = stepwise_selection(x_train, y_train, **selection_options)
print("Final selected features:", selected_features)

In [None]:
# Restrict both splits to the columns picked by the stepwise selection.
x_train, x_test = (split[selected_features] for split in (x_train, x_test))

In [None]:
# Check whether the V20 column survived feature selection.
print('V20' in x_train.columns)

False


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Distinct labels in the training target and how often each occurs.
train_class_counts = np.unique(y_train, return_counts=True)
print(train_class_counts) # -> imbalance data set

(array([0, 1]), array([226597,    383]))


In [None]:
# balancing
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
# Oversample the minority class with SMOTE until it reaches 95% of the majority
# class size. SMOTE generates synthetic samples at random, so random_state is
# fixed to make the resampled training set reproducible from run to run.
steps = [SMOTE(sampling_strategy=0.95, random_state=42)]
pipeline = make_pipeline(*steps)
x_train, y_train = pipeline.fit_resample(x_train, y_train)

In [None]:
# Label counts of the training target after resampling.
resampled_class_counts = np.unique(y_train, return_counts=True)
print(resampled_class_counts)

(array([0, 1]), array([226597, 215267]))


# **Modeling**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 500 trees, trained on all available cores. random_state fixes the forest's
# internal randomness so the fitted model and the metrics reported from it can
# be reproduced on a re-run.
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rf.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out split and print them next to the true
# labels, one array per line, for a quick visual comparison.
y_pred = rf.predict(x_test)
print(y_pred, y_test, sep="\n")

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Report accuracy, the confusion matrix and the per-class precision/recall/F1
# table for the held-out split, in that order.
for report in (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

0.9995594403129736
[[56651     5]
 [   20    70]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.93      0.78      0.85        90

    accuracy                           1.00     56746
   macro avg       0.97      0.89      0.92     56746
weighted avg       1.00      1.00      1.00     56746

