In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Initial Data Diagnosis

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('creditcard.csv')

In [3]:
# Preview the first five rows to check column names and value scales.
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.175161e-15,3.384974e-16,-1.379537e-15,2.094852e-15,1.021879e-15,1.494498e-15,-5.620335e-16,1.149614e-16,-2.414189e-15,...,1.62862e-16,-3.576577e-16,2.618565e-16,4.473914e-15,5.109395e-16,1.6861e-15,-3.661401e-16,-1.227452e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


- The dataset consists of 31 numeric variables, with the binary target being "Class". Apart from "Time" and "Amount", the remaining features are centered (mean ≈ 0) with standard deviations ranging from roughly 0.3 to 2.

In [5]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

- All columns are stored as float64 or int64 values, and the frame takes up 67.4 MB in memory.

In [6]:
# Number of missing values in each column.
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
# Absolute frequency of each target class.
class_counts = df["Class"].value_counts()
print(class_counts)

# Number of rows that exactly repeat an earlier row.
n_duplicated_rows = df.duplicated().sum()
print(n_duplicated_rows)

Class
0    284315
1       492
Name: count, dtype: int64
1081


- There are no null values, but 1,081 rows are exact duplicates of earlier rows.

## Data Preparation

### Duplication

In [8]:
# Guarded so the cell is safe to re-run: once "Time" is gone the preparation
# has already been applied. Running it again would de-duplicate on the reduced
# column set (which could remove further rows) and then fail on the missing
# "Time" column.
if "Time" in df.columns:
    # Drop exact duplicate rows first (while "Time" is still part of the
    # comparison), then remove the "Time" column itself.
    df = df.drop_duplicates().drop(columns="Time")
df.shape

(283726, 30)

- Drop exact duplicate rows to reduce dataset bias, and drop the "Time" column to focus on static (time-independent) modeling.

### Multicollinearity

In [9]:
# Feature-only frame (target removed) and its pairwise correlation matrix.
design_matrix = df.drop(columns="Class")
correlation_matrix = design_matrix.corr()

# The determinant summarises linear dependence among the features:
# close to 1 means nearly uncorrelated, close to 0 means nearly collinear.
determinant_correlation_matrix = np.linalg.det(correlation_matrix.to_numpy())
print(f"Determinant of the correlation matrix: {determinant_correlation_matrix:.10f}")

Determinant of the correlation matrix: 0.0808995550


- The determinant of a correlation matrix equals 1 when the features are uncorrelated and approaches 0 as they become linearly dependent. A value of about 0.08 therefore indicates substantial multicollinearity. To keep the solver stable during fitting, we need to identify which features are highly correlated with the others; for that, we use the variance inflation factor (VIF).

In [10]:
# Add a constant column to the feature matrix before computing the VIFs.
X = sm.add_constant(design_matrix)

# One VIF per column of X (the constant included), paired with its column name.
exog = X.values
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [variance_inflation_factor(exog, col_idx) for col_idx in range(exog.shape[1])],
})

print("--- VIF Scores ---")
print(vif_data)

--- VIF Scores ---
   Variable        VIF
0     const   2.529619
1        V1   1.633287
2        V2   4.466661
3        V3   1.542250
4        V4   1.121088
5        V5   2.816732
6        V6   1.572235
7        V7   2.907451
8        V8   1.131088
9        V9   1.024158
10      V10   1.123993
11      V11   1.000245
12      V12   1.001702
13      V13   1.000372
14      V14   1.014246
15      V15   1.000101
16      V16   1.000656
17      V17   1.001508
18      V18   1.015784
19      V19   1.038426
20      V20   2.403376
21      V21   1.135968
22      V22   1.052271
23      V23   1.159593
24      V24   1.000351
25      V25   1.028632
26      V26   1.000168
27      V27   1.011017
28      V28   1.001596
29   Amount  12.295411


- "Amount" has by far the highest VIF (about 12.3, while every other feature is below 5), so we eliminate it first.

In [11]:
# Repeat the determinant check with "Amount" removed alongside the target.
design_matrix = df.drop(columns=["Class", "Amount"])
correlation_matrix = design_matrix.corr()

# A determinant close to 1 means the remaining features are nearly uncorrelated.
determinant_correlation_matrix = np.linalg.det(correlation_matrix.to_numpy())
print(f"Determinant of the correlation matrix: {determinant_correlation_matrix:.10f}")

Determinant of the correlation matrix: 0.9946932395


- With "Amount" removed, the determinant is about 0.995 (close to 1), so the remaining features are nearly linearly independent, which supports stable solutions during fitting.

In [12]:
# Remove "Amount" from the working frame. Non-inplace with errors="ignore" so
# re-running this cell after the column is already gone is a no-op, not a KeyError.
df = df.drop(columns="Amount", errors="ignore")

### Class Imbalance

In [13]:
# Share of each target class, expressed as a percentage of all rows.
df['Class'].value_counts(normalize=True).mul(100)

Class
0    99.83329
1     0.16671
Name: proportion, dtype: float64

- As these proportions show, positive events (about 0.17%) are vastly outnumbered by negative events (about 99.83%). To address this imbalance, methods such as class weights, SMOTE, or undersampling can be used.

### Logit Linearity

In [16]:
def logit_nonlinearity_score(predictor, target, n_bins=10):
    """Score how far a feature's binned log-odds deviate from a straight line.

    The predictor is split into up to ``n_bins`` quantile bins (bins with
    duplicate edges are dropped). For each bin the smoothed positive rate
    ``(n_positive + 0.5) / (n_total + 1)`` is converted to log-odds. A straight
    line is then fitted to log-odds versus the bin's mean predictor value, and
    the mean squared error of that fit is returned.

    Parameters
    ----------
    predictor : pandas.Series
        Values of a single feature.
    target : pandas.Series
        0/1 target sharing the predictor's index.
    n_bins : int, default 10
        Number of quantile bins requested from ``pd.qcut``.

    Returns
    -------
    float
        MSE of the linear fit on the binned points; higher means the
        relationship with the log-odds is less linear.
    """
    check_df = pd.DataFrame({
        'predictor': predictor,
        'target': target
    })

    # Bin the predictor into quantiles
    check_df['predictor_bin'] = pd.qcut(check_df['predictor'], q=n_bins, duplicates='drop')

    # Per-bin mean predictor value, positive count and total count
    logit_df = check_df.groupby('predictor_bin', observed=False).agg(
        mean_predictor=('predictor', 'mean'),
        n_positive=('target', 'sum'),
        n_total=('target', 'count')
    ).reset_index()

    # The +0.5 / +1 smoothing keeps the log-odds finite when a bin has no positives
    logit_df['p_positive'] = (logit_df['n_positive'] + 0.5) / (logit_df['n_total'] + 1)
    logit_df['log_odds'] = np.log(logit_df['p_positive'] / (1 - logit_df['p_positive']))

    X_fit = logit_df[['mean_predictor']]
    y_fit = logit_df['log_odds']

    # Fit a straight line to the binned points and measure its residual error
    model = LinearRegression()
    model.fit(X_fit, y_fit)
    return mean_squared_error(y_fit, model.predict(X_fit))


X = df.drop("Class", axis=1)
Y = df["Class"]

features = X.columns.tolist()

# --- Score every feature (no plotting is done here, only the MSE is kept) ---
linearity_scores = {
    predictor_col: logit_nonlinearity_score(X[predictor_col], Y)
    for predictor_col in features
}

# --- Sort the features by their non-linearity score (higher is worse) ---
sorted_scores = sorted(linearity_scores.items(), key=lambda item: item[1], reverse=True)

print("--- Non-Linearity Scores (Higher MSE = More Non-Linear) ---")
for feature, score in sorted_scores:
    print(f"{feature:<20}: {score:.4f}")

--- Non-Linearity Scores (Higher MSE = More Non-Linear) ---
V17                 : 2.3402
V21                 : 1.3178
V16                 : 1.2128
V27                 : 1.1308
V14                 : 1.1128
V28                 : 1.1039
V7                  : 1.0000
V10                 : 0.9564
V2                  : 0.8646
V8                  : 0.7470
V18                 : 0.6840
V11                 : 0.6641
V19                 : 0.5210
V5                  : 0.4949
V20                 : 0.4706
V6                  : 0.4118
V1                  : 0.4009
V23                 : 0.3537
V9                  : 0.3318
V3                  : 0.2598
V12                 : 0.2449
V26                 : 0.1465
V4                  : 0.1442
V24                 : 0.1169
V25                 : 0.0939
V13                 : 0.0693
V15                 : 0.0520
V22                 : 0.0345


- To improve logistic regression performance further, we can eliminate the five features with the highest non-linearity scores in the logit sense (V17, V21, V16, V27 and V14).

In [None]:
# Remove the five features with the highest logit non-linearity scores.
# Non-inplace with errors="ignore" so re-running this cell after the columns
# are already gone is a no-op, not a KeyError.
df = df.drop(columns=["V17", "V21", "V16", "V27", "V14"], errors="ignore")

### Outliers

## Feature Engineering