In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

In [3]:
data = pd.read_csv('/content/creditcard.csv')
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124856,77448,-0.835868,0.673600,1.904021,-0.263201,0.351435,0.236937,0.223675,0.334808,-0.232802,...,0.114441,0.182323,-0.105240,-0.357454,-0.357218,-0.704835,0.014996,0.144135,4.99,0.0
124857,77448,-2.817937,-0.107162,0.402607,-0.980962,0.682698,0.980801,0.117625,1.387942,-0.779965,...,0.280573,0.323453,-0.374319,-0.962394,0.422271,0.402743,-0.288024,-0.339470,115.00,0.0
124858,77449,-1.262504,1.434804,0.290138,-0.436888,0.801078,0.380510,0.399066,0.554237,-0.531654,...,0.195197,0.383922,-0.281029,-1.152876,0.030816,-0.486364,-0.148485,0.121988,1.07,0.0
124859,77449,1.087845,0.522904,0.275704,2.474583,0.213114,-0.225345,0.389247,-0.100848,-0.960723,...,0.007535,-0.054694,-0.061887,0.083010,0.527815,0.030754,-0.017737,0.020217,45.95,0.0


In [4]:
# Basic information
print("Dataset Information:")
print(data.info())
print("\nClass distribution:")
print(data['Class'].value_counts())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124861 entries, 0 to 124860
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    124861 non-null  int64  
 1   V1      124861 non-null  float64
 2   V2      124861 non-null  float64
 3   V3      124861 non-null  float64
 4   V4      124860 non-null  float64
 5   V5      124860 non-null  float64
 6   V6      124860 non-null  float64
 7   V7      124860 non-null  float64
 8   V8      124860 non-null  float64
 9   V9      124860 non-null  float64
 10  V10     124860 non-null  float64
 11  V11     124860 non-null  float64
 12  V12     124860 non-null  float64
 13  V13     124860 non-null  float64
 14  V14     124860 non-null  float64
 15  V15     124860 non-null  float64
 16  V16     124860 non-null  float64
 17  V17     124860 non-null  float64
 18  V18     124860 non-null  float64
 19  V19     124860 non-null  float64
 20  V20     124860 non-null  fl

In [5]:
# Splitting features and target
X = data.drop('Class', axis=1)
y = data['Class']


In [6]:
# Scale features (important for models like Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [8]:
# Address class imbalance with SMOTE
# Before applying SMOTE, remove rows with NaN in the target variable 'Class'
data = data.dropna(subset=['Class'])  # Drop rows with NaN in 'Class'

# Now, proceed with feature/target split and SMOTE
X = data.drop('Class', axis=1)
y = data['Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

In [9]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)


Model Training (e.g., Logistic Regression)

In [10]:
from sklearn.linear_model import LogisticRegression


In [11]:
# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)


In [12]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]


In [13]:
# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))


Confusion Matrix:
[[36406   901]
 [ 2337 35117]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     37307
         1.0       0.97      0.94      0.96     37454

    accuracy                           0.96     74761
   macro avg       0.96      0.96      0.96     74761
weighted avg       0.96      0.96      0.96     74761


ROC AUC Score: 0.9906799575916455
