## Importing libraries

In [109]:
# Import pandas for data manipulation and analysis
import pandas as pd

# Import numpy for numerical operations
import numpy as np  

# Import seaborn for data visualization
import seaborn as sns

# Import matplotlib for plotting
import matplotlib.pyplot as plt

# Import StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler

# Import train_test_split for splitting data into training and testing sets
from sklearn.model_selection import train_test_split

# Import DecisionTreeClassifier for decision tree classification
from sklearn.tree import DecisionTreeClassifier

# Import LogisticRegression for logistic regression classification
from sklearn.linear_model import LogisticRegression

# Import make_classification for generating a random n-class classification problem
from sklearn.datasets import make_classification

# Import accuracy_score and classfication_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Import resample for upsampling/downsampling the dataset
from sklearn.utils import resample




## Importing dataset

In [110]:
# Read the CSV file into a DataFrame 'df'
df = pd.read_csv("creditcard.csv")

# Display the first few rows of the DataFrame to get an overview of the data
print(df.head())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

## Data Exploration

In [111]:
# Display the last few rows of the DataFrame to get an overview of the data
print(df.tail())

            Time         V1         V2        V3        V4        V5  \
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
284802 -2.606837 -4.918215  7.305334  1.914428  ...  0.213454  0.111864   
284803  1.058415  0.024330  0.294869  0.584800  ...  0.214205  0.924384   
284804  3.031260 -0.296827  0.708417  0.432454  ...  0.232045  0.578229   
284805  0.623708 -0.686180  0.679145  0.392087  ...  0.265245  0.800049   
284806 -0.649617  1.577006 -0.414650  0.486180  ...  0.261057  0.643078   

             V23       V24       V25       V26       V27       V28  Amount  \
284802  1.014480 -0.509348  1.436807  

In [112]:
# Generate descriptive statistics for the dataframe
# This includes count, mean, standard deviation, min, 25th percentile, median (50th percentile), 75th percentile, and max
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [113]:
# Display a concise summary of the DataFrame, including data types and non-null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [114]:
# Display the dimensions of the DataFrame
df.shape

(284807, 31)

In [115]:
# Check the number of missing values in each column of the DataFrame
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [116]:
# Initialize StandardScaler
sc = StandardScaler()

# Standardize the 'Amount' column and replace it with the scaled values in the DataFrame
df['Amount'] = sc.fit_transform(pd.DataFrame(df['Amount']))

In [117]:
# Display the first few rows of the DataFrame to get an overview of the data
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [118]:
# Drop the 'Time' column from the DataFrame
df = df.drop(['Time'], axis=1)

# Display the first few rows of the DataFrame to get an overview of the data
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [119]:
# Check if there are any duplicate rows in the DataFrame
df.duplicated().any()

True

In [120]:
# Drop duplicate rows from the DataFrame
df = df.drop_duplicates()

# Display the dimensions of the updated DataFrame
df.shape

(275663, 30)

In [121]:
# Display the counts of unique values in the 'Class' column of the DataFrame
print(df['Class'].value_counts())

Class
0    275190
1       473
Name: count, dtype: int64


## Undersampling

In [125]:
# Create separate DataFrames for fraudulent and normal transactions
Fraud_df = df[df['Class'] == 1]    # Select rows where 'Class' is 1 (fraudulent transactions)
Normal_df = df[df['Class'] == 0]   # Select rows where 'Class' is 0 (normal transactions)

In [126]:
# Display the dimensions of the Fraud_df DataFrame
print(Fraud_df.shape)

(473, 30)


In [127]:
# Display the dimensions of the Normal_df DataFrame
print(Normal_df.shape)

(275190, 30)


In [128]:
# Sample a subset of normal transactions from Normal_df
Normal_df_util = Normal_df.sample(n=473)

# Display the dimensions of the Normal_df_util DataFrame
print(Normal_df_util.shape)

(473, 30)


In [129]:
# Concatenate Normal_df_util and Fraud_df DataFrames into df1
df1 = pd.concat([Normal_df_util, Fraud_df], ignore_index=True)

# Display the first few rows of df1 DataFrame
print(df1.head())

         V1        V2        V3        V4        V5        V6        V7  \
0 -0.201500  1.141593 -0.013911 -0.190544  0.907019 -1.204594  1.119430   
1  0.552554  0.119661  0.733838 -0.232615 -0.447473 -0.041879 -0.434158   
2 -1.271324 -0.170150  2.553263  0.151341  0.583891  0.286320  0.217556   
3 -1.711121 -0.369377  1.440787  0.136071  0.517265  0.196718 -0.006951   
4 -2.167349  0.636853 -2.434659 -0.889918 -3.757431  1.608337  5.032697   

         V8        V9       V10  ...       V21       V22       V23       V24  \
0 -0.391805 -0.440344 -1.043049  ...  0.281943  0.955781 -0.347500 -0.033994   
1 -0.758384  0.964158 -0.710571  ...  1.039238  0.663868 -0.112848  1.127279   
2  0.156030 -0.032531 -0.499367  ...  0.225198  0.553539 -0.246722 -0.346338   
3  0.325528 -1.817459  1.052207  ... -0.365864 -0.524456  0.085383 -0.342706   
4 -0.363841 -0.047509 -1.054279  ... -0.438900 -0.266665 -0.137940 -0.636382   

        V25       V26       V27       V28    Amount  Class  
0  0.35

In [130]:
# Count the occurrences of each class in the 'Class' column of the df1 DataFrame
print(df1['Class'].value_counts())

Class
0    473
1    473
Name: count, dtype: int64


In [131]:
# Drop the 'Class' column from the df1 DataFrame and assign the result to util1 (features)
util1 = df1.drop('Class', axis = 1)

# Assign the 'Class' column to util2 (target variable)
util2= df1['Class']

In [132]:
# Display the dimensions (shape) of the util1 DataFrame
print(util1.shape)

(946, 29)


In [133]:
# Display the dimensions (shape) of the util2 DataFrame
print(util2.shape)

(946,)


In [134]:
# Split the resampled data into training and testing sets
# X_resampled: features
# Y_resampled: target variable
# test_size=0.3: 30% of the data will be used for testing, and 70% for training
# random_state=42: ensures reproducibility of the split
X_train, X_test, Y_train, Y_test = train_test_split(util1, util2, test_size = 0.3, random_state = 42)

In [135]:
# Define a function for model evaluation
def evaluate_model(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)  # Train the model
    Y_pred = model.predict(X_test)  # Predict on test data
    
    # Calculate accuracy metric
    acc_score = accuracy_score(Y_test, Y_pred)
    
    # Print evaluation results
    print(f"\n Accuracy score: {acc_score}")
    print(classification_report(Y_test, Y_pred))


# Models dictionary
models_util = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier()

}

# Iterate over each model in the dictionary and evaluate
for model_name, classifier in models_util.items():
    print(f"\n{model_name}")
    evaluate_model(classifier, X_train, Y_train, X_test, Y_test)


Logistic Regression

 Accuracy score: 0.9471830985915493
              precision    recall  f1-score   support

           0       0.93      0.96      0.95       140
           1       0.96      0.93      0.95       144

    accuracy                           0.95       284
   macro avg       0.95      0.95      0.95       284
weighted avg       0.95      0.95      0.95       284


Decision Tree Classifier

 Accuracy score: 0.8732394366197183
              precision    recall  f1-score   support

           0       0.93      0.81      0.86       140
           1       0.83      0.94      0.88       144

    accuracy                           0.87       284
   macro avg       0.88      0.87      0.87       284
weighted avg       0.88      0.87      0.87       284



## Oversampling

In [136]:
# Drop the 'Class' column from the df DataFrame and assign the result to util3 (features)
util3 = df.drop('Class', axis=1)

# Assign the 'Class' column to util4 (target variable)
util4 = df['Class']

In [137]:
# Display the dimensions (shape) of the util1 DataFrame
print(util3.shape)

(275663, 29)


In [138]:
# Display the dimensions (shape) of the util2 DataFrame
print(util4.shape)

(275663,)


In [139]:
# Generate a synthetic dataset for classification
# n_samples: total number of samples
# n_features: number of features
# random_state: seed for reproducibility
util3, util4 = make_classification(n_samples=275663, n_features=29, random_state=42)

# Resampling the dataset to balance the classes

# X_resampled and Y_resampled will hold the resampled features and labels, respectively.
# We are using the resample function from sklearn.utils to perform the resampling.

X_resampled, Y_resampled = resample(
    util3[util4 == 1],     # Selecting features where the label is 1
    util4[util4 == 1],     # Selecting labels where the label is 1
    replace=True,          # Allowing resampling with replacement
    n_samples=util3[util4 == 0].shape[0],  # Number of samples to match the count of the other class (label 0)
    random_state=42        # Setting a random seed for reproducibility
)


# Combine the resampled minority class with the original majority class
X_resampled = np.vstack((util3[util4 == 0], X_resampled))
Y_resampled = np.hstack((util4[util4 == 0], Y_resampled))


In [140]:
# Split the resampled data into training and testing sets
# X_resampled: features
# Y_resampled: target variable
# test_size=0.3: 30% of the data will be used for testing, and 70% for training
# random_state=42: ensures reproducibility of the split
X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(X_resampled, Y_resampled, test_size=0.3, random_state=42)

In [141]:
# Define a function for model evaluation
def evaluate_model(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)  # Train the model
    Y_pred = model.predict(X_test)  # Predict on test data
    
    # Calculate accuracy metric
    acc_score = accuracy_score(Y_test, Y_pred)
    
    # Print evaluation results
    print(f"\n Accuracy score: {acc_score}")
    print(classification_report(Y_test, Y_pred))


# Models dictionary
models_util = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
}

# Iterate over each model in the dictionary and evaluate
for model_name, classifier in models_util.items():
    print(f"\n{model_name}")
    evaluate_model(classifier, X_train_1, Y_train_1, X_test_1, Y_test_1)


Logistic Regression

 Accuracy score: 0.9196503065259187
              precision    recall  f1-score   support

           0       0.93      0.91      0.92     41069
           1       0.91      0.93      0.92     41632

    accuracy                           0.92     82701
   macro avg       0.92      0.92      0.92     82701
weighted avg       0.92      0.92      0.92     82701


Decision Tree Classifier

 Accuracy score: 0.9056843327166539
              precision    recall  f1-score   support

           0       0.92      0.89      0.90     41069
           1       0.89      0.92      0.91     41632

    accuracy                           0.91     82701
   macro avg       0.91      0.91      0.91     82701
weighted avg       0.91      0.91      0.91     82701



## Selection

In [142]:
'''Training the final model with oversampling instead of undersampling offers several benefits:

    Preserves all majority class information.
    Provides a more balanced and larger dataset.
    Reduces the risk of overfitting.
    Enhances the model's ability to generalize and detect minority class instances.'''

# Create a Logistic Regression object
LR = LogisticRegression()

# Train the Decision Tree Classifier on the resampled data
LR.fit(X_train_1, Y_train_1)

## Prediction

In [143]:
# Sample input for prediction
sample_input = [[1.09552460629831,-0.116085345733392,1.39791191546724,1.49754690019101,-1.0491239994212,0.0728385348036489,-0.723802080255301,0.287532472980736,0.996326812363796,-0.149144652839117,-0.508291668719983,-0.0278409029803608,-1.32155975244457,-0.0624044666639764,0.505750535601976,-0.209298470605219,0.207536533977023,-0.471334706912941,-0.736297582236995,-0.275034578640693,-0.0332341723097201,0.0932618845502939,0.0894983953019386,0.361261297686165,0.250962947610691,-0.378279979683085,0.0810238996896179,0.0342269381560285,2.09]]

# Predict using the Decision Tree Classifier
prediction = LR.predict(sample_input)

# Print the predicted class
print(prediction[0])

0


In [144]:
# Check if the predicted class is not equal to 0 
print("Fraud Transaction" if prediction[0] != 0 else "Normal Transaction")

Normal Transaction
