# **1.0: Import Libraries**

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# set columns and rows to show all in the output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# **2.0: Load the dataset**

In [4]:
df = pd.read_csv('creditcard.csv')

# **3.0: Data Exploration**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
print(f'No of columns in the dataset: {df.shape[1]}\nNo of rows in the dataset: {df.shape[0]}')

No of columns in the dataset: 31
No of rows in the dataset: 284807


In [7]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
df.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

# **4.0: Checking Missing values**

In [9]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

# **5.0: Preprocess the data**
> **Step 5.1: Separate features and target**

> **Step 5.2: Split the data into training and testing sets**

> **Step 5.3: Scale the features**

In [10]:
# Step 5.1: Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Step 5.2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Step 5.3: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **6.0: Train ensemble models**
> **Step 6.1: Train Random Forest model**

> **Step 6.2: Train Gradient Boosting model**

> **Step 6.3: Train XGBoost model**

In [12]:
# Step 6.1: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)


In [13]:
# Step 6.2: Train Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_train)

In [14]:
# Step 6.3: Train XGBoost
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# **7.0: Evaluate models**
> **Step 7.1: Make predictions and evaluate Random Forest**

> **Step 7.2: Make predictions and evaluate Gradient Boosting**

> **Step 7.3: Make predictions and evaluate XGBoost**

In [15]:
# Step 7.1: Make predictions and evaluate Random Forest
y_pred_rf = rf_model.predict(X_test_scaled)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Display evaluation results for Random Forest
print("\nEvaluating Random Forest:")
print("Classification Report:")
print(pd.DataFrame(report_rf).transpose())
print("\nConfusion Matrix:")
print(cm_rf)

# Step 7.2: Make predictions and evaluate Gradient Boosting
y_pred_gb = gb_model.predict(X_test_scaled)
report_gb = classification_report(y_test, y_pred_gb, output_dict=True)
cm_gb = confusion_matrix(y_test, y_pred_gb)

# Display evaluation results for Gradient Boosting
print("\nEvaluating Gradient Boosting:")
print("Classification Report:")
print(pd.DataFrame(report_gb).transpose())
print("\nConfusion Matrix:")
print(cm_gb)

# Step 7.3: Make predictions and evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test_scaled)
report_xgb = classification_report(y_test, y_pred_xgb, output_dict=True)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

# Display evaluation results for XGBoost
print("\nEvaluating XGBoost:")
print("Classification Report:")
print(pd.DataFrame(report_xgb).transpose())
print("\nConfusion Matrix:")
print(cm_xgb)


Evaluating Random Forest:
Classification Report:
              precision    recall  f1-score       support
0              0.999596  0.999965  0.999780  56864.000000
1              0.974026  0.765306  0.857143     98.000000
accuracy       0.999561  0.999561  0.999561      0.999561
macro avg      0.986811  0.882635  0.928462  56962.000000
weighted avg   0.999552  0.999561  0.999535  56962.000000

Confusion Matrix:
[[56862     2]
 [   23    75]]

Evaluating Gradient Boosting:
Classification Report:
              precision    recall  f1-score       support
0              0.999314  0.999631  0.999473  56864.000000
1              0.737500  0.602041  0.662921     98.000000
accuracy       0.998947  0.998947  0.998947      0.998947
macro avg      0.868407  0.800836  0.831197  56962.000000
weighted avg   0.998864  0.998947  0.998893  56962.000000

Confusion Matrix:
[[56843    21]
 [   39    59]]

Evaluating XGBoost:
Classification Report:
              precision    recall  f1-score       suppor

# **8.0: feature importance for each model**

In [16]:
# Step 8: Print feature importance for each model

# For Random Forest
feature_importance_rf = pd.DataFrame({'feature': df.columns[:-1], 'importance': rf_model.feature_importances_})
feature_importance_rf = feature_importance_rf.sort_values('importance', ascending=False).head(10)
print("\nTop 10 Feature Importance for Random Forest:")
print(feature_importance_rf)

# For Gradient Boosting
feature_importance_gb = pd.DataFrame({'feature': df.columns[:-1], 'importance': gb_model.feature_importances_})
feature_importance_gb = feature_importance_gb.sort_values('importance', ascending=False).head(10)
print("\nTop 10 Feature Importance for Gradient Boosting:")
print(feature_importance_gb)

# For XGBoost
feature_importance_xgb = pd.DataFrame({'feature': df.columns[:-1], 'importance': xgb_model.feature_importances_})
feature_importance_xgb = feature_importance_xgb.sort_values('importance', ascending=False).head(10)
print("\nTop 10 Feature Importance for XGBoost:")
print(feature_importance_xgb)


Top 10 Feature Importance for Random Forest:
   feature  importance
17     V17    0.157500
12     V12    0.134876
14     V14    0.124647
10     V10    0.081051
16     V16    0.074765
11     V11    0.057897
9       V9    0.034005
7       V7    0.029703
18     V18    0.028295
4       V4    0.026814

Top 10 Feature Importance for Gradient Boosting:
   feature  importance
14     V14    0.355593
27     V27    0.165550
10     V10    0.142137
17     V17    0.141303
11     V11    0.070781
7       V7    0.034739
12     V12    0.016714
21     V21    0.014848
28     V28    0.014792
1       V1    0.012869

Top 10 Feature Importance for XGBoost:
   feature  importance
14     V14    0.296522
7       V7    0.177108
10     V10    0.081132
12     V12    0.038365
16     V16    0.032918
4       V4    0.030140
17     V17    0.027492
27     V27    0.026563
29  Amount    0.019393
1       V1    0.018287


In [18]:
import pickle
# Save the models and scaler
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('gradient_boosting_model.pkl', 'wb') as f:
    pickle.dump(gb_model, f)

with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Models and scaler saved successfully.")

Models and scaler saved successfully.
