Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to a Pandas DataFrame
data = pd.read_csv('C:/Users/Asus/OneDrive/Desktop/Credit Fraud/Credit_Card_Fraud_Detection_Dataset.csv')

In [3]:
data.columns

Index(['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip',
       'used_pin_number', 'online_order', 'class'],
      dtype='object')

In [4]:
# first 5 rows of the dataset
data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,class
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


In [5]:
data.tail()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,class
999995,2.207101,0.112651,1.626798,1,1,0,0,0
999996,19.872726,2.683904,2.778303,1,1,0,0,0
999997,2.914857,1.472687,0.218075,1,1,0,1,0
999998,4.258729,0.242023,0.475822,1,0,0,1,0
999999,58.108125,0.31811,0.38692,1,1,0,1,0


In [6]:
# dataset informations
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  int64  
 4   used_chip                       1000000 non-null  int64  
 5   used_pin_number                 1000000 non-null  int64  
 6   online_order                    1000000 non-null  int64  
 7   class                           1000000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 61.0 MB


In [7]:
# checking the number of missing values in each column
data.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
class                             0
dtype: int64

In [8]:
# distribution of legit transactions & fraudulent transactions
data['class'].value_counts()

class
0    912597
1     87403
Name: count, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [9]:
# Separating the data for analysis
# Assuming the 'class' column denotes whether the transaction is legitimate (0) or fraudulent (1)
legit = data[data['class'] == 0]
fraud = data[data['class'] == 1]


In [10]:
# Displaying the first few rows of each DataFrame to ensure they were separated correctly
print("Legitimate Transactions:\n", legit.head())
print("\nFraudulent Transactions:\n", fraud.head())


Legitimate Transactions:
    distance_from_home  distance_from_last_transaction  \
0           57.877857                        0.311140   
1           10.829943                        0.175592   
2            5.091079                        0.805153   
3            2.247564                        5.600044   
4           44.190936                        0.566486   

   ratio_to_median_purchase_price  repeat_retailer  used_chip  \
0                        1.945940                1          1   
1                        1.294219                1          0   
2                        0.427715                1          0   
3                        0.362663                1          1   
4                        2.222767                1          1   

   used_pin_number  online_order  class  
0                0             0      0  
1                0             0      0  
2                0             1      0  
3                0             1      0  
4                0            

In [11]:
# Optionally, displaying statistical measures of a specific column for both datasets
print("\nStatistical measures for legitimate transactions (distance_from_home):")
print(legit['distance_from_home'].describe())




Statistical measures for legitimate transactions (distance_from_home):
count    912597.000000
mean         22.832976
std          52.828655
min           0.004874
25%           3.828942
50%           9.673847
75%          24.158057
max        8777.136420
Name: distance_from_home, dtype: float64


In [12]:
print("\nStatistical measures for fraudulent transactions (distance_from_home):")
print(fraud['distance_from_home'].describe())


Statistical measures for fraudulent transactions (distance_from_home):
count    87403.000000
mean        66.261876
std        134.391608
min          0.025847
25%          4.585729
50%         15.454219
75%        101.110104
max      10632.723670
Name: distance_from_home, dtype: float64


In [13]:
print(legit.shape)
print(fraud.shape)

(912597, 8)
(87403, 8)


In [14]:
# statistical measures of the data
legit.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,class
count,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0,912597.0
mean,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225,0.0
std,52.828655,22.472359,1.946152,0.322997,0.479825,0.312821,0.484831,0.0
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.828942,0.293859,0.449768,1.0,0.0,0.0,0.0,0.0
50%,9.673847,0.985074,0.91395,1.0,0.0,0.0,1.0,0.0
75%,24.158057,3.268578,1.788918,1.0,1.0,0.0,1.0,0.0
max,8777.13642,11851.10456,267.802942,1.0,1.0,1.0,1.0,0.0


In [15]:
fraud.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,class
count,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0,87403.0
mean,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318,1.0
std,134.391608,47.997697,5.56432,0.324825,0.436647,0.055801,0.225391,0.0
min,0.025847,0.000407,0.011966,0.0,0.0,0.0,0.0,1.0
25%,4.585729,0.328199,3.50027,1.0,0.0,0.0,1.0,1.0
50%,15.454219,1.157631,5.071294,1.0,0.0,0.0,1.0,1.0
75%,101.110104,4.598504,7.331222,1.0,1.0,0.0,1.0,1.0
max,10632.72367,2160.499922,266.689692,1.0,1.0,1.0,1.0,1.0


In [16]:
# compare the values for both transactions
data.groupby('class').mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,22.832976,4.301391,1.423642,0.881672,0.359402,0.109944,0.622225
1,66.261876,12.712185,6.006323,0.880119,0.256399,0.003123,0.946318


It seems like your original dataset is heavily imbalanced, with 87403 legitimate transactions and only 1 fraudulent transaction.

To create a synthetic dataset with a similar distribution, we can generate a dataset with a large number of legitimate transactions and a small number of fraudulent transactions. 

Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

In [17]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Define mean and standard deviation for each feature
legit_mean = [20, 5, 1.5, 0.9, 0.4, 0.1, 0.6]  # Example mean values for legitimate transactions
fraud_mean = [60, 15, 5, 0.9, 0.2, 0.03, 0.9]  # Example mean values for fraudulent transactions
std_dev = [5, 2, 1, 0.1, 0.1, 0.05, 0.2]  # Example standard deviation for both classes

# Number of samples for each class
num_samples = 1000

# Generate synthetic data for legitimate transactions
legit_data = pd.DataFrame()
for i, mean in enumerate(legit_mean):
    legit_data['feature{}'.format(i)] = np.random.normal(mean, std_dev[i], num_samples)

legit_data['class'] = 0  # Add class label for legitimate transactions

# Generate synthetic data for fraudulent transactions
fraud_data = pd.DataFrame()
for i, mean in enumerate(fraud_mean):
    fraud_data['feature{}'.format(i)] = np.random.normal(mean, std_dev[i], num_samples)

fraud_data['class'] = 1  # Add class label for fraudulent transactions

# Combine both datasets
data = pd.concat([legit_data, fraud_data], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display class distribution
print("Class distribution:")
print(data['class'].value_counts())
print("\nSample data:")
print(data.head())


Class distribution:
class
1    1000
0    1000
Name: count, dtype: int64

Sample data:
    feature0   feature1  feature2  feature3  feature4  feature5  feature6  \
0  70.270969  15.591866  5.605405  0.885036  0.242136  0.111998  1.156070   
1  19.200307   6.518310  0.134044  0.907716  0.232711  0.158113  0.746895   
2  66.977170  16.424964  5.871703  0.884967  0.310039  0.063534  1.376920   
3  20.232183   5.895434  2.250579  0.867797  0.492825  0.080602  0.657513   
4  59.922185  11.187527  5.891701  0.824883  0.394896 -0.000628  0.979801   

   class  
0      1  
1      0  
2      1  
3      0  
4      1  


In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)

# Define mean and standard deviation for each feature
legit_mean = [20, 5, 1.5, 0.9, 0.4, 0.1, 0.6]  # Example mean values for legitimate transactions
fraud_mean = [60, 15, 5, 0.9, 0.2, 0.03, 0.9]  # Example mean values for fraudulent transactions
std_dev = [5, 2, 1, 0.1, 0.1, 0.05, 0.2]  # Example standard deviation for both classes

# Number of samples for each class
num_samples = 1000

# Generate synthetic data for legitimate transactions
legit_data = pd.DataFrame()
for i, mean in enumerate(legit_mean):
    legit_data['feature{}'.format(i)] = np.random.normal(mean, std_dev[i], num_samples)

legit_data['class'] = 0  # Add class label for legitimate transactions

# Generate synthetic data for fraudulent transactions
fraud_data = pd.DataFrame()
for i, mean in enumerate(fraud_mean):
    fraud_data['feature{}'.format(i)] = np.random.normal(mean, std_dev[i], num_samples)

fraud_data['class'] = 1  # Add class label for fraudulent transactions

# Combine both datasets
data = pd.concat([legit_data, fraud_data], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display class distribution
print("Class distribution:")
print(data['class'].value_counts())
print("\nSample data:")
print(data.head())

# Splitting the data into Features (X) and Target (Y)
X = data.drop(columns='class', axis=1)
Y = data['class']

# Splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Display the shape of the training and testing sets to verify the split
print("\nTraining Features Shape:", X_train.shape)
print("Training Labels Shape:", Y_train.shape)
print("Testing Features Shape:", X_test.shape)
print("Testing Labels Shape:", Y_test.shape)

# Display the distribution of classes in the training and testing sets to ensure stratification
print("\nClass distribution in training set:\n", Y_train.value_counts())
print("\nClass distribution in testing set:\n", Y_test.value_counts())


Class distribution:
class
1    1000
0    1000
Name: count, dtype: int64

Sample data:
    feature0   feature1  feature2  feature3  feature4  feature5  feature6  \
0  70.270969  15.591866  5.605405  0.885036  0.242136  0.111998  1.156070   
1  19.200307   6.518310  0.134044  0.907716  0.232711  0.158113  0.746895   
2  66.977170  16.424964  5.871703  0.884967  0.310039  0.063534  1.376920   
3  20.232183   5.895434  2.250579  0.867797  0.492825  0.080602  0.657513   
4  59.922185  11.187527  5.891701  0.824883  0.394896 -0.000628  0.979801   

   class  
0      1  
1      0  
2      1  
3      0  
4      1  

Training Features Shape: (1600, 7)
Training Labels Shape: (1600,)
Testing Features Shape: (400, 7)
Testing Labels Shape: (400,)

Class distribution in training set:
 class
0    800
1    800
Name: count, dtype: int64

Class distribution in testing set:
 class
0    200
1    200
Name: count, dtype: int64


In [19]:
print(X.shape, X_train.shape, X_test.shape)

(2000, 7) (1600, 7) (400, 7)


Model Training

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, Y_train)

# Make predictions
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(Y_train, train_preds)
test_accuracy = accuracy_score(Y_test, test_preds)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Classification report on testing data
print("\nClassification Report:")
print(classification_report(Y_test, test_preds))


Training Accuracy: 1.0
Testing Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [21]:
# Select a few samples from the test set for prediction
samples_to_predict = X_test[:5]  # Select the first 5 samples for prediction

# Make predictions
predictions = model.predict(samples_to_predict)

# Display the predicted class for each sample
for i, pred in enumerate(predictions):
    print("Sample", i+1, "Predicted Class:", pred)


Sample 1 Predicted Class: 0
Sample 2 Predicted Class: 0
Sample 3 Predicted Class: 0
Sample 4 Predicted Class: 0
Sample 5 Predicted Class: 1


In [22]:
import pickle

# Assuming 'model' is your trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [23]:
from nbconvert import HTMLExporter

# Load the notebook
notebook_filename = "Credit_Card_Fraud_Detection.ipynb"
with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

# Create an HTML exporter
html_exporter = HTMLExporter()

# Convert the notebook to HTML
html_data, _ = html_exporter.from_notebook_node(nb)

# Write the HTML data to a file
html_filename = "Credit_Card_Fraud_Detection.html"
with open(html_filename, "w") as f:
    f.write(html_data)
    
print("HTML created successfully:", html_filename)


NameError: name 'nbformat' is not defined