In [1]:
# Bank Transactions 
# This dataset simulates transactional activity on a digital banking platform. It includes detailed information for
# each transaction, such as TransactionAmount, Location, CustomerAge, and LoginAttempts.

# File : bank_transactions_data_2.csv
# shape : 2512 rows x 16 columns

# The goal of this project is to build a machine learning model that detects fraudulent transactions by analyzing several
# underlying factors such as transaction details, demographics, and user behaviour.

In [2]:
# Data-handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Install a blanket "ignore" filter: every warning raised from this point on is suppressed.
warnings.filterwarnings('ignore')
import logging
import os
# Named logger shared by the EDA helper functions defined in the cells below.
log = logging.getLogger('Exploratory_Data_Analysis')
# Root logging configuration: INFO and above, rendered as "HH:MM:SS - LEVEL : message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s', datefmt='%H:%M:%S')

In [3]:
def load_data(filename: str = '../data/bank_transactions_data_2.csv'):
    """Read the transactions CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read. Defaults to the project's data file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when the path does not exist (or the
        file disappears before it can be read). Failures are logged, not
        raised.
    """
    # Return early on a missing path so the caller receives None instead of
    # hitting an unbound local variable further down.
    if not os.path.exists(filename):
        log.error('File Not Found')
        return None

    try:
        df = pd.read_csv(filename)
    except FileNotFoundError as e:
        # The file can still vanish between the existence check and the read.
        # The %s placeholder is required: logging formats lazily and reports
        # an internal error when arguments are passed without one.
        log.exception('File Not Found: %s', e)
        return None

    log.info('Data has successfully been loaded')
    return df

In [4]:
# ---Descriptive Summary of the dataset----
def descriptive_overview(df: pd.DataFrame):
    """Log the dataset's dimensions and show its descriptive statistics.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Dataset to summarise. ``None`` is tolerated and only logs a warning.

    Returns
    -------
    pandas.DataFrame or None
        The transposed ``describe(include='all')`` table (one row per
        column of ``df``), or ``None`` when ``df`` is ``None``.
    """
    if df is None:
        log.warning('DataFrame is empty!')
        return None

    log.info(f'Number of observations {df.shape[0]}')
    log.info(f'Number of features : {df.shape[1]}\n')

    # Build the summary once so the table that is displayed is exactly the
    # object handed back to the caller (and describe() is not run twice).
    summary = df.describe(include='all').T
    display(summary)
    return summary

In [5]:
# ---Analysis of the numerical columns---
def numeric_cols_summary(df: pd.DataFrame):
    """Log the minimum and maximum of every numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose numeric columns are inspected.

    Returns
    -------
    list
        Names of the columns selected by ``select_dtypes(include=[np.number])``,
        in their original order.
    """
    names = df.select_dtypes(include=[np.number]).columns.tolist()
    total, preview = len(names), names[:3]
    log.info(f'| Number of numeric columns : {total} | Examples : {preview}\n')

    # One log line per column, numbered from 1.
    for position, name in zip(range(1, total + 1), names):
        lowest = df[name].min()
        highest = df[name].max()
        log.info(f'{position} {name:<24} | Min : {lowest:<15} | Max : {highest:<10}')

    return names

In [6]:
# ----Analysis of the categorical columns-----
def category_cols_summary(df: pd.DataFrame):
    """Log the cardinality and a few sample values of every non-numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose non-numeric columns are inspected.

    Returns
    -------
    list
        Names of the columns selected by ``select_dtypes(exclude=[np.number])``,
        in their original order.
    """
    labels = df.select_dtypes(exclude=[np.number]).columns.tolist()
    total, preview = len(labels), labels[:3]
    log.info(f'\n| Number of categorical columns : {total} | Examples : {preview}\n')

    # One log line per column, numbered from 1.
    for rank, label in zip(range(1, total + 1), labels):
        samples = df[label].unique()[:3]
        distinct = df[label].nunique()
        log.info(f'{rank:<2}. {label:<25} |Unique : {distinct:<7} | Examples : {samples}')

    return labels

In [7]:
# --------check for duplicates------
def duplicate_data(df: pd.DataFrame):
    """Report fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to check.

    Returns
    -------
    pandas.DataFrame or None
        The rows flagged by ``df.duplicated()`` (also displayed), or
        ``None`` when no row is flagged.
    """
    is_repeat = df.duplicated()
    repeated = df[is_repeat]
    count = len(repeated)
    log.info(f'\nNumber of duplicates : {count}')

    if count != 0:
        display(repeated)
        return repeated

    log.info('No duplicates found in the data')
    return None

In [8]:
def run_eda(filename: str = '../data/bank_transactions_data_2.csv'):
    """Run the exploratory pipeline on one CSV file.

    Loads the file, then runs the descriptive overview, the numeric and
    categorical column summaries, and the duplicate check.

    Parameters
    ----------
    filename : str
        Path of the CSV file to analyse. Defaults to the project's data file.

    Returns
    -------
    dict
        Keys ``'data'``, ``'overview'``, ``'num_cols'``, ``'cat_cols'`` and
        ``'duplicate'`` holding the result of each step. When loading
        yields ``None`` the same keys are returned with ``None`` for
        ``'data'``, ``'overview'`` and ``'duplicate'`` and empty lists for
        the two column lists.
    """
    # Forward the caller's path; calling load_data() bare would always read
    # the default file regardless of `filename`.
    df = load_data(filename)

    if df is None:
        # The column summaries and duplicate check need a real DataFrame,
        # so stop here and hand back an empty result with the usual keys.
        log.warning('No data loaded; skipping exploratory analysis')
        return {
            'data' : None,
            'overview' : None,
            'num_cols' : [],
            'cat_cols' : [],
            'duplicate' : None
        }

    overview = descriptive_overview(df)
    num_cols = numeric_cols_summary(df)
    cat_cols = category_cols_summary(df)
    duplicates = duplicate_data(df)
    return {
        'data' : df,
        'overview' : overview,
        'num_cols' : num_cols,
        'cat_cols' : cat_cols,
        'duplicate' : duplicates
    }
# Run the whole EDA pipeline when this code executes as the main program
# (the captured log output below the cell shows it ran in the notebook).
if __name__ == '__main__':
    results = run_eda()
    # Keep whatever load_data returned under the module-level name `df`.
    df = results['data']

14:23:05 - INFO : Data has successfully been loaded
14:23:05 - INFO : Number of observations 2512
14:23:05 - INFO : Number of features : 16



Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
TransactionID,2512.0,2512.0,TX000001,1.0,,,,,,,
AccountID,2512.0,495.0,AC00460,12.0,,,,,,,
TransactionAmount,2512.0,,,,297.593778,291.946243,0.26,81.885,211.14,414.5275,1919.11
TransactionDate,2512.0,2512.0,2023-04-11 16:29:14,1.0,,,,,,,
TransactionType,2512.0,2.0,Debit,1944.0,,,,,,,
Location,2512.0,43.0,Fort Worth,70.0,,,,,,,
DeviceID,2512.0,681.0,D000548,9.0,,,,,,,
IP Address,2512.0,592.0,200.136.146.93,13.0,,,,,,,
MerchantID,2512.0,100.0,M026,45.0,,,,,,,
Channel,2512.0,3.0,Branch,868.0,,,,,,,


14:23:05 - INFO : | Number of numeric columns : 5 | Examples : ['TransactionAmount', 'CustomerAge', 'TransactionDuration']

14:23:05 - INFO : 1 TransactionAmount        | Min : 0.26            | Max : 1919.11   
14:23:05 - INFO : 2 CustomerAge              | Min : 18              | Max : 80        
14:23:05 - INFO : 3 TransactionDuration      | Min : 10              | Max : 300       
14:23:05 - INFO : 4 LoginAttempts            | Min : 1               | Max : 5         
14:23:05 - INFO : 5 AccountBalance           | Min : 101.25          | Max : 14977.99  
14:23:06 - INFO : 
| Number of categorical columns : 11 | Examples : ['TransactionID', 'AccountID', 'TransactionDate']

14:23:06 - INFO : 1 . TransactionID             |Unique : 2512    | Examples : ['TX000001' 'TX000002' 'TX000003']
14:23:06 - INFO : 2 . AccountID                 |Unique : 495     | Examples : ['AC00128' 'AC00455' 'AC00019']
14:23:06 - INFO : 3 . TransactionDate           |Unique : 2512    | Examples : ['2023-04-11