In [1]:
### dependencies ###
import fireducks.pandas as pd
import numpy as np
from math import ceil
import plotly.graph_objects as go

In [2]:
fraud_df = pd.read_csv('data/Base.csv')

In [3]:
### Checking for missing values ###
def missing_values(dataset):
    missing_values = False
    for feature in dataset.columns:
        s = dataset[feature]
        if s.dtype == 'object': 
            # string
            n_missing_values = s.isnull().sum() + (s == "").sum()
        elif feature in ('prev_address_months_count','current_address_months_count'):
            # special cases
            n_missing_values = s.isnull().sum() + (s == -1).sum()
        else:
            # numerical types
            n_missing_values = s.isnull().sum()
        if n_missing_values != 0:
            missing_values = True
        print(f"Feature {feature} lacks {n_missing_values} values")
    if missing_values:
        print("Values are missing, please check details !")
    else:
        print("No missing values, great !")

missing_values(fraud_df) # more than 71% of missing values for 'prev_address_months_count' !

Feature fraud_bool lacks 0 values
Feature income lacks 0 values
Feature name_email_similarity lacks 0 values
Feature prev_address_months_count lacks 712920 values
Feature current_address_months_count lacks 4254 values
Feature customer_age lacks 0 values
Feature days_since_request lacks 0 values
Feature intended_balcon_amount lacks 0 values
Feature payment_type lacks 0 values
Feature zip_count_4w lacks 0 values
Feature velocity_6h lacks 0 values
Feature velocity_24h lacks 0 values
Feature velocity_4w lacks 0 values
Feature bank_branch_count_8w lacks 0 values
Feature date_of_birth_distinct_emails_4w lacks 0 values
Feature employment_status lacks 0 values
Feature credit_risk_score lacks 0 values
Feature email_is_free lacks 0 values
Feature housing_status lacks 0 values
Feature phone_home_valid lacks 0 values
Feature phone_mobile_valid lacks 0 values
Feature bank_months_count lacks 0 values
Feature has_other_cards lacks 0 values
Feature proposed_credit_limit lacks 0 values
Feature foreign_

In [4]:
### Handling Missing Values

fraud_df_mv = fraud_df.drop("prev_address_months_count",axis=1)
m = ceil(np.mean(fraud_df_mv['current_address_months_count']))
fraud_df_mv.loc[fraud_df_mv['current_address_months_count']==-1,'current_address_months_count'] = m
missing_values(fraud_df_mv)

Feature fraud_bool lacks 0 values
Feature income lacks 0 values
Feature name_email_similarity lacks 0 values
Feature current_address_months_count lacks 0 values
Feature customer_age lacks 0 values
Feature days_since_request lacks 0 values
Feature intended_balcon_amount lacks 0 values
Feature payment_type lacks 0 values
Feature zip_count_4w lacks 0 values
Feature velocity_6h lacks 0 values
Feature velocity_24h lacks 0 values
Feature velocity_4w lacks 0 values
Feature bank_branch_count_8w lacks 0 values
Feature date_of_birth_distinct_emails_4w lacks 0 values
Feature employment_status lacks 0 values
Feature credit_risk_score lacks 0 values
Feature email_is_free lacks 0 values
Feature housing_status lacks 0 values
Feature phone_home_valid lacks 0 values
Feature phone_mobile_valid lacks 0 values
Feature bank_months_count lacks 0 values
Feature has_other_cards lacks 0 values
Feature proposed_credit_limit lacks 0 values
Feature foreign_request lacks 0 values
Feature source lacks 0 values
Feat

In [5]:
#fraud_df.dtypes

In [6]:
fraud_df = pd.DataFrame(fraud_df)

In [7]:
### Class Imbalance
nf,f = fraud_df_mv['fraud_bool'].value_counts()
n = len(fraud_df)
(n - f)/n # less than 2% of "fraud" class

0.988971

In [8]:
### Balanced dataset
fraud_df_mv_fraud = fraud_df_mv[fraud_df_mv['fraud_bool'] == 1]
n_over_sample = len(fraud_df_mv[fraud_df_mv['fraud_bool'] == 0]) - len(fraud_df_mv_fraud)
oversampled_fraud = fraud_df_mv_fraud.sample(n=n_over_sample, replace=True)
fraud_df_mv_b = pd.concat([fraud_df_mv,oversampled_fraud])

fraud_df_mv_b['fraud_bool'].value_counts() # balanced dataset

fraud_bool
0    988971
1    988971
Name: count, dtype: int64

In [9]:
### Correlation
def display_corr_matrix(dataset):
    corr = dataset.select_dtypes(exclude=['object']).corr()
    mask = np.tril(np.ones(corr.shape)).astype(bool)
    corr = corr.where(~mask)
    
    trace = go.Heatmap(z=corr.values,
                       x=corr.index.values,
                       y=corr.columns.values)

    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False)
    )
    fig.show()

display_corr_matrix(fraud_df_mv_b)

In [10]:
def display_corr_distribution(dataset, title="Correlation Value Distribution"):

    corr = dataset.select_dtypes(exclude=['object']).corr()
    mask = np.tril(np.ones(corr.shape)).astype(bool)

    corr_values = corr.where(~mask).stack() # flatten trilled matrix
    corr_value_counts = corr_values.value_counts().sort_index()

    trace = go.Bar(
        x=corr_value_counts.index,
        y=corr_value_counts.values
    )

    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(
        title=title,
        xaxis_title="Correlation Value",
        yaxis_title="Frequency",
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(
            showgrid=False,
            tickvals=np.round(corr_value_counts.index, 2),  # Set tick values with improved precision
            tickformat=".2f"  # Format ticks to 2 decimal places
        ),
        yaxis=dict(showgrid=False)
    )
    fig.show()

display_corr_distribution(fraud_df_mv_b)

In [11]:
def remove_high_correlations(dataframe, threshold, target):
    corr = dataframe.select_dtypes(exclude=['object']).corr()
    highly_correlated_pairs = []
    removed_columns = []
    
    # Obtenir l'index de la colonne cible
    target_index = dataframe.columns.get_loc(target)
    
    for i in range(len(corr.columns)):
        for j in range(i):
            if abs(corr.iloc[i, j]) > threshold:
                colname_i = corr.columns[i]
                colname_j = corr.columns[j]
                highly_correlated_pairs.append((colname_i, colname_j))
    
    for (col1, col2) in highly_correlated_pairs:
        if col1 in dataframe.columns and col2 in dataframe.columns:
            corr_col1_target = abs(corr.iloc[corr.columns.get_loc(col1), target_index])
            corr_col2_target = abs(corr.iloc[corr.columns.get_loc(col2), target_index])
            
            if corr_col1_target > corr_col2_target:
                removed_columns.append(col1)
                dataframe = dataframe.drop(columns=[col1])
            else:
                removed_columns.append(col2)
                dataframe = dataframe.drop(columns=[col2])
    for col in removed_columns : 
        print(f"Column {col} removed !")
    return dataframe

In [12]:
### Removing highly correlated features
corr_threshold = 0.7
fraud_df_mv_b_nc = remove_high_correlations(fraud_df_mv_b, corr_threshold, 'fraud_bool')

Column month removed !


In [13]:
fraud_df_mv_b_nc.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'current_address_months_count', 'customer_age', 'days_since_request',
       'intended_balcon_amount', 'payment_type', 'zip_count_4w', 'velocity_6h',
       'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count'],
      dtype='object')

In [14]:
display_corr_distribution(fraud_df_mv_b_nc)

In [15]:
### (Optional) Save as a CSV file
fraud_df_mv_b_nc.to_csv('data/fraud_df_mv_b_nc.csv',index=False)