In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
def OutlierDetector(data):
    '''Detect outliers in a pandas Series using the 1.5 * IQR rule.

    A value counts as an outlier when it lies strictly below
    Q1 - 1.5 * IQR or strictly above Q3 + 1.5 * IQR, where Q1 and Q3 are
    the 25th and 75th percentiles of ``data`` and IQR = Q3 - Q1.

    Args:
        data (pd.Series): The input data series to check for outliers.
    Returns:
        pd.Series: The outlying values, keeping their original index
            labels. Empty when no value falls outside the fences.
    Raises:
        ValueError: If the input data is not a pandas Series.
    '''
    # Enforce the documented contract up front; without this check a list or
    # array fails later with an unrelated AttributeError on ``.quantile``.
    if not isinstance(data, pd.Series):
        raise ValueError(
            f"data must be a pandas Series, got {type(data).__name__}"
        )

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Boolean-mask selection keeps the original index labels of the outliers.
    return data[(data < lower) | (data > upper)]


In [3]:
def DefaultFlagGenerator(df):
    """Add a 0/1 ``default_flag`` column to ``df`` and return ``df``.

    A row is flagged (1) when at least 3 of these 5 conditions hold:
    ``payment_delinquency_count >= 3``, ``over_indebtedness_flag == 1``,
    ``financial_stress_score >= 9``, ``bnpl_debt_ratio >= 1.8`` and
    ``credit_limit_utilisation >= 95``.

    Args:
        df (pd.DataFrame): Frame containing the five columns named above.
    Returns:
        pd.DataFrame: The same object that was passed in; the new column is
            written onto it in place.
    """
    risk_signals = [
        df['payment_delinquency_count'] >= 3,
        df['over_indebtedness_flag'] == 1,
        df['financial_stress_score'] >= 9,
        df['bnpl_debt_ratio'] >= 1.8,
        df['credit_limit_utilisation'] >= 95,
    ]
    # Count, per row, how many of the five signals are raised.
    signals_met = sum(signal.astype(int) for signal in risk_signals)
    df['default_flag'] = (signals_met >= 3).astype(int)
    return df

In [4]:
def Scaler(data, continuous_cols, binary_cols):
    """Standardise the continuous columns and pass the binary ones through.

    Args:
        data (pd.DataFrame): Source frame holding every listed column.
        continuous_cols (list): Columns to fit and transform with
            ``StandardScaler``.
        binary_cols (list): Columns copied into the result unchanged.
    Returns:
        pd.DataFrame: ``binary_cols`` followed by the scaled
            ``continuous_cols``, sharing ``data``'s index. Columns in
            neither list are not part of the result.
    """
    standardised = pd.DataFrame(
        StandardScaler().fit_transform(data[continuous_cols]),
        index=data.index,  # keep row labels aligned with the binary columns
        columns=continuous_cols,
    )
    untouched = data[binary_cols]
    return pd.concat([untouched, standardised], axis=1)

In [None]:
# Load the raw BNPL dataset from 'bnpl.csv' (path is relative to the working
# directory) and print pandas' summary of the resulting frame.
data = pd.read_csv('bnpl.csv')
data.info()


In [None]:
# Report the IQR outliers of every float64/int64 column in the raw data.
raw_numeric = data.select_dtypes(include=['float64', 'int64'])
for column, values in raw_numeric.items():
    outliers = OutlierDetector(values)
    print(f"Outliers in {column}:\n", outliers)


In [7]:
# Add the derived default_flag column, then build data_clean as a copy of the
# frame without the CustomerID column; the later cells work from data_clean.
data = DefaultFlagGenerator(data)
data_clean = data.drop(columns=['CustomerID'])

In [None]:
# Classify columns by their number of distinct values: exactly two -> binary,
# more than two -> continuous. Columns with fewer than two land in neither list.
distinct_counts = data_clean.nunique()
binary_cols = distinct_counts[distinct_counts == 2].index.tolist()
continuous_cols = distinct_counts[distinct_counts > 2].index.tolist()
print("Binary columns:", binary_cols)
print("Continuous columns:", continuous_cols)

In [None]:
# Standardise the continuous columns (binary columns are carried over as-is)
# and write the result to disk; index=False leaves the row index out of the CSV.
data_scaled = Scaler(data_clean,continuous_cols, binary_cols)
data_scaled.to_csv('bnpl_scaled.csv', index=False)
print("Data processing complete. Scaled data saved to 'bnpl_scaled.csv'.")

In [None]:
# Plot and save one histogram per scaled continuous column.
histogram_dir = Path('histograms')
# savefig does not create missing folders, so make sure the target exists;
# otherwise a fresh checkout fails here with FileNotFoundError.
histogram_dir.mkdir(parents=True, exist_ok=True)

for column in continuous_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data_scaled[column], bins=30, edgecolor='black')
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.75)
    fig.savefig(histogram_dir / f'{column}_histogram.png')


In [None]:
# Re-run the IQR outlier check on each float64/int64 column of the scaled frame.
scaled_numeric = data_scaled.select_dtypes(include=['float64', 'int64'])
for column, values in scaled_numeric.items():
    outliers = OutlierDetector(values)
    if outliers.empty:
        print(f"No outliers detected in {column}.")
    else:
        print(f"Outliers in {column}:\n", outliers)

In [None]:
# Draw one boxplot per scaled continuous column.
for column in continuous_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(data_scaled[column])
    ax.set_title(f'Boxplot of {column}')
    ax.set_ylabel(column)
    ax.grid(axis='y', alpha=0.75)
    # Saving into a 'boxplots' folder is currently disabled:
    # fig.savefig(f'boxplots/{column}_boxplot.png')

In [None]:
# Print and plot the class proportions of every binary column.
balance_dir = Path('balances')
# savefig does not create missing folders, so make sure the target exists;
# otherwise a fresh checkout fails here with FileNotFoundError.
balance_dir.mkdir(parents=True, exist_ok=True)

for column in binary_cols:
    balance = data_scaled[column].value_counts(normalize=True)
    print(f"Balance of {column}:\n", balance)
    fig, ax = plt.subplots(figsize=(8, 5))
    balance.plot(kind='bar', ax=ax)
    ax.set_title(f'Balance of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Proportion')
    ax.grid(axis='y', alpha=0.75)
    fig.savefig(balance_dir / f'{column}_balance.png')

In [None]:
# Heat-map of the pairwise correlations between all columns of data_scaled.
correlation_matrix = data_scaled.corr()
axis_labels = data_scaled.columns
tick_positions = np.arange(len(axis_labels))

fig, ax = plt.subplots(figsize=(12, 10))
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Correlation Matrix')
# Label both axes with the column names instead of integer positions.
ax.set_xticks(tick_positions)
ax.set_xticklabels(axis_labels, rotation=45, ha='right')
ax.set_yticks(tick_positions)
ax.set_yticklabels(axis_labels)
fig.savefig('correlation_matrix.png')

In [None]:
# Scatter of scaled bnpl_usage_frequency against the over_indebtedness_flag.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    data_scaled['bnpl_usage_frequency'],
    data_scaled['over_indebtedness_flag'],
    alpha=0.5,
)
ax.set_title('BNPL Usage Frequency vs Over Indebtedness Flag')
ax.set_xlabel('BNPL Usage Frequency')
ax.set_ylabel('Over Indebtedness Flag')
ax.grid(alpha=0.75)
fig.savefig('BNPL_usage_frequency_vs_over_indebtedness_flag.png')

In [None]:
# Collect the row labels of IQR outliers. The check is run separately on the
# default_flag == 0 and default_flag == 1 subsets of the scaled data.
outlier_indices = []

data_default_0 = data_scaled[data_scaled['default_flag'] == 0]
data_default_1 = data_scaled[data_scaled['default_flag'] == 1]

for column in continuous_cols:
    outliers_0 = OutlierDetector(data_default_0[column])
    outliers_1 = OutlierDetector(data_default_1[column])

    # Report each class in turn and remember which rows were flagged.
    for flag_value, flagged in ((0, outliers_0), (1, outliers_1)):
        outlier_indices.extend(flagged.index.tolist())
        if not flagged.empty:
            print(f"Outliers in {column} for default_flag = {flag_value}:\n", flagged)
        else:
            print(f"No outliers detected in {column} for default_flag = {flag_value}.")

# A row that is an outlier in several columns was appended once per column;
# keep each label once, in first-seen order.
outlier_indices = list(dict.fromkeys(outlier_indices))

# Print the list of outlier indices
print("Outlier indices:", outlier_indices)

In [18]:
# Build data_scaled_cleaned by removing every row whose label is listed in
# outlier_indices; drop() returns a new frame, so data_scaled is left unchanged.
data_scaled_cleaned = data_scaled.drop(index=outlier_indices)

In [None]:
# For each continuous column, draw side-by-side boxplots for default_flag 0 and 1.
boxplot_dir = Path('boxplots')
# savefig does not create missing folders, so make sure the target exists;
# otherwise a fresh checkout fails here with FileNotFoundError.
boxplot_dir.mkdir(parents=True, exist_ok=True)

for column in continuous_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Pass ax explicitly: with `by=` and no ax, pandas opens a figure of its
    # own, which leaves an empty 10x6 figure behind and ignores the size.
    data_scaled_cleaned.boxplot(column=column, by='default_flag', ax=ax)
    ax.set_title(f'Boxplot of {column} by Default Flag')
    # Remove the "Boxplot grouped by ..." super-title pandas adds for `by=`.
    fig.suptitle('')
    ax.set_xlabel('Default Flag')
    ax.set_ylabel(column)
    ax.grid(axis='y', alpha=0.75)
    fig.savefig(boxplot_dir / f'{column}_boxplot_by_default_flag.png')

In [None]:
# Correlation heat-map recomputed on the data with outlier rows removed.
correlation_matrix_cleaned = data_scaled_cleaned.corr()
cleaned_labels = data_scaled_cleaned.columns
cleaned_ticks = np.arange(len(cleaned_labels))

fig, ax = plt.subplots(figsize=(12, 10))
heatmap = ax.imshow(correlation_matrix_cleaned, cmap='coolwarm', interpolation='none')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Correlation Matrix (Cleaned Data)')
# Label both axes with the column names instead of integer positions.
ax.set_xticks(cleaned_ticks)
ax.set_xticklabels(cleaned_labels, rotation=45, ha='right')
ax.set_yticks(cleaned_ticks)
ax.set_yticklabels(cleaned_labels)
fig.savefig('correlation_matrix_cleaned.png')

# Numeric correlation between 'bnpl_usage_frequency' and 'over_indebtedness_flag'.
corr_value = correlation_matrix_cleaned.at['bnpl_usage_frequency', 'over_indebtedness_flag']
print(f"Correlation between 'bnpl_usage_frequency' and 'over_indebtedness_flag': {corr_value:.4f}")


In [None]:
# Remove over_indebtedness_flag (one of the inputs DefaultFlagGenerator uses to
# derive default_flag) from the cleaned frame and from the binary column list,
# then save the result.
# errors='ignore' keeps this cell re-runnable: it reassigns its own input, so a
# second run would otherwise raise KeyError on the already-dropped column.
data_scaled_cleaned = data_scaled_cleaned.drop(
    columns=['over_indebtedness_flag'], errors='ignore'
)
binary_cols = [col for col in binary_cols if col != 'over_indebtedness_flag']
data_scaled_cleaned.to_csv('bnpl_scaled_cleaned.csv', index=False)
data_scaled_cleaned.info()

In [None]:
# Print and plot the class proportions of the binary columns after outlier removal.
for column in binary_cols:
    balance = data_scaled_cleaned[column].value_counts(normalize=True)
    print(f"Balance of {column} after cleaning:\n", balance)
    fig, ax = plt.subplots(figsize=(8, 5))
    balance.plot(kind='bar', ax=ax)
    ax.set_title(f'Balance of {column} after cleaning')
    ax.set_xlabel(column)
    ax.set_ylabel('Proportion')
    ax.grid(axis='y', alpha=0.75)
    plt.show()

In [27]:
# Display the final cleaned, scaled frame as the cell's output.
data_scaled_cleaned

Unnamed: 0,failed_traditional_credit,external_repayment_loans,credit_card_interest_incidence,default_flag,bnpl_usage_frequency,financial_stress_score,credit_limit_utilisation,payment_delinquency_count,impulsive_buying_score,financial_literacy_assessment,debt_accumulation_metric,return_dispute_incidents,demographic_risk_factor,bnpl_debt_ratio
0,0,0,0,0,1.546817,0.884585,-0.299154,-0.315186,1.193694,0.817999,-0.656225,0.391565,-0.044657,-1.725138
1,1,1,0,1,0.863694,1.594809,-1.632452,1.462180,0.160491,-1.536439,-0.249996,1.267548,-0.742423,-1.234515
2,1,0,0,0,1.319109,-0.535864,-0.642002,1.462180,-0.872712,0.145302,1.106396,1.267548,0.653109,-0.305834
3,0,0,0,0,-0.047135,-1.246089,-0.642002,0.869724,0.160491,-1.200091,1.436887,1.267548,1.350875,-1.532393
4,0,1,0,0,-1.185673,-1.246089,-0.946756,-0.315186,1.538095,-1.200091,0.397216,-0.484419,-1.440189,0.009567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0.635987,0.529472,0.653202,-0.315186,0.504892,0.145302,0.590003,0.391565,-0.044657,1.218604
996,1,1,0,0,1.091402,-0.180752,-0.946756,-1.500097,-0.183910,0.481651,-0.525406,-1.360402,-0.742423,-0.778935
997,0,0,0,0,1.546817,-0.180752,-1.099132,-0.907642,-1.217113,1.490696,0.534921,0.391565,1.350875,-1.129381
998,1,1,1,0,0.863694,-1.601201,-1.327698,-0.907642,-1.217113,-1.536439,-1.627044,-1.360402,-0.044657,-0.446012
