In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
import logging
from math import sqrt, ceil
from PIL import Image

import random
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from create_dataset import *
from MCAgent import MCAgent

In [3]:
def array_to_label(array):
    """Build a 13x13 grid of "Xi/Xj" ratio labels for the 18 features X1..X18.

    Pairs (i, j) with i < j are visited in row-major order (X1/X2, X1/X3, ...,
    X17/X18) and written consecutively into a flat buffer of 169 slots, which
    is then reshaped to (13, 13). There are only 153 such pairs, so the
    trailing slots keep the filler value b" ".

    Args:
        array: Indexable mask; a pair (i, j) is labelled only when
            ``array[j] != 0`` (0-based ``j``).

    Returns:
        numpy.ndarray: (13, 13) array of byte strings (dtype ``S7``).
    """
    # The longest label, "X17/X18", is 7 bytes. A 6-byte dtype silently
    # truncated every pair whose two indices are both >= 10 (e.g. "X10/X11"
    # became b"X10/X1"), producing wrong and duplicated labels.
    label = np.full(169, " ", dtype="S7")
    p = 0
    for i, j in combinations(range(18), 2):
        # NOTE(review): the mask is looked up by the second feature index j,
        # not by the running pair position p -- confirm this is intended.
        if array[j] != 0:
            label[p] = "X{}/X{}".format(i + 1, j + 1)
            p += 1
    return label.reshape(13, 13)

# Sanity check: flatten the label grid produced for an all-ones mask so the
# generated "Xi/Xj" names can be inspected in the cell output.
array_to_label(np.ones(169)).reshape(-1)

array([b'X1/X2', b'X1/X3', b'X1/X4', b'X1/X5', b'X1/X6', b'X1/X7',
       b'X1/X8', b'X1/X9', b'X1/X10', b'X1/X11', b'X1/X12', b'X1/X13',
       b'X1/X14', b'X1/X15', b'X1/X16', b'X1/X17', b'X1/X18', b'X2/X3',
       b'X2/X4', b'X2/X5', b'X2/X6', b'X2/X7', b'X2/X8', b'X2/X9',
       b'X2/X10', b'X2/X11', b'X2/X12', b'X2/X13', b'X2/X14', b'X2/X15',
       b'X2/X16', b'X2/X17', b'X2/X18', b'X3/X4', b'X3/X5', b'X3/X6',
       b'X3/X7', b'X3/X8', b'X3/X9', b'X3/X10', b'X3/X11', b'X3/X12',
       b'X3/X13', b'X3/X14', b'X3/X15', b'X3/X16', b'X3/X17', b'X3/X18',
       b'X4/X5', b'X4/X6', b'X4/X7', b'X4/X8', b'X4/X9', b'X4/X10',
       b'X4/X11', b'X4/X12', b'X4/X13', b'X4/X14', b'X4/X15', b'X4/X16',
       b'X4/X17', b'X4/X18', b'X5/X6', b'X5/X7', b'X5/X8', b'X5/X9',
       b'X5/X10', b'X5/X11', b'X5/X12', b'X5/X13', b'X5/X14', b'X5/X15',
       b'X5/X16', b'X5/X17', b'X5/X18', b'X6/X7', b'X6/X8', b'X6/X9',
       b'X6/X10', b'X6/X11', b'X6/X12', b'X6/X13', b'X6/X14', b'X6/X15',
       b'X6

In [None]:
def df_processing(df):
    """Keep each company's earliest-year row when its status label is 1.

    Args:
        df (pandas.DataFrame): Frame with at least the columns
            'company_name', 'year' and 'status_label'.

    Returns:
        pandas.DataFrame: The selected rows with 'company_name',
        'status_label' and 'year' dropped.
    """
    # Earliest recorded year per company (Series indexed by company_name).
    first_year = df.groupby('company_name')['year'].min()

    # The inner join keeps only rows whose (company_name, year) is that minimum.
    first_rows = pd.merge(df, first_year, how='inner', on=['company_name', 'year'])

    # Retain the rows labelled 1 ("alive"), then drop the bookkeeping columns.
    is_alive = first_rows['status_label'] == 1
    return first_rows[is_alive].drop(columns=['company_name', 'status_label', 'year'])

In [None]:
# Path to the raw bankruptcy CSV (relative to the notebook's working directory).
DATAPATH = r'american_bankruptcy.csv'

# Show INFO-level log records emitted during loading/processing.
logging.basicConfig(level=logging.INFO)
# load_data comes from the star import of create_dataset.
df_raw = load_data(DATAPATH)
# One row per company: its earliest year, kept only when status_label == 1.
df = df_processing(df_raw)
# df.to_csv("Processed_df.csv")

In [None]:
def ratios_dataframe(df):
    """Compute standardised pairwise ratios Xi/Xj of the features X1..X18.

    For every pair i < j the ratio ``Xi / Xj`` is z-scored over the rows
    (subtract the column mean, divide by the column standard deviation) and
    shifted by +128. Zeros in the feature columns are replaced by 1e-6 before
    dividing so that no ratio has a zero denominator.

    The input frame is NOT modified. The zero replacement is applied to a
    copy of the X1..X18 columns only, so other columns (for example a 0/1
    ``status_label``) keep their values for later cells.

    Args:
        df (pandas.DataFrame): Frame containing the columns 'X1' .. 'X18'.

    Returns:
        pandas.DataFrame: 153 columns named "Xi/Xj", in the order
        X1/X2, X1/X3, ..., X17/X18, indexed like ``df``.
    """
    feature_names = ["X{}".format(k) for k in range(1, 19)]
    # DataFrame.replace returns a new object, leaving the caller's frame intact.
    features = df[feature_names].replace(0, 1e-6)

    # Collect every column first and build the frame once; inserting 153
    # columns one by one fragments the DataFrame.
    ratio_columns = {}
    for i, j in combinations(range(1, 19), 2):
        ratio = features["X{}".format(i)] / features["X{}".format(j)]
        ratio_columns["X{}/X{}".format(i, j)] = (ratio - ratio.mean()) / ratio.std() + 128
    return pd.DataFrame(ratio_columns)

# df = df.drop(columns=['status_label'])
# Ratios are computed from the full raw frame (df_raw), not the filtered `df`.
# NOTE(review): df_raw itself is passed in and later cells still read
# df_raw['status_label'] -- confirm ratios_dataframe leaves its argument
# unmodified.
ratios_df = ratios_dataframe(df_raw)
print(ratios_df.head())
# print(ratios_df['X1/X2'].describe())

In [None]:
# Label grid for an all-ones mask; reshaped to (13, 13) by array_to_label.
labels = array_to_label(np.ones(169))
print(labels.shape)

# MCAgent is imported from the project's MCAgent module. Its simulation
# returns two values; `pixels` is later passed to rearrange_image as the
# pixel ordering.
mc_agent = MCAgent(ratios_df, labels)
new_labels, pixels = mc_agent.monte_carlo_simulation()


In [None]:
def enlarge_image(image_array, new_size=(64, 64)):
    """
    Upscale a 13x13 image with nearest-neighbour resampling.

    Args:
    image_array (numpy.ndarray): The 13x13 array holding the image.
    new_size (tuple): Target size handed to PIL's ``resize``, default (64, 64).

    Returns:
    numpy array: the resized image converted back to a numpy array

    Raises:
    ValueError: if ``image_array`` is not of shape (13, 13).
    """
    expected_shape = (13, 13)
    if not image_array.shape == expected_shape:
        raise ValueError("Input image array and dataframe must be 13x13 in size.")

    # Round-trip through PIL: array -> Image -> nearest-neighbour resize -> array.
    resized = Image.fromarray(image_array).resize(new_size, Image.NEAREST)
    return np.array(resized)

In [None]:
def rearrange_image(image, pixels):
    """Reorder the pixels of a 13x13 image according to ``pixels``.

    The image is flattened, and output position ``i`` receives the input
    pixel at flat index ``pixels[i]``. Positions beyond ``len(pixels)`` stay 0.

    The result keeps the dtype of ``image``. Previously it was always
    float64, so a rearranged integer image no longer had the same type as
    the un-rearranged one, and libraries that treat integer and floating
    point pixel data differently could interpret the two inconsistently.

    Args:
        image (numpy.ndarray): Array with 169 elements (e.g. shape (13, 13)).
        pixels: Sequence of integer flat indices into the flattened image.

    Returns:
        numpy.ndarray: (13, 13) array with the same dtype as ``image``.

    Raises:
        IndexError: if ``pixels`` has more entries than the image has pixels,
            or contains an out-of-range index.
    """
    flat = image.reshape(-1)
    order = np.asarray(pixels)
    if order.size > flat.size:
        raise IndexError(
            "pixels has {} entries but the image only has {} pixels".format(order.size, flat.size)
        )

    rearranged = np.zeros(flat.size, dtype=flat.dtype)
    # An empty ordering leaves the output all zeros. The guard is needed
    # because np.asarray([]) is float-typed and cannot be used as an index.
    if order.size:
        rearranged[:order.size] = flat[order]
    return rearranged.reshape(13, 13)

In [None]:
# df = pd.read_csv("Processed_df.csv")
# Column-wise mean of the processed frame (earliest-year rows with
# status_label == 1), turned into a one-row frame and then a flat array.
df_mean = pd.DataFrame(df.mean()).T
data = df_mean.iloc[0].to_numpy()


# print(df_raw.head())
# Pick one example row per class from the raw data and keep the columns from
# "X1" onwards as a numpy vector.
# NOTE(review): the first matching row (iloc[0]) is used for status 1 but the
# second (iloc[1]) for status 0 -- confirm the different offsets are intended.
data_continue = df_raw[df_raw['status_label'] == 1].iloc[0].loc["X1":].to_numpy()
data_bankrupt= df_raw[df_raw['status_label'] == 0].iloc[1].loc["X1":].to_numpy()
# print(data_continue)
# print(data_bankrupt)

In [None]:
# Build four images: each sample (status 1 = "continue", status 0 = "bankrupt")
# before (_1) and after (_2) rearranging pixels with the order in `pixels`.
# array_to_image comes from the star import of create_dataset.
image_continue = array_to_image(data_continue)
image_continue_1 = enlarge_image(image_continue)
# fig1 = array_to_grayscale_image(enlarged_image)

image_bankrupt = array_to_image(data_bankrupt)
image_bankrupt_1 = enlarge_image(image_bankrupt)
# fig2 = array_to_grayscale_image(enlarged_image)

# From here on image_continue / image_bankrupt hold the REARRANGED 13x13
# images; the originals are overwritten.
image_continue = rearrange_image(image_continue, pixels)
image_continue_2 = enlarge_image(image_continue)
# fig = array_to_grayscale_image(enlarged_image)

image_bankrupt = rearrange_image(image_bankrupt, pixels)
image_bankrupt_2 = enlarge_image(image_bankrupt)
print(image_bankrupt)
# fig = array_to_grayscale_image(enlarged_image)

# Convert arrays to grayscale images
# NOTE(review): a matplotlib colormap treats float input as values in [0, 1]
# and integer input as lookup indices -- confirm the *_1 and *_2 arrays share
# the same dtype/range so the four panels are comparable.
image_continue_1 = plt.cm.gray(image_continue_1)
image_continue_2 = plt.cm.gray(image_continue_2)
image_bankrupt_1 = plt.cm.gray(image_bankrupt_1)
image_bankrupt_2 = plt.cm.gray(image_bankrupt_2)

# Create a (2, 2) subplot grid
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Display the images in the subplots
# Layout: columns = class (continue | bankrupt), rows = original | rearranged.
axs[0, 0].imshow(image_continue_1)
axs[0, 0].set_title('Continue 1')

axs[1, 0].imshow(image_continue_2)
axs[1, 0].set_title('Continue 2')

axs[0, 1].imshow(image_bankrupt_1)
axs[0, 1].set_title('Bankrupt 1')

axs[1, 1].imshow(image_bankrupt_2)
axs[1, 1].set_title('Bankrupt 2')

# Adjust layout
plt.tight_layout()

Test for the modified `create_dataset()`

In [None]:
# Explicit import of create_dataset (the module is also star-imported in the
# first cell).
from create_dataset import create_dataset

# Same CSV as used earlier in the notebook.
DATAPATH = r'american_bankruptcy.csv'

# Load the raw table and build the dataset from it.
bankrupt_data = load_data(DATAPATH)
dataset = create_dataset(bankrupt_data)

In [None]:
# Preview the first five dataset entries; each entry unpacks to (data, s),
# where s is printed as the status and data is reshaped to 64x64 for display.
# NOTE: the loop variable `data` rebinds the name `data` assigned in an
# earlier cell.
for i, (data, s) in enumerate(dataset[:5]):
    print("i:{}, status: {}".format(i, s))
    fig = array_to_grayscale_image(data.reshape(64, 64))
    plt.show()

### Oversample the dataset (as we have an extremely unbalanced dataset)

In [None]:
# Reload the raw CSV for the oversampling experiment below.
DATAPATH = r'american_bankruptcy.csv'
bankrupt_data = load_data(DATAPATH) # this load_data is the version before modification

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Features are every column except the identifiers and the label; the label
# is status_label.
X, y = bankrupt_data.drop(columns=['company_name', 'year', 'status_label']), bankrupt_data['status_label']
# print(X.head())
# Use the % operator so the class counts are actually substituted into the
# message; passing them as a second print() argument left a literal "%s" in
# the output.
print("Original dataset shape: %s" % Counter(y))

# Oversample with SMOTE; the fixed random_state keeps the result reproducible.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

# Re-attach the resampled labels so features and label live in one frame.
X_res['status_label'] = y_res
print(X_res['status_label'].describe())