In [27]:
import sys
import os
import warnings

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.figure_factory as ff

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

sys.path.append(os.path.abspath('..'))
from credmodex.credlab import CredLab
import credmodex
from graphmodex import plotlymodex

from sklearn.linear_model import LogisticRegression
from credmodex.utils import plotly_main_layout

In [6]:
def get_structure(root_dir):
    """
    Build an indented, tree-like text listing of a directory.

    Each directory is written as ``name/`` and each file as its bare name,
    indented four spaces per nesting level relative to ``root_dir``.
    Directories named ``__pycache__`` are skipped entirely. Entries are
    sorted by name so the result does not depend on filesystem order.

    Parameters
    ----------
    root_dir : str or os.PathLike
        Directory to walk. A trailing path separator is tolerated.

    Returns
    -------
    str
        Newline-joined listing; an empty string when ``root_dir`` yields
        nothing from ``os.walk`` (e.g. it does not exist).
    """
    root_dir = os.fspath(root_dir)
    structure = []
    for root, dirs, files in os.walk(root_dir):
        # Modify dirs in-place so os.walk never descends into __pycache__;
        # sorting here also fixes the traversal order.
        dirs[:] = sorted(d for d in dirs if d != '__pycache__')

        # Depth is derived from the path relative to root_dir instead of a
        # textual replace, which miscounts when root_dir ends with a
        # separator or when its text reappears deeper in the tree.
        relative = os.path.relpath(root, root_dir)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so the label is never empty
        # for an ordinary directory.
        label = os.path.basename(os.path.normpath(root))
        structure.append(f'{indent}{label}/')

        subindent = ' ' * 4 * (level + 1)
        for f in sorted(files):
            structure.append(f'{subindent}{f}')
    return '\n'.join(structure)

# print(get_structure(r'C:\Users\gustavo.filho\Documents\Python\Modules\Credit Risk\credmodex'))

In [7]:
# df = {
#     "rating": list(range(1, 10)),
#     "target": [1, 0, 1, 1, 1, 0, 1, 0, 1,],
#     "score": [0.90, 0.10, 0.80, 0.70, 0.45, 0.35, 0.70, 0.20, 0.80],
# }
# df = pd.DataFrame(df)

# df = {
#     'rating': [0]*(95+309) + [1]*(187+224) + [2]*(549+299) + [3]*(1409+495) + [4]*(3743+690) + [5]*(4390+424) + [6]*(2008+94) + [7]*(593+8),
#     'target': [0]*95+[1]*309 + [0]*187+[1]*224 + [0]*549+[1]*299 + [0]*1409+[1]*495 + [0]*3743+[1]*690 + [0]*4390+[1]*424 + [0]*2008+[1]*94 + [0]*593+[1]*8,
#     'score': [309/(95+309)]*(95+309) + [224/(187+224)]*(187+224) + [299/(549+299)]*(549+299) + [495/(1409+495)]*(1409+495) + [690/(3743+690)]*(3743+690) + [424/(4390+424)]*(4390+424) + [94/(2008+94)]*(2008+94) + [8/(593+8)]*(593+8)
# }
# df = pd.DataFrame(df)

In [23]:
# Build a reproducible synthetic dataset. Every draw below goes through
# NumPy's global RNG, so the order of the np.random calls fixes the values.
np.random.seed(42)
n_rows = 10000
n_cols = 20

dates = pd.date_range(start="2025-01-01", end="2025-05-30", periods=n_rows)

data = {"date": dates}

# Categorical columns: three drawn from "0".."29", then two from "a".."e"
data.update({
    f"cat_30_{k}": np.random.choice([f"{j}" for j in range(30)], size=n_rows)
    for k in range(3)
})
data.update({
    f"cat_5_{k}": np.random.choice([f"{j}" for j in ['a','b','c','d','e']], size=n_rows)
    for k in range(2)
})

# Five integer columns (ordered, like credit history length, loan duration)
data.update({
    f"int_{k}": np.random.randint(0, 100, size=n_rows)
    for k in range(5)
})

# Five float columns (e.g. income, loan amount, utilization)
data.update({
    f"float_{k}": np.random.normal(loc=5000, scale=2000, size=n_rows)
    for k in range(5)
})

# Two columns that are mostly missing: the first uniform draw decides which
# rows are NaN, the second supplies the values for the remaining rows.
for sparse_name, nan_share in (("mostly_nan_1", 0.95), ("mostly_nan_2", 0.9)):
    is_missing = np.random.rand(n_rows) < nan_share
    data[sparse_name] = np.where(is_missing, np.nan, np.random.rand(n_rows))

# One column with special values (inf, -inf, NaN). Later assignments may
# overwrite earlier ones where the sampled positions coincide.
special_col = np.random.normal(loc=100, scale=50, size=n_rows)
for how_many, marker in ((10, np.inf), (10, -np.inf), (100, np.nan)):
    special_col[np.random.choice(n_rows, size=how_many, replace=False)] = marker
data["special_missing"] = special_col

# Binary target stored as float so that it can hold NaN
target = np.random.choice([0, 1], size=n_rows).astype(float)
mask = np.random.rand(n_rows) < 0.2
target[mask] = np.nan
data["target"] = target

# Put 50 NaNs into every categorical column except the 2nd and the 4th
cat_columns = [name for name in data if name.startswith("cat_")]
for position, cat_name in enumerate(cat_columns):
    if position not in (1, 3):
        holes = np.random.choice(n_rows, size=50, replace=False)
        # object dtype lets the string column hold np.nan
        as_object = np.array(data[cat_name], dtype=object)
        as_object[holes] = np.nan
        data[cat_name] = as_object

# Put 30 NaNs into each of the first three float columns (written in place)
float_columns = [name for name in data if name.startswith("float_")]
for float_name in float_columns[:3]:
    holes = np.random.choice(n_rows, size=30, replace=False)
    data[float_name][holes] = np.nan

# Convert to DataFrame
df = pd.DataFrame(data)