# **Debugging Notebook**

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

## Preparing the data & Label generation

In [16]:
df = pd.read_csv("data/final-alpha-model-data.csv", index_col=0, parse_dates=True)

In [17]:
df = df.drop(columns=["close", "high", "low", "open"])

In [18]:
df.isnull().sum()

volume                               0
log_returns                          0
min_10k_count                        0
min_100_count                        0
new_non_zero_count                   0
count                                0
active_more_1y_percent               0
block_count                          0
hash_rate_mean                       0
inflation_rate                       0
revenue_from_fees                    0
balance_exchanges                    0
transfers_volume_to_exchanges_sum    0
net_realized_profit_loss             0
net_unrealized_profit_loss           0
realized_loss                        0
realized_profit                      0
sopr                                 0
loss_sum                             0
liveliness                           0
nvt                                  0
nvts                                 0
reserve_risk                         0
rhodl_ratio                          0
stock_to_flow_deflection             0
utxo_created_count       

In [19]:
print(df.describe())

              volume   log_returns  min_10k_count  min_100_count  \
count   26303.000000  26303.000000   26303.000000   26303.000000   
mean    16161.440763      0.000031      96.952895   16052.589438   
std     16285.187641      0.008099      10.231643     131.985041   
min        63.932000     -0.207136      79.000000   15681.000000   
25%      6743.777000     -0.002876      87.000000   15962.000000   
50%     11225.281000      0.000076      98.000000   16058.000000   
75%     19580.698000      0.003050     106.000000   16137.000000   
max    312462.168000      0.150751     121.000000   16684.000000   

       new_non_zero_count         count  active_more_1y_percent   block_count  \
count        26303.000000  2.630300e+04            26303.000000  26303.000000   
mean         18048.558301  8.384399e+08                0.599975      6.047903   
std           7319.726400  1.395178e+08                0.042823      2.507760   
min              0.000000  5.960800e+08                0.535026

In [20]:
Y = pd.DataFrame(list(np.where(df["log_returns"] >0, 1, -1)))

In [21]:
Y

Unnamed: 0,0
0,-1
1,1
2,-1
3,1
4,-1
...,...
26298,1
26299,1
26300,-1
26301,-1


## Check for homoegenous

In [24]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_distributions(df, target_col='target'):
    features = [col for col in df.columns if col != target_col]
    
    for feature in features:
        class_0 = df[df[target_col] == -1][feature]
        class_1 = df[df[target_col] == 1][feature]

        # Skip features with no variance
        if class_0.nunique() <= 1 or class_1.nunique() <= 1:
            print(f"Skipping '{feature}' due to low variance in one of the classes.")
            continue

        plt.figure(figsize=(10, 5))
        sns.kdeplot(data=class_0, fill=True, label='Class -1 (e.g., Malignant)', alpha=0.5)
        sns.kdeplot(data=class_1, fill=True, label='Class 1 (e.g., Benign)', alpha=0.5)

        plt.title(f'Distribution of "{feature}" by Class')
        plt.xlabel(feature)
        plt.ylabel('Density')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()


In [25]:
plot_feature_distributions(df, target_col='target')

Skipping 'volume' due to low variance in one of the classes.
Skipping 'log_returns' due to low variance in one of the classes.
Skipping 'min_10k_count' due to low variance in one of the classes.
Skipping 'min_100_count' due to low variance in one of the classes.
Skipping 'new_non_zero_count' due to low variance in one of the classes.
Skipping 'count' due to low variance in one of the classes.
Skipping 'active_more_1y_percent' due to low variance in one of the classes.
Skipping 'block_count' due to low variance in one of the classes.
Skipping 'hash_rate_mean' due to low variance in one of the classes.
Skipping 'inflation_rate' due to low variance in one of the classes.
Skipping 'revenue_from_fees' due to low variance in one of the classes.
Skipping 'balance_exchanges' due to low variance in one of the classes.
Skipping 'transfers_volume_to_exchanges_sum' due to low variance in one of the classes.
Skipping 'net_realized_profit_loss' due to low variance in one of the classes.
Skipping 'ne