## Handling Class Imbalance 

- Using Neural Graphical Models
- Binary Classification

In [2]:
import os, sys
# autoreload (mode 2): re-import every already-loaded module before each
# cell is executed, so edits to the local `ngm` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
print(sys.prefix)  # show which Python environment the kernel runs in
# import warnings
# warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd

# Local imports
from ngm.utils import data_processing as dp
from ngm.utils.metrics import reportMetrics
import ngm.main as ngm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/anaconda/envs/kals


### Prepare data for `NGM`
- Data: Breast Cancer Wisconsin (Diagnostic) dataset from the UCI repository (id 17)

In [25]:
# Download the dataset from the UCI ML repository
from ucimlrepo import fetch_ucirepo

uci_data = fetch_ucirepo(id=17)  # dataset id 17
print(uci_data.metadata)
features = uci_data.data.features
target = uci_data.data.targets

# Build the (Xy, G) pair.
# The target is one-hot encoded and only its first indicator column is kept
# as the binary label (later cells refer to it as 'Diagnosis_B').
label_onehot = pd.get_dummies(target).astype(int)
binary_label = label_onehot.iloc[:, 0]
Xy = pd.concat([features, binary_label], axis=1)

# Fully connected graph over all columns (features and label).
# With multiple labels a bi-partite graph would be created instead.
G = dp.complete_graph_from_list(Xy.columns)

print(Xy, G)

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

## Learning `NGM`

### TODO
- Report 5-fold CV results. 
- Set the target column to 0.5 when passing the data as input for prediction

In [58]:
# Device string for the GPU to run on.
# TODO: implement the use_device code
curr_GPU = "cuda:0"

In [92]:
import ngm.main as ngm

# Fit the NGM model on the graph G and the data table Xy
model_NGM = ngm.learning(
    G,
    Xy,
    # model / regularisation settings
    structure_penalty='hadamard',
    norm_type='min_max',
    hidden_dim=200,
    lambd=1e0,
    # optimisation settings
    lr=0.01,
    epochs=2000,
    k_fold=1,
)

Using "cuda" compute
Means of selected features (radius1                14.127292
texture1               19.289649
perimeter1             91.969033
area1                 654.889104
smoothness1             0.096360
compactness1            0.104341
concavity1              0.088799
concave_points1         0.048919
symmetry1               0.181162
fractal_dimension1      0.062798
radius2                 0.405172
texture2                1.216853
perimeter2              2.866059
area2                  40.337079
smoothness2             0.007041
compactness2            0.025478
concavity2              0.031894
concave_points2         0.011796
symmetry2               0.020542
fractal_dimension2      0.003795
radius3                16.269190
texture3               25.677223
perimeter3            107.261213
area3                 880.583128
smoothness3             0.132369
compactness3            0.254265
concavity3              0.272188
concave_points3         0.114606
symmetry3               0.2

In [98]:
target_feature = 'Diagnosis_B'

# Model input: a copy of Xy whose target column is overwritten with the
# constant 0.5, so the true labels are not part of what is passed in.
Xy_test = Xy.assign(**{target_feature: 0.5})

# Ask the learned model for a regression estimate of the target column.
pred_Xy = ngm.fit_regression_direct(
    model_NGM, Xy_test, target_feature,
    USE_CUDA=True, VERBOSE=True, BATCH_SIZE=1000,
)

Using "cuda" compute
torch.Size([569, 31])


In [99]:
# Inspect the values the model predicted for the target column.
pred_Xy[target_feature]

0      0.000392
1      0.019045
2      0.003468
3      0.028756
4      0.035871
         ...   
564    0.000535
565    0.010043
566    0.357706
567    0.000164
568    0.982753
Name: Diagnosis_B, Length: 569, dtype: float64

In [95]:
# Sanity check: the target column of the model input should hold only the
# 0.5 placeholder, i.e. no true labels.
Xy_test[target_feature].value_counts()

Diagnosis_B
0.5    569
Name: count, dtype: int64

In [96]:
# Score the predictions against the true labels
from ngm.utils import metrics

y_true = Xy[target_feature]
y_pred = pred_Xy[target_feature]

# NOTE(review): the predictions are for the same rows that were passed to
# ngm.learning, so unless learning holds data out internally these are
# in-sample scores — confirm before reporting.
# Argument order differs between the two helpers: (pred, true) here ...
print(metrics.compute_metrics(y_pred, y_true))
# ... and (true, pred) here.
auc_aupr = metrics.get_auc(y_true, y_pred)
print(f'(AUC, AUPR) =  {auc_aupr}')

{'mae': 0.13375298933205076, 'rmse': 0.21857334672349857, 'r2': 0.7956310752690731, 'rel_err': 270954957236448.22}
max acc = 0.9648506151142355
(AUC, AUPR) =  (0.9762433275196871, 0.9760829664937986)


In [97]:
# Save the local and global models
import pickle 

# folder = '../../../externalData/vichar_models/'
# filename = 'mnist_NGM_H500L10.pickle'

# with open(folder+filename, 'wb') as handle: # file object
#     pickle.dump([model_NGM], handle, protocol=pickle.HIGHEST_PROTOCOL)