This notebook generates class label counts from the resampled dataset parquet files. Previously, outputting the class label counts was part of classical_sampling_techniques.ipynb.

In [72]:
import pandas as pd
import os

In [73]:
# Directory (relative path) that holds sampling_label_counts.json, the
# label-count table this notebook reads at the start and rewrites at the end.
metrics_directory = '../metrics'

In [74]:
# Load the label-count table saved by a previous run from
# sampling_label_counts.json. When that file does not exist yet, start from an
# empty table that only has the expected columns.
label_counts_path = metrics_directory+'/sampling_label_counts.json'

# schema:   Sampler | Label Classes | 0 | 1 | 2 | 3 | ... | 31 | 32 | 33
label_count_columns = ['Sampler', 'Label Classes']
label_count_columns += [str(class_id) for class_id in range(34)]

try:
    df_label_counts = pd.read_json(path_or_buf=label_counts_path, orient='index')
except FileNotFoundError:
    df_label_counts = pd.DataFrame(columns=label_count_columns)

df_label_counts

Unnamed: 0,Sampler,Label Classes,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,,33+1,101,32822,177.0,155.0,8532.0,859.0,214682.0,13536.0,...,22351.0,26642.0,3941.0,2918.0,71.0,2422.0,137.0,32.0,1081.0,114.0
1,,7+1,32822,403,1011528.0,240282.0,78778.0,10433.0,14601.0,561.0,...,,,,,,,,,,
2,,1+1,1356586,32822,,,,,,,...,,,,,,,,,,
3,RandomOverSampler,33+1,214682,214682,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,...,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0
4,RandomOverSampler,7+1,1011528,1011528,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,...,,,,,,,,,,
5,RandomOverSampler,1+1,1356586,1356586,,,,,,,...,,,,,,,,,,
6,RandomUnderSampler,33+1,32,32,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
7,RandomUnderSampler,7+1,403,403,403.0,403.0,403.0,403.0,403.0,403.0,...,,,,,,,,,,
8,RandomUnderSampler,1+1,32822,32822,,,,,,,...,,,,,,,,,,
9,ClusterCentroids,1+1,32822,32822,,,,,,,...,,,,,,,,,,


In [75]:
# Spellings of a label grouping that appear in file names but are stored under
# a different name in the 'Label Classes' column. Normalising before the lookup
# keeps a re-run from appending a second row for the same sampler/grouping.
LABEL_CLASS_ALIASES = {'8+1': '7+1'}


def count_column_name(label):
    """Return the name of the count column that belongs to a class label.

    Integral floats (e.g. 3.0) are named like the matching integer ('3'), so a
    float-typed label column still lands in the '0'..'33' columns.
    """
    if isinstance(label, float) and label.is_integer():
        label = int(label)
    return str(label)


def record_label_counts(label_counts, sampling_method, label_class, value_counts):
    """Write one dataset's per-class counts into ``label_counts`` in place.

    Parameters:
        label_counts: DataFrame with 'Sampler' and 'Label Classes' columns plus
            one column per class label.
        sampling_method: value to match/store in the 'Sampler' column.
        label_class: value to match/store in the 'Label Classes' column
            (aliases in LABEL_CLASS_ALIASES are normalised first).
        value_counts: Series mapping class label -> number of samples.

    Returns:
        The index label of the row that was created or updated.

    Raises:
        AssertionError: if more than one row already exists for the
            sampler / label class pair.
    """
    label_class = LABEL_CLASS_ALIASES.get(label_class, label_class)

    is_match = (label_counts['Sampler'] == sampling_method) & (label_counts['Label Classes'] == label_class)
    matching_rows = label_counts.index[is_match.to_numpy()].tolist()

    if len(matching_rows) > 1:
        # Raised explicitly so the check is not stripped when Python runs with -O.
        raise AssertionError(f'ERROR: {sampling_method} / {label_class} is duplicated. This should NOT happen.')

    if matching_rows:
        # Update previous record
        row_index = matching_rows[0]
    else:
        # No previous record: append a new row after the existing ones
        row_index = len(label_counts.index)
        label_counts.loc[row_index, 'Sampler'] = sampling_method
        label_counts.loc[row_index, 'Label Classes'] = label_class

    # Write each count under its own label rather than its position, so a
    # class that is absent from the dataset cannot shift or break the columns.
    for label, count in value_counts.items():
        label_counts.loc[row_index, count_column_name(label)] = count

    return row_index


# Sorted so that new rows are appended in a reproducible order
# (os.listdir returns entries in arbitrary order).
resampled_filepaths = sorted(filename for filename in os.listdir('./') if filename.endswith('.parquet'))

for filename in resampled_filepaths:
    # File names start with '<sampler>_<label classes>_'
    name_parts = filename.split('_')
    sampling_method = name_parts[0]
    label_class = name_parts[1]

    df_resampled_dataset = pd.read_parquet(filename)
    value_counts = df_resampled_dataset['label'].value_counts().sort_index()

    # Update dataframe
    row_index = record_label_counts(df_label_counts, sampling_method, label_class, value_counts)

In [76]:
# Fix group classes from 8+1 to 7+1
map_classes = {'33+1': '33+1', '8+1': '7+1', '7+1': '7+1', '1+1': '1+1'}
# Look each value up with a fallback to itself: Series.map with a plain dict
# would silently turn any label class missing from map_classes into NaN.
df_label_counts['Label Classes'] = df_label_counts['Label Classes'].map(
    lambda label_class: map_classes.get(label_class, label_class)
)
df_label_counts

Unnamed: 0,Sampler,Label Classes,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,,33+1,101,32822,177.0,155.0,8532.0,859.0,214682.0,13536.0,...,22351.0,26642.0,3941.0,2918.0,71.0,2422.0,137.0,32.0,1081.0,114.0
1,,7+1,32822,403,1011528.0,240282.0,78778.0,10433.0,14601.0,561.0,...,,,,,,,,,,
2,,1+1,1356586,32822,,,,,,,...,,,,,,,,,,
3,RandomOverSampler,33+1,214682,214682,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,...,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0
4,RandomOverSampler,7+1,1011528,1011528,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,...,,,,,,,,,,
5,RandomOverSampler,1+1,1356586,1356586,,,,,,,...,,,,,,,,,,
6,RandomUnderSampler,33+1,32,32,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
7,RandomUnderSampler,7+1,403,403,403.0,403.0,403.0,403.0,403.0,403.0,...,,,,,,,,,,
8,RandomUnderSampler,1+1,32822,32822,,,,,,,...,,,,,,,,,,
9,ClusterCentroids,1+1,32822,32822,,,,,,,...,,,,,,,,,,


In [77]:
# Update file
# Create the metrics directory first so a first run (no saved table, possibly
# no directory either) does not fail at the very last step.
os.makedirs(metrics_directory, exist_ok=True)
df_label_counts.to_json(path_or_buf=metrics_directory+'/sampling_label_counts.json', orient='index')

display(df_label_counts)

Unnamed: 0,Sampler,Label Classes,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,,33+1,101,32822,177.0,155.0,8532.0,859.0,214682.0,13536.0,...,22351.0,26642.0,3941.0,2918.0,71.0,2422.0,137.0,32.0,1081.0,114.0
1,,7+1,32822,403,1011528.0,240282.0,78778.0,10433.0,14601.0,561.0,...,,,,,,,,,,
2,,1+1,1356586,32822,,,,,,,...,,,,,,,,,,
3,RandomOverSampler,33+1,214682,214682,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,...,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0,214682.0
4,RandomOverSampler,7+1,1011528,1011528,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,1011528.0,...,,,,,,,,,,
5,RandomOverSampler,1+1,1356586,1356586,,,,,,,...,,,,,,,,,,
6,RandomUnderSampler,33+1,32,32,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
7,RandomUnderSampler,7+1,403,403,403.0,403.0,403.0,403.0,403.0,403.0,...,,,,,,,,,,
8,RandomUnderSampler,1+1,32822,32822,,,,,,,...,,,,,,,,,,
9,ClusterCentroids,1+1,32822,32822,,,,,,,...,,,,,,,,,,
