In [51]:
import numpy as np
import pandas as pd
import utils
import seaborn as sn
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score

# Load data

In [52]:
data_big_cleaned = pd.read_csv('data/big_cleaned_results.csv', index_col=0)
data_big_cleaned

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,_OPN10,EXT,EST,AGR,CSN,OPN,MAX_SCORE,TYPE,CORR,B5_TYPE
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,5.0,46.0,36.0,39.0,32.0,45.0,46.0,0,1.000000,ESACO
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,3.0,20.0,35.0,40.0,37.0,35.0,40.0,1,1.000000,ISACO
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,4.0,25.0,34.0,40.0,34.0,41.0,41.0,1,0.984049,ISACO
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,3.0,26.0,31.0,38.0,25.0,39.0,39.0,96,0.972907,ESARO
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,5.0,29.0,41.0,42.0,48.0,48.0,48.0,1,0.985012,ESACO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415057,3.0,1.0,4.0,3.0,2.0,1.0,3.0,4.0,4.0,4.0,...,5.0,33.0,26.0,40.0,35.0,42.0,42.0,0,0.975817,ESACO
415058,4.0,2.0,5.0,2.0,4.0,2.0,5.0,4.0,2.0,4.0,...,4.0,36.0,34.0,41.0,24.0,42.0,42.0,0,0.983439,ESARO
415059,2.0,2.0,4.0,5.0,4.0,1.0,1.0,1.0,5.0,5.0,...,2.0,32.0,14.0,43.0,34.0,40.0,43.0,4,0.970462,ENACO
415060,3.0,1.0,5.0,1.0,4.0,1.0,4.0,2.0,5.0,2.0,...,5.0,44.0,24.0,40.0,45.0,49.0,49.0,4,0.984814,ENACO


# Choose X and Y data

In [53]:
X=data_big_cleaned.iloc[:,:50]
print(X)
Y=data_big_cleaned['TYPE']
print(Y)

        EXT1  EXT2  EXT3  EXT4  EXT5  EXT6  EXT7  EXT8  EXT9  EXT10  ...  \
0        4.0   1.0   5.0   2.0   5.0   1.0   5.0   2.0   4.0    1.0  ...   
1        3.0   5.0   3.0   4.0   3.0   3.0   2.0   5.0   1.0    5.0  ...   
2        2.0   3.0   4.0   4.0   3.0   2.0   1.0   3.0   2.0    5.0  ...   
3        2.0   2.0   2.0   3.0   4.0   2.0   2.0   4.0   1.0    4.0  ...   
4        3.0   3.0   3.0   3.0   5.0   3.0   3.0   5.0   3.0    4.0  ...   
...      ...   ...   ...   ...   ...   ...   ...   ...   ...    ...  ...   
415057   3.0   1.0   4.0   3.0   2.0   1.0   3.0   4.0   4.0    4.0  ...   
415058   4.0   2.0   5.0   2.0   4.0   2.0   5.0   4.0   2.0    4.0  ...   
415059   2.0   2.0   4.0   5.0   4.0   1.0   1.0   1.0   5.0    5.0  ...   
415060   3.0   1.0   5.0   1.0   4.0   1.0   4.0   2.0   5.0    2.0  ...   
415061   3.0   4.0   3.0   2.0   4.0   2.0   1.0   2.0   4.0    5.0  ...   

        OPN1  OPN2  OPN3  OPN4  OPN5  OPN6  OPN7  OPN8  OPN9  OPN10  
0        5.0   1.

# Check if Y data is representative

In [54]:
pd.Series(Y).value_counts()

0      143667
1      105328
3       30589
4       22700
25      14877
7       14634
27      14107
14      13665
23       9760
31       7874
20       7747
5        6874
96       6171
179      6066
11       5556
16       5447
Name: TYPE, dtype: int64

# Split data to train and test

In [55]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Check if test data is representative

In [56]:
print(pd.Series(y_train).value_counts())

0      114788
1       84369
3       24451
4       18159
25      11916
7       11715
27      11311
14      10929
23       7942
31       6229
20       6191
5        5458
96       4963
179      4870
11       4422
16       4336
Name: TYPE, dtype: int64


# Functions for mapping classes from SOM model to classes from Y data

In [57]:
def find_class(predictions, i, mapping):
    keys=pd.Series(y_test.tolist())[list(pd.Series(predictions).loc[lambda x : x == i].index)].value_counts().keys().to_list()
    cnts=pd.Series(y_test.tolist())[list(pd.Series(predictions).loc[lambda x : x == i].index)].value_counts().to_list()
    lbl = None
    cnt = None
    for key_id in range(len(keys)):
        if keys[key_id] not in mapping.best_classes.values:
            lbl = keys[key_id]
            cnt = cnts[key_id]
            mapping.at[mapping[mapping['som']==i].index[0], 'counts']=cnt
            mapping.at[mapping[mapping['som']==i].index[0], 'best_classes']=lbl 
            break
        elif cnts[key_id] > mapping[mapping['best_classes']==keys[key_id]].counts.values[0]:
            lbl = keys[key_id]
            cnt = cnts[key_id]
            j=mapping[mapping['best_classes']==keys[key_id]].som.values[0]
            index = mapping[mapping['best_classes']==keys[key_id]].index[0]
            mapping.at[index, 'counts']=None
            mapping.at[index, 'best_classes']=None
            mapping.at[mapping[mapping['som']==i].index[0], 'counts']=cnt
            mapping.at[mapping[mapping['som']==i].index[0], 'best_classes']=lbl 
            lbl2, cnt2 = find_class(predictions, j, mapping)
            mapping.at[index, 'counts']=cnt2
            mapping.at[index, 'best_classes']=lbl2
            break
        
            
    return lbl, cnt

In [58]:
def map_class(predictions):
    d = {'best_classes': pd.Series(pd.Series(predictions).unique()), 'som': pd.Series(pd.Series(predictions).unique())}
    classes_mapping = pd.DataFrame(data=d)
    classes_mapping['best_classes']=None
    classes_mapping['counts']=None
    i=0
    for i in pd.Series(predictions).unique():
        lbl, cnt = find_class(predictions, i, classes_mapping)
        i+=1

    best_classes=pd.Series(y_test.unique())
    null_classes=best_classes[~best_classes.isin(classes_mapping['best_classes'].values)].values

    i=0
    for index, row in classes_mapping[classes_mapping['best_classes'].isna()].iterrows():
        classes_mapping.at[index, 'best_classes']=null_classes[i]
        i+=1

    return classes_mapping

# sklearn-som

In [59]:
from sklearn_som.som import SOM
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from joblib import dump, load
from sklearn.preprocessing import StandardScaler

# Optimize hiperparameters

In [50]:
import optuna
def objective(trial):
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'm': 4,
        'n': 4,
        'dim': 50,
        'lr': trial.suggest_float('lr', 1e-2, 3, log=True),
        'sigma': trial.suggest_float('sigma', 1e-2, 3, log=True),
        'max_iter': trial.suggest_int('max_iter', 2000, 5000, log=True),
        'random_state': 42, #trial.suggest_int('random_state', 1, 42, log=True),
    }

    som = SOM(**param)
    som.fit(x_train.to_numpy(), epochs=10)
    predictions = som.fit_predict(x_test.to_numpy())
    predictions_fix = predictions.copy()
    for i in pd.Series(predictions).unique():
        key=pd.Series(y_test.tolist())[list(pd.Series(predictions).loc[lambda x : x == i].index)].value_counts().keys().to_list()[0]
        predictions_fix[predictions==i]=key
    score = f1_score(y_test, predictions_fix, average='macro')
    return score

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-07-27 00:47:15,935] A new study created in memory with name: no-name-8b588a24-9602-4c25-8023-aa172ed1bbef
[I 2023-07-27 00:47:30,443] Trial 0 finished with value: 0.03765235647776414 and parameters: {'lr': 0.14199514399787572, 'sigma': 0.02522823252666424, 'max_iter': 2883}. Best is trial 0 with value: 0.03765235647776414.
[I 2023-07-27 00:47:45,941] Trial 1 finished with value: 0.14843755860026428 and parameters: {'lr': 0.9063036142272856, 'sigma': 1.084431839370771, 'max_iter': 3610}. Best is trial 1 with value: 0.14843755860026428.
[I 2023-07-27 00:48:01,768] Trial 2 finished with value: 0.15744156685697783 and parameters: {'lr': 0.5815942826212414, 'sigma': 1.4424002186386138, 'max_iter': 4598}. Best is trial 2 with value: 0.15744156685697783.
[I 2023-07-27 00:48:17,087] Trial 3 finished with value: 0.03743877496269648 and parameters: {'lr': 0.03027143393048963, 'sigma': 0.16217105055969902, 'max_iter': 3548}. Best is trial 2 with value: 0.15744156685697783.
[I 2023-07-27 0

[I 2023-07-27 00:58:16,802] Trial 38 finished with value: 0.1730623474275726 and parameters: {'lr': 1.4120891869398944, 'sigma': 0.4758704468084834, 'max_iter': 4129}. Best is trial 9 with value: 0.2069372096508254.
[I 2023-07-27 00:58:32,616] Trial 39 finished with value: 0.03794653909431091 and parameters: {'lr': 0.8562702267760047, 'sigma': 0.3054328725260602, 'max_iter': 4347}. Best is trial 9 with value: 0.2069372096508254.
[I 2023-07-27 00:58:48,383] Trial 40 finished with value: 0.19598601635636487 and parameters: {'lr': 1.132895866579797, 'sigma': 0.4210070101465585, 'max_iter': 3841}. Best is trial 9 with value: 0.2069372096508254.
[I 2023-07-27 00:59:04,612] Trial 41 finished with value: 0.2099287253346384 and parameters: {'lr': 0.6745299772124765, 'sigma': 0.6214853550016058, 'max_iter': 3503}. Best is trial 41 with value: 0.2099287253346384.
[I 2023-07-27 00:59:20,602] Trial 42 finished with value: 0.14769825322549898 and parameters: {'lr': 1.5669166519415199, 'sigma': 0.43

[I 2023-07-27 01:09:01,717] Trial 76 finished with value: 0.17342815207177756 and parameters: {'lr': 0.31505447132309244, 'sigma': 0.5488116939524788, 'max_iter': 3717}. Best is trial 41 with value: 0.2099287253346384.
[I 2023-07-27 01:09:19,329] Trial 77 finished with value: 0.16934952178850654 and parameters: {'lr': 0.22021534861672573, 'sigma': 0.6998713792136044, 'max_iter': 3276}. Best is trial 41 with value: 0.2099287253346384.
[I 2023-07-27 01:09:36,581] Trial 78 finished with value: 0.15288685122178966 and parameters: {'lr': 0.5869685939588623, 'sigma': 1.361331135532909, 'max_iter': 3813}. Best is trial 41 with value: 0.2099287253346384.
[I 2023-07-27 01:09:54,883] Trial 79 finished with value: 0.19340709609685322 and parameters: {'lr': 0.7979312023146394, 'sigma': 0.8655675784338146, 'max_iter': 3525}. Best is trial 41 with value: 0.2099287253346384.
[I 2023-07-27 01:10:21,656] Trial 80 finished with value: 0.03807996329461315 and parameters: {'lr': 0.1850000618874208, 'sigma

# Train model

In [68]:
# som = load('som_big.joblib')
# {'lr': 0.20715800685544858, 'sigma': 0.48812890935056635, 'max_iter': 3510, 'random_state': 23}
# {'lr': 0.4484606809403018, 'sigma': 0.7617226203270118, 'max_iter': 3781, 'random_state': 5}
# {'lr': 1.5792491494219374, 'sigma': 0.6821505133687678, 'max_iter': 3575}
# {'lr': 1.1441603825302205, 'sigma': 0.3672077435443995, 'max_iter': 2409}
# {'lr': 0.4517516977630655, 'sigma': 0.6820945909305205, 'max_iter': 3153}
som = SOM(m=4, n=4, dim=50, lr=0.4517516977630655, sigma=0.6820945909305205, max_iter=3153, random_state=42)
%time som.fit(x_train.to_numpy(), epochs=10)
predictions = som.fit_predict(x_test.to_numpy())
# dump(som, 'som_big.joblib') 

CPU times: user 7.72 s, sys: 0 ns, total: 7.72 s
Wall time: 8.36 s


In [69]:
pd.Series(predictions).value_counts()

10    6769
14    6358
8     6095
3     5796
6     5642
13    5339
0     5304
11    5226
15    5193
12    4832
5     4651
7     4534
4     4506
9     4441
1     4300
2     4027
dtype: int64

# Map classes from som to classes from Y data

In [113]:
classes_mapping=map_class(predictions)
predictions_fix = predictions.copy()
for index, row in classes_mapping.iterrows():
    predictions_fix[predictions==classes_mapping.at[index, 'som']]=classes_mapping.at[index, 'best_classes']

# Simple method

In [108]:
# predictions_fix = predictions.copy()
# for i in pd.Series(predictions).unique():
#     key=pd.Series(y_test.tolist())[list(pd.Series(predictions).loc[lambda x : x == i].index)].value_counts().keys().to_list()[0]
#     predictions_fix[predictions==i]=key

# Check the resuts

In [114]:
print(classification_report(y_test,predictions_fix))

              precision    recall  f1-score   support

           0       0.86      0.17      0.29     28879
           1       0.86      0.19      0.31     20959
           3       0.66      0.69      0.68      6138
           4       0.58      0.67      0.62      4541
           5       0.00      0.00      0.00      1416
           7       0.19      0.32      0.24      2919
          11       0.18      0.85      0.30      1134
          14       0.00      0.00      0.00      2736
          16       0.02      0.10      0.03      1111
          20       0.10      0.39      0.16      1556
          23       0.01      0.02      0.01      1818
          25       0.31      0.56      0.40      2961
          27       0.29      0.54      0.38      2796
          31       0.15      0.61      0.24      1645
          96       0.07      0.26      0.11      1208
         179       0.06      0.21      0.09      1196

    accuracy                           0.28     83013
   macro avg       0.27   

In [115]:
score = f1_score(y_test, predictions_fix, average='weighted')
score

0.3120490690007987

In [116]:
accuracy_score(y_test, predictions_fix)

0.2832447929842314

In [117]:
f1_score(y_test, predictions_fix, average='macro')

0.24014467440983844

# Somoclu

In [83]:
import somoclu
from mpl_toolkits.mplot3d import Axes3D


# Optimize hiperparameters

In [84]:
def objective2(trial):
    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'n_columns': 4,
        'n_rows': 4,
        'initialcodebook': None,
        'kerneltype': 0, 
        'maptype': "planar",
        'gridtype': trial.suggest_categorical("gridtype", ["rectangular", "hexagonal"]), 
        'compactsupport': True, 
        'neighborhood': "gaussian",
        'std_coeff': trial.suggest_float('std_coeff', 1e-2, 3, log=True),
        'initialization': trial.suggest_categorical("initialization", ["random", "pca", None]),  
        'verbose': 0, # trial.suggest_categorical("verbose", [0, 1, 2]),  
    }
    
    def get_labels(som2):
        grid_labels = [tuple(grid_label) for grid_label in som2.bmus]
        unique_labels = [
            tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
        ]
        # Generate mapping
        labels_mapping = {
            grid_label: cluster_label
            for grid_label, cluster_label in zip(
                unique_labels, range(len(unique_labels))
            )
        }
        labels = np.array(
            [labels_mapping[grid_label] for grid_label in grid_labels]
        )
        predictions2=pd.Series(labels)
        return predictions2

    som2 = somoclu.Somoclu(**param)
    som2.train(np.float32(x_train.to_numpy()), epochs=10)
    som2.train(np.float32(x_test.to_numpy()), epochs=10)
    predictions2 = get_labels(som2)
    classes_mapping2=map_class(predictions2)
    predictions_fix2 = predictions2.copy()
    for index, row in classes_mapping2.iterrows():
        predictions_fix2[predictions2==classes_mapping2.at[index, 'som']]=classes_mapping2.at[index, 'best_classes']
    score = f1_score(y_test, predictions_fix2, average='macro')
    return score

# 3. Create a study object and optimize the objective function.
study2 = optuna.create_study(direction='maximize')
study2.optimize(objective2, n_trials=100)

[I 2023-07-27 01:31:01,141] A new study created in memory with name: no-name-a6d22e35-06d5-4d7d-8ce6-4f1724770a43
[I 2023-07-27 01:31:05,900] Trial 0 finished with value: 0.21006933994746213 and parameters: {'gridtype': 'rectangular', 'std_coeff': 0.01517310653130157, 'initialization': 'pca'}. Best is trial 0 with value: 0.21006933994746213.
[I 2023-07-27 01:31:09,510] Trial 1 finished with value: 0.2506232198413363 and parameters: {'gridtype': 'rectangular', 'std_coeff': 0.2671509618739662, 'initialization': 'random'}. Best is trial 1 with value: 0.2506232198413363.
[I 2023-07-27 01:31:13,565] Trial 2 finished with value: 0.25639560058928396 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.310183498671109, 'initialization': 'pca'}. Best is trial 2 with value: 0.25639560058928396.
[I 2023-07-27 01:31:17,337] Trial 3 finished with value: 0.17802527058642323 and parameters: {'gridtype': 'rectangular', 'std_coeff': 1.372401552964429, 'initialization': 'random'}. Best is trial 2 wi

[I 2023-07-27 01:34:33,215] Trial 36 finished with value: 0.2360868183555332 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.3205660656231137, 'initialization': 'random'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:34:38,924] Trial 37 finished with value: 0.2586471759612799 and parameters: {'gridtype': 'rectangular', 'std_coeff': 0.22315286851330554, 'initialization': 'random'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:34:44,554] Trial 38 finished with value: 0.23682165130439736 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.3855407201427996, 'initialization': 'random'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:34:50,312] Trial 39 finished with value: 0.25115609698728514 and parameters: {'gridtype': 'rectangular', 'std_coeff': 0.2864057777232352, 'initialization': None}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:34:56,501] Trial 40 finished with value: 0.21365716947678592

[I 2023-07-27 01:37:58,154] Trial 72 finished with value: 0.2146771856446098 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.18579784770651536, 'initialization': 'pca'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:38:05,394] Trial 73 finished with value: 0.2473390792079113 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.25155640312987043, 'initialization': 'pca'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:38:12,416] Trial 74 finished with value: 0.2602489358704115 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.33253662723603583, 'initialization': 'pca'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:38:18,174] Trial 75 finished with value: 0.24956570963484193 and parameters: {'gridtype': 'hexagonal', 'std_coeff': 0.24380904643136242, 'initialization': 'pca'}. Best is trial 24 with value: 0.2878121799258846.
[I 2023-07-27 01:38:24,278] Trial 76 finished with value: 0.21368554371715198 and param

# Train model

In [85]:
# som2 = load('som2_big.joblib')
# {'gridtype': 'rectangular', 'std_coeff': 0.012281063564056477, 'initialization': None, 'verbose': 2}
# {'gridtype': 'rectangular', 'std_coeff': 0.3928668000921264, 'initialization': 'pca'}
# {'gridtype': 'rectangular', 'std_coeff': 0.3241248621288222, 'initialization': 'pca'}
# {'gridtype': 'hexagonal', 'std_coeff': 0.17366435985004622, 'initialization': 'random'}
som2 = somoclu.Somoclu(4, 4, initialcodebook=None, kerneltype=0, maptype="planar", gridtype='hexagonal', std_coeff=0.17366435985004622, initialization='random', verbose=0)
%time som2.train(np.float32(x_train.to_numpy()), epochs=10)
# dump(som, 'som2_big.joblib') 

CPU times: user 14.5 s, sys: 130 ms, total: 14.6 s
Wall time: 2.93 s


# Predict

In [86]:
som2.train(np.float32(x_test.to_numpy()), epochs=10)

# Get labels (from som-learn)

In [87]:
def get_labels(som2):
    grid_labels = [tuple(grid_label) for grid_label in som2.bmus]
    unique_labels = [
        tuple(grid_label) for grid_label in np.unique(grid_labels, axis=0)
    ]
    # Generate mapping
    labels_mapping = {
        grid_label: cluster_label
        for grid_label, cluster_label in zip(
            unique_labels, range(len(unique_labels))
        )
    }
    labels = np.array(
        [labels_mapping[grid_label] for grid_label in grid_labels]
    )
    predictions2=pd.Series(labels)
    return predictions2

In [88]:
predictions2 = get_labels(som2)

In [89]:
predictions2.value_counts()

3     6682
8     6268
6     6048
0     5867
10    5742
13    5703
9     5659
1     5511
5     5334
4     4939
14    4885
15    4828
2     4176
7     4153
11    4117
12    3101
dtype: int64

# Map classes from som to classes from Y data

In [102]:
classes_mapping2=map_class(predictions2)
predictions_fix2 = predictions2.copy()
for index, row in classes_mapping2.iterrows():
    predictions_fix2[predictions2==classes_mapping2.at[index, 'som']]=classes_mapping2.at[index, 'best_classes']

# Simple method

In [103]:
# predictions_fix2 = predictions.copy()
# for i in pd.Series(predictions2).unique():
#     key=pd.Series(y_test.tolist())[list(pd.Series(predictions2).loc[lambda x : x == i].index)].value_counts().keys().to_list()[0]
#     predictions_fix2[predictions2==i]=key

# Check the results

In [104]:
print(classification_report(y_test,predictions_fix2))

              precision    recall  f1-score   support

           0       0.95      0.18      0.31     28879
           1       0.94      0.26      0.40     20959
           3       0.39      0.31      0.35      6138
           4       0.25      0.34      0.29      4541
           5       0.15      0.43      0.22      1416
           7       0.40      0.57      0.47      2919
          11       0.18      0.84      0.29      1134
          14       0.02      0.04      0.03      2736
          16       0.01      0.05      0.02      1111
          20       0.16      0.50      0.24      1556
          23       0.00      0.00      0.00      1818
          25       0.36      0.59      0.45      2961
          27       0.22      0.49      0.30      2796
          31       0.14      0.47      0.21      1645
          96       0.01      0.04      0.02      1208
         179       0.05      0.22      0.08      1196

    accuracy                           0.27     83013
   macro avg       0.26   

In [105]:
score = f1_score(y_test, predictions_fix2, average='weighted')
score

0.3112741554071716

In [106]:
accuracy_score(y_test, predictions_fix2)

0.27048775492995075

In [107]:
f1_score(y_test, predictions_fix2, average='macro')

0.22967388746564915