# Analysis of self distillation elements

## Importing

In [65]:
import pandas as pd
import numpy as np
from selfdist_toolkit.data_tools.loading import load_pure_data
from tqdm import tqdm

# Global variables

In [30]:
# Epoch value that fills the `epochs=` field of the teacher prediction file
# names read below (presumably the epoch count of the teacher run - confirm).
original_epochs = 100

# Loading of aid list data

In [31]:
# Read the "aid" column of the experiment summary and keep it as an integer array
aid_list = (
    pd.read_csv("results/random_forest/experiments_check/chem-desc_good-aid_1.csv")["aid"]
    .to_numpy()
    .astype(int)
)

In [32]:
# Show the loaded aid values
aid_list

array([    884,     891,     899,     914,    1418,    1431,    1770,
          1771,    1795,  493073,  493102,  493177,  493191,  493240,
        588834,  651741,  651812,  651814,  686978,  687022,  720691,
        743036,  743040,  743065, 1053173, 1259381, 1346982])

# Statistics of aid section

In [39]:
# Load the per-aid summary table, keeping only the id and the molecule count columns
df = pd.read_csv(
    "results/random_forest/experiments_check/chem-desc_good-aid_1.csv"
).loc[:, ["aid", "mol_total", "mol_active", "mol_inactive"]]

In [40]:
# Preview the first rows of the loaded summary table
df.head()

Unnamed: 0,aid,mol_total,mol_active,mol_inactive
34,884.0,9593,3274,6319
39,891.0,7457,1483,5974
45,899.0,7786,1782,6004
54,914.0,7340,216,7124
131,1418.0,1109,489,620


In [41]:
# Share of active molecules per aid (active / total), stored as a fraction rather than in percent
df["percentage_original"] = df["mol_active"] / df["mol_total"]

In [42]:
# Preview the table with the new percentage_original column
df.head()

Unnamed: 0,aid,mol_total,mol_active,mol_inactive,percentage_original
34,884.0,9593,3274,6319,0.341291
39,891.0,7457,1483,5974,0.198874
45,899.0,7786,1782,6004,0.228872
54,914.0,7340,216,7124,0.029428
131,1418.0,1109,489,620,0.440938


# Statistics total sd_elements

In [43]:
# For every aid: fraction of all sd elements whose teacher soft label is >= 0.5
sd_active_fractions = []
for current_aid in df.aid:
    # load the teacher predictions stored for this aid
    teacher_predictions = pd.read_csv(
        "results/teacher_exp/sd_out/teacher-pred_aid={}_epochs={}.csv".format(int(current_aid), original_epochs)
    )

    # threshold the soft labels at 0.5 to obtain binary labels
    soft_scores = teacher_predictions.predicted_label_soft.to_numpy()
    binary_labels = (soft_scores >= 0.5).astype(int)

    # number of predicted actives divided by the number of sd elements
    num_predicted_active = binary_labels.sum()
    sd_active_fractions.append(num_predicted_active / binary_labels.shape[0])

# store the collected fractions as a new column
df["percentage_total_sd"] = np.array(sd_active_fractions)

In [44]:
# Preview the table with the new percentage_total_sd column
df.head()

Unnamed: 0,aid,mol_total,mol_active,mol_inactive,percentage_original,percentage_total_sd
34,884.0,9593,3274,6319,0.341291,0.487905
39,891.0,7457,1483,5974,0.198874,0.187066
45,899.0,7786,1782,6004,0.228872,0.328
54,914.0,7340,216,7124,0.029428,0.045882
131,1418.0,1109,489,620,0.440938,0.148281


# Statistics sd_elements_first

In [49]:
# Multiplier applied to mol_total to get the number of sd elements selected per aid
num_first_factor = 1.

In [51]:
# For every aid: fraction of predicted actives (soft label >= 0.5) among the
# num_choose sd elements whose soft label lies furthest away from 0.5.
percentages_temp = []
for aid in df.aid:
    # define data path
    data_path = "results/teacher_exp/sd_out/teacher-pred_aid={}_epochs={}.csv".format(int(aid), original_epochs)

    # read csv
    df_sd = pd.read_csv(data_path)
    soft_labels = df_sd.predicted_label_soft.to_numpy()

    # get hard labels
    hard_labels = (soft_labels >= 0.5).astype(int)

    # num data to choose
    num_choose = int(num_first_factor * df[df.aid == aid].iloc[0].mol_total)

    # indices ordered by increasing distance of the soft label from 0.5
    confidence_order = np.argsort(np.abs(soft_labels - 0.5))

    # get label selection: the last num_choose entries of that ordering.
    # An explicit start index is used because a plain `[-num_choose:]` slice
    # would return every element when num_choose is 0 (-0 == 0).
    start = max(confidence_order.shape[0] - num_choose, 0)
    selected_labels = confidence_order[start:]

    # write percentage into temp list (NaN when nothing was selected, avoiding a 0/0 division)
    if selected_labels.shape[0] == 0:
        percentages_temp.append(np.nan)
    else:
        percentages_temp.append(hard_labels[selected_labels].sum() / selected_labels.shape[0])

# add column to dataframe
df["percentage_first_sd"] = np.array(percentages_temp)

In [52]:
# Preview the table with the new percentage_first_sd column
df.head()

Unnamed: 0,aid,mol_total,mol_active,mol_inactive,percentage_original,percentage_total_sd,percentage_first_sd
34,884.0,9593,3274,6319,0.341291,0.487905,0.039508
39,891.0,7457,1483,5974,0.198874,0.187066,0.001877
45,899.0,7786,1782,6004,0.228872,0.328,0.080144
54,914.0,7340,216,7124,0.029428,0.045882,0.000136
131,1418.0,1109,489,620,0.440938,0.148281,0.086564


# Statistics sd_elements_second

In [54]:
# Multiplier applied to mol_total to get the number of sd elements selected per aid.
# NOTE(review): the cell counters (In [54] here, In [53] for the loop below) suggest
# this assignment was executed after the loop - confirm the intended order.
num_first_factor = 1.

In [53]:
# For every aid: select the lowest-scored and highest-scored sd elements in the
# ratio given by percentage_original, then record the fraction of the selection
# whose soft label is >= 0.5.
percentages_temp = []
for aid in df.aid:
    # define data path
    data_path = "results/teacher_exp/sd_out/teacher-pred_aid={}_epochs={}.csv".format(int(aid), original_epochs)

    # read csv
    df_sd = pd.read_csv(data_path)
    soft_labels = df_sd.predicted_label_soft.to_numpy()

    # get hard labels
    hard_labels = (soft_labels >= 0.5).astype(int)

    # sort the elements according to their score (ascending)
    idx_sorted = np.argsort(soft_labels)

    # determine number of self distillation elements to fetch
    # (the summary row of this aid is looked up once and reused)
    aid_row = df[df.aid == aid].iloc[0]
    num_sd = int(num_first_factor * aid_row.mol_total)
    pos_elem_count = int(num_sd * aid_row.percentage_original)
    neg_elem_count = num_sd - pos_elem_count

    # select the lowest scored (negative) and highest scored (positive) elements.
    # The positive slice uses an explicit start index because `[-pos_elem_count:]`
    # would return the whole array when pos_elem_count is 0 (-0 == 0).
    # NOTE(review): if num_sd exceeds the number of sd elements, the two slices
    # overlap and elements are counted twice - confirm whether that can occur.
    pos_start = max(idx_sorted.shape[0] - pos_elem_count, 0)
    selected_labels = np.concatenate([idx_sorted[:neg_elem_count], idx_sorted[pos_start:]])

    # write percentage into temp list (NaN when nothing was selected, avoiding a 0/0 division)
    if selected_labels.shape[0] == 0:
        percentages_temp.append(np.nan)
    else:
        percentages_temp.append(hard_labels[selected_labels].sum() / selected_labels.shape[0])

# add column to dataframe
df["percentage_second_sd"] = np.array(percentages_temp)

In [55]:
# Preview the table with the new percentage_second_sd column
df.head()

Unnamed: 0,aid,mol_total,mol_active,mol_inactive,percentage_original,percentage_total_sd,percentage_first_sd,percentage_second_sd
34,884.0,9593,3274,6319,0.341291,0.487905,0.039508,0.341291
39,891.0,7457,1483,5974,0.198874,0.187066,0.001877,0.198874
45,899.0,7786,1782,6004,0.228872,0.328,0.080144,0.228872
54,914.0,7340,216,7124,0.029428,0.045882,0.000136,0.029428
131,1418.0,1109,489,620,0.440938,0.148281,0.086564,0.440938


# Check that the sd data does not overlap with the original data

In [66]:
# Verify for every aid that none of the original smiles occurs in the sd data
for aid in tqdm(df.aid):
    # get smiles list of experiment
    orig_smiles = load_pure_data(int(aid)).smiles.to_numpy().astype(str)

    # get sd_smiles
    # define data path
    data_path = "results/teacher_exp/sd_out/teacher-pred_aid={}_epochs={}.csv".format(int(aid), original_epochs)

    # read csv
    sd_smiles = pd.read_csv(data_path).smiles.to_numpy().astype(str)

    # count the original smiles that also appear among the sd smiles
    # (np.isin replaces np.in1d, which is deprecated as of NumPy 2.0)
    num_overlapping = int(np.isin(orig_smiles, sd_smiles).sum())

    # fail with a message that names the offending aid
    assert num_overlapping == 0, "aid {}: {} original smiles also occur in the sd data".format(
        int(aid), num_overlapping
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [01:57<00:00,  4.36s/it]


# Further ideas:
- hard sd labels (maybe precision errors)
- balance training data to 50-50 class labels
- random sd data selection
- less self distillation data
- more self distillation data
- increase model parameters
- increase dropout
- increase epochs (although currently probably not the issue)
- concatenate output of layered ginconv instead of jk last (or if it does not work then abort)
- validation early stopping
- loss on second y axis scale