In [2]:
import numpy as np
import pandas as pd
import os

# --- Notebook configuration ---------------------------------------------------
RANDSEED = 0   # seed constant for the stochastic steps of the notebook
N_TRAIN = 20   # number of instances kept in the initial training set
N_FOLDS = 2    # number of numbered fold files read per dataset

# Each dataset is a sub-folder of ./datasets that holds its .csv files.
root_folder = os.getcwd()
data_folder = os.path.join(root_folder, "datasets")

# os.listdir returns entries in arbitrary order and also lists plain files and
# hidden folders (e.g. notebook checkpoints). Keep only visible directories and
# sort them, so that the prototype subset below is the same on every machine.
dnames = sorted(
    name for name in os.listdir(data_folder)
    if os.path.isdir(os.path.join(data_folder, name)) and not name.startswith(".")
)
testing_dnames = dnames[:2]  # small subset used while prototyping
testing_dnames

['original-data', 'processed-data']

In [57]:
# NOTE: every iteration rebinds X_train / y_train / X_test / y_test, so only the
# data of the last dataset and last fold remain bound once the loops finish.
for data in testing_dnames:

    # folder of a single dataset; it contains all the per-fold .csv files
    data_file = os.path.join(data_folder, data)

    for i in range(N_FOLDS):
        # Fold files are numbered starting from 1. The four files are read in
        # the order X_train, y_train, X_test, y_test.
        X_train, y_train, X_test, y_test = (
            pd.read_csv(os.path.join(data_file, stem + str(i + 1) + ".csv"))
            for stem in ("X_new_train", "y_new_train", "X_new_test", "y_new_test")
        )

        if y_train.shape[1] == 2:  # survival case: labels have two columns
            # Cast the second-to-last label column to bool (needed for a correct
            # recarray) and build the structured array needed for RSF.
            y_train, y_test = (
                labels.astype({labels.columns[-2]: 'bool'}).to_records(index=False)
                for labels in (y_train, y_test)
            )


# Keep a random subset of N_TRAIN instances as the initial training set; all remaining instances form the pool, whose labels can then be (partially) masked

from sklearn.model_selection import train_test_split

# The seed comes from the notebook-wide RANDSEED constant instead of a second,
# hard-coded value, so that all stochastic steps are controlled from one place.
# NOTE: this cell rebinds X_train / y_train to the reduced set; re-running it
# without reloading the data would split the already-reduced set again.
X_train, X_pool, y_train, y_pool = train_test_split(
    X_train, y_train, train_size=N_TRAIN, random_state=RANDSEED
)
print(X_train.index)



Int64Index([37, 81, 46, 39, 65, 58, 12, 92, 88, 70, 87, 36, 21, 83, 9, 100, 67,
            64, 47, 44],
           dtype='int64')


In [58]:

def additional_censoring(y_pool, amount_masking=1, intensity=1, random_state=None,
                         verbose=True, time_field="Survival", event_field="Status"):

    r'''
    Artificially censor (mask) part of the labels of a survival pool.

    A masked instance gets its event field set to 0 / False and its time field
    shortened by the censoring intensity. The input array is not modified.

    - y_pool: np.recarray, it's the pool dataset for active learning querying, part of it
        (or all of it) might be masked (partially or fully masked). It must contain the
        fields named by `time_field` and `event_field`.
    - amount_masking:
        - if 0 < amount_masking <= 1 -> probability of masking each instance of y_pool.
          amount_masking=1 -> all instances are masked
        - if int > 1 -> exact number of masked instances, has to be at most np.shape(y_pool)[0]

    - intensity: amount of the 'information loss' due to masking
            - if (float, int) -> information loss is constant across masked instances: given an
            instance with observed values (T_i, \delta_i) the new label will be (T_i*(1-intensity), 0)
            - if (list, tuple) -> needs to be of the form (e1, e2), length 2. The information loss
            in this case is not constant, it is drawn per instance as intensity ~ Uniform(e1, e2).

    - random_state: seed for deterministic generation, good for debugging
    - verbose: if True (default) print the original labels, the censored labels and the mask
    - time_field, event_field: names of the time and event-indicator fields of y_pool

    Returns (y_pool2, intens_censoring): the censored copy of y_pool and the length-N vector of
    intensities (it holds a value for every instance, masked or not).

    Raises:
        AssertionError: y_pool is not a np.recarray
        KeyError: amount_masking is out of range, or intensity is a list/tuple of length != 2
        TypeError: intensity is neither a number nor a list/tuple
    '''

    import random

    assert isinstance(y_pool, np.recarray), "y_pool must be a np.recarray"

    # Private generators: the function stays deterministic for a given
    # random_state without resetting the global `random` / `np.random` state
    # that the rest of the notebook relies on.
    py_rng = random.Random(random_state)
    np_rng = np.random.RandomState(random_state)

    N = y_pool.shape[0]

    # mask_vector: False if the instance stays visible, True if it is masked
    if 0 < amount_masking <= 1:
        mask_vector = np_rng.rand(N) < amount_masking
    elif isinstance(amount_masking, (int, np.integer)) and 1 < amount_masking <= N:
        # Draw the masked positions ONCE, so that exactly `amount_masking`
        # instances end up masked.
        mask_vector = np.zeros(N, dtype=bool)
        mask_vector[py_rng.sample(range(N), int(amount_masking))] = True
    else:
        raise KeyError("amount_masking is not a float or int in the expected range")

    # checking validity of intensity input.
    # TODO: distributions with parameters should be accepted as inputs
    if isinstance(intensity, (float, int, np.number)):
        intens_censoring = np.ones(N) * intensity  # same intensity for every instance
    elif isinstance(intensity, (tuple, list)):
        if len(intensity) != 2:
            raise KeyError("Intensity variable is a tuple/list of "
                           "length: {:d}, expected 2 instead".format(len(intensity)))
        # one intensity per instance, drawn from Uniform(e1, e2)
        intens_censoring = np_rng.uniform(intensity[0], intensity[1], N)
    else:
        raise TypeError("intensity must be a number or a (e1, e2) tuple/list, "
                        "got {}".format(type(intensity).__name__))

    y_pool2 = y_pool.copy()
    # non-masked instances are multiplied by 1, i.e. left untouched
    y_pool2[time_field] = y_pool[time_field] * (1 - intens_censoring * mask_vector)
    y_pool2[event_field] = y_pool[event_field] * ~mask_vector  # sets to False all masked observations

    if verbose:
        print(y_pool)
        print("="*50)
        print(y_pool2)
        print(mask_vector)

    return y_pool2, intens_censoring


# Mask every label of the pool (amount_masking=1) with full intensity
# (intensity=1), using a fixed seed.
y_pool2, intens_censoring = additional_censoring(
    y_pool, amount_masking=1, intensity=1, random_state=0
)

[(False,  3.        ) ( True, 27.        ) (False,  4.        )
 ( True, 41.        ) ( True, 26.        ) (False,  0.25      )
 (False,  0.5       ) ( True, 25.        ) ( True, 34.        )
 (False,  0.75      ) ( True, 33.        ) (False, 15.        )
 ( True, 34.        ) ( True, 36.        ) ( True, 57.        )
 ( True, 49.        ) ( True, 31.        ) ( True, 25.        )
 ( True, 23.        ) ( True, 12.        ) ( True, 17.        )
 (False,  0.75      ) (False,  7.        ) ( True, 33.        )
 (False,  0.5       ) ( True, 25.        ) (False, 40.        )
 (False,  2.        ) ( True, 29.        ) (False, 10.        )
 (False,  0.5       ) (False,  2.96475681) ( True, 12.        )
 ( True, 46.        ) (False, 20.        ) ( True, 53.        )
 ( True, 34.        ) ( True, 38.        ) ( True, 15.        )
 ( True, 33.        ) (False,  0.75      ) ( True, 36.        )
 ( True, 16.        ) ( True, 31.        ) (False, 21.        )
 ( True, 21.        ) ( True, 41.       

In [86]:
import random
# Sanity check: a position drawn in X_pool must address the matching label in
# y_pool, and the row's `name` is its original DataFrame index label.
random.seed(a=None)  # no fixed seed: the pick differs between runs
print(X_pool.iloc[:3])
print(y_pool[:3])
print("=" * 30)

# draw a position (iloc style) among the first three pool rows
query_index = random.choice(range(X_pool.iloc[:3].shape[0]))
print(X_pool.iloc[query_index])

# index label of the drawn row
idx = X_pool.index[query_index]
print(idx)
print(y_pool[query_index])


          age  pericardialeffusion  fractionalshortening  epss      lvdd  \
26  64.063229                  0.0              0.203306  12.0  4.937163   
61  62.000000                  0.0              0.220000  12.1  3.920000   
2   58.000000                  0.0              0.170000  28.9  6.730000   

    wallmotion-score  wallmotion-index   mult     group  
26              6.00              3.00  0.140  2.000000  
61             11.00              1.00  0.785  1.783385  
2              26.08              2.01  0.928  2.000000  
[(False,  3.) ( True, 27.) (False,  4.)]
age                     62.000000
pericardialeffusion      0.000000
fractionalshortening     0.220000
epss                    12.100000
lvdd                     3.920000
wallmotion-score        11.000000
wallmotion-index         1.000000
mult                     0.785000
group                    1.783385
Name: 61, dtype: float64
61
(True, 27.)


## Procedure:

`X_train` starts with $N$ (N=10?) samples, and new samples from `X_pool` will be queried one at a time.
The model is retrained at each iteration.
Every $M$ (M=5?) queried samples, we report the performance and the learning curve.

### Initial set-up:

The labels of the instances in `X_pool` are fully censored, so no information about them is available to the model in the beginning.
Once an instance is queried, its originally masked label $(T, \delta)$ is revealed and the instance is added to the `X_train` batch.

In [33]:
# X_drop = X_train.loc[1,:]
# print(X_train.head(5))
# # print(X_drop.name)
# # print(X_drop.loc[1])
# # print(X_train.head(5))
# X_train.drop([X_drop.name], axis=0, inplace=True)
# print(X_train.head)


    age  pericardialeffusion  fractionalshortening       epss      lvdd  \
0  66.0                  0.0                  0.29  15.600000  6.150000   
1  70.0                  1.0                  0.27   4.700000  4.490000   
2  58.0                  0.0                  0.17  28.900000  6.730000   
3  61.0                  1.0                  0.27  12.077119  4.850156   
4  78.0                  0.0                  0.06  16.100000  5.620000   

   wallmotion-score  wallmotion-index   mult  group  
0             14.00             1.000  1.000    2.0  
1             22.00             2.000  0.786    2.0  
2             26.08             2.010  0.928    2.0  
3              9.00             1.500  0.428    2.0  
4             13.67             1.367  0.714    2.0  
<bound method NDFrame.head of           age  pericardialeffusion  fractionalshortening       epss      lvdd  \
0   66.000000                  0.0              0.290000  15.600000  6.150000   
2   58.000000                  0.

In [98]:
# Model = 

from typing import Optional, Union
import random

class SamplingQuery:

    RANDSEED = 0
    import random

    def __init__(self,
                 X_train: pd.DataFrame,
                 y_train: np.ndarray | np.recarray,
                 X_pool: pd.DataFrame,
                 y_pool: np.ndarray | np.recarray,
                 ):
        self.X_train = X_train
        self.y_train = y_train
        self.X_pool = X_pool
        self.y_pool = y_pool

    def update_train_pool(self, X_queried: pd.DataFrame|pd.Series,
                                y_queried: np.ndarray|np.recarray):
        ''' given the selcted instance (X_queried, y_queried), drop it 
        from (X_pool, y_pool) and append it to (X_train, y_train) '''

        self.X_train = pd.concat([self.X_train, X_queried])
        print(self.y_train.shape)
        print(y_queried.shape)
        self.y_train = np.concatenate([self.y_train, np.array(y_queried)])
        
        # X_queried is now a pd.Series whose 'name' field (should) correspond to the original df index
        self.X_pool = self.X_pool.drop([X_queried.name], axis=0)
        self.y_pool = np.delete(self.y_pool, [y_queried.name], axis=0), 
        # self.y_pool = self.y_pool.drop([y_queried.name], axis=0)

        return print("updated")
    
    def random_sampling_query(self, random_state:Optional[int]=None) -> tuple[pd.DataFrame, pd.Series]:
        random.seed(a=RANDSEED) #overrides random seed within random library
    
        #select random row, output as single raw df or index only or...
        
        # (random) select index based on position (iloc style)
        query_list = list(range(self.X_pool.shape[0]))
        query_index = random.choice(query_list)
    
        return self.X_pool.iloc[query_index], self.y_pool[query_index]
    
    def uncertainty_based_query(self, clf, populat_average: int|float) -> tuple[pd.DataFrame, pd.Series]:

        proximity_to_average = clf.predict(self.X_pool) - populat_average # the smaller, the most uncertain (in a way)

        return self.X_pool.loc[np.argmin(proximity_to_average)]

    def variance_based_query(self, clf, uncertainty_measure) -> tuple[pd.DataFrame, pd.Series]:


        return self.X_pool.loc[0]


from sklearn.ensemble import RandomForestClassifier
from sksurv.ensemble import RandomSurvivalForest

# Small random survival forest, fitted on the initial training set.
rsf = RandomSurvivalForest(random_state=0, max_depth=5, n_estimators=5)
rsf.fit(X_train, y_train)

# in theory, loop along here:
# average of the model predictions over the training set
pop_average = np.mean(rsf.predict(X_train))

# active-learning state: training set and pool, features and labels
ActiveLearn = SamplingQuery(X_train=X_train, y_train=y_train,
                            X_pool=X_pool, y_pool=y_pool)


In [99]:
# Fit on the training set currently held by the active learner.
rsf.fit(ActiveLearn.X_train, ActiveLearn.y_train)
print("N* instances:", ActiveLearn.X_train.shape)
print("performance:", rsf.score(X_test, y_test))

# Query one pool instance and move it, with its label, to the training set.
X_queried, y_queried = ActiveLearn.random_sampling_query(random_state=0)
print(X_queried, y_queried)
ActiveLearn.update_train_pool(X_queried, y_queried)

# Refit and report on the sets stored in ActiveLearn: update_train_pool only
# rebinds the object's attributes, the module-level X_train / X_pool never change.
rsf.fit(ActiveLearn.X_train, ActiveLearn.y_train)
print("N* instances:", ActiveLearn.X_train.shape)
print("N* in pool:", ActiveLearn.X_pool.shape)
print("performance:", rsf.score(X_test, y_test))

N* instances: (20, 9)
performance: 0.49333333333333335
age                     69.000
pericardialeffusion      0.000
fractionalshortening     0.150
epss                    12.000
lvdd                     5.390
wallmotion-score        19.500
wallmotion-index         1.625
mult                     0.857
group                    1.000
Name: 40, dtype: float64 (False, 0.75)
(20,)
()


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 0 dimension(s)