# Code for LAL_RL

In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential, layers
from tensorflow.keras.optimizers import Adam
from keras.utils.vis_utils import plot_model
import numpy as np
import matplotlib.pyplot as plt
import copy
import random

In [2]:
class DQN:
    """
    Small fully connected Keras network used as Q-function approximator.

    The network takes a vector of length ``candidate_size + 3`` and emits a
    single scalar. Two hidden layers with 80 sigmoid units each are followed
    by a linear output unit; the model is compiled with Adam and an MSE loss.
    """
    def __init__(self, candidate_size, learning_rate=0.0001):
        # the input is wider than the candidate set by three entries
        input_width = candidate_size + 3

        first_hidden = layers.Dense(80, input_shape=(input_width,), activation="sigmoid", name="layer1")
        second_hidden = layers.Dense(80, activation="sigmoid", name="layer2")
        q_output = layers.Dense(1, name="layer3")
        self.q_network = Sequential([first_hidden, second_hidden, q_output])

        optimizer = Adam(learning_rate=learning_rate)
        self.q_network.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])

    def fit(self, X, y):
        """Train the wrapped network on inputs X and targets y (Keras defaults)."""
        self.q_network.fit(X, y)

    def predict(self, X):
        """Return the network output for every row of X."""
        outputs = self.q_network.predict(X)
        return outputs

    def get_weights(self):
        """Return the current weights of the wrapped network."""
        weights = self.q_network.get_weights()
        return weights

    def set_weights(self, weights):
        """Overwrite the weights of the wrapped network."""
        self.q_network.set_weights(weights)

In [3]:
class Environment:
    """
    Creates states and actions from the given parameters
    according to sections 3.2.1 and 3.2.2 of the paper.

    distance: callable taking two samples and returning their distance
    measure:  name of the score used to describe samples; only
              "zero_probability" is supported
    """
    def __init__(self, distance, measure="zero_probability"):
        self.measure = measure
        self.distance = distance

    def _check_measure(self):
        """Raise ValueError if the configured measure is not supported."""
        if self.measure != "zero_probability":
            raise ValueError(f"provided measure '{self.measure}' is not supported")

    def _mean_distance(self, sample, data_set):
        """Average distance between sample and every element of data_set."""
        return sum(self.distance(sample, x) for x in data_set) / len(data_set)

    def create_state(self, data, model):
        """
        Returns the sorted vector of first-column values of
        model.predict_proba(data), one entry per sample in data.

        Raises ValueError for an unsupported measure.
        """
        self._check_measure()
        prediction = model.predict_proba(data)
        state_vec = np.array([x[0] for x in prediction])
        return np.sort(state_vec)

    def create_action(self, sample, model, labeled_set, unlabeled_set):
        """
        Returns the action vector [score, dis_lab, dis_unlab] of a sample:
        its first-column predict_proba value and its average distance to
        the labeled and to the unlabeled set.

        Raises ValueError for an unsupported measure.
        """
        self._check_measure()
        score = model.predict_proba([sample])[0][0]

        # average distance to the labeled data set
        dis_lab = self._mean_distance(sample, labeled_set)

        # average distance to the unlabeled data set
        dis_unlab = self._mean_distance(sample, unlabeled_set)

        return np.array([score, dis_lab, dis_unlab])

In [4]:
class Agent:
    """
    Q-learning agent holding an online network, a target network and an
    epsilon-greedy exploration strategy.

    Experiences handed to train() are tuples of the form
    (state, action, reward, new state, is end state, possible actions in new state).
    """
    def __init__(self, q_network, discount=0.999):
        self.discount = discount
        self.iteration = 0
        self.current_state = None
        self.q_network = q_network
        # the target network starts as an independent copy of the online network
        self.target_network = copy.deepcopy(q_network)

        # TODO: support own parameters for epsilon greedy strategy
        self.strategy = Epsilon_Greedy_Strat(1, 0, 0.001)

    def set_starting_state(self, state):
        """Remember state as the state the next action is chosen in."""
        self.current_state = state

    def reset(self):
        """Restart the iteration counter that drives the exploration schedule."""
        self.iteration = 0

    def select_action(self, possible_actions):
        """
        Returns the index of the chosen action inside possible_actions:
        a random index when exploring, otherwise the index with the highest
        predicted Q-value for the current state.
        """
        self.iteration += 1

        if not self.strategy.explore(self.iteration):
            # exploit: score every (state, action) pair with the online network
            pairs = [np.concatenate((self.current_state, candidate)) for candidate in possible_actions]
            q_values = self.q_network.predict(np.array(pairs))
            best_per_column = np.argmax(q_values, axis=0)
            return best_per_column[0]

        # explore aka pick a random action
        return random.randrange(len(possible_actions))

    def train(self, experiences):
        """
        Fits the online network on a batch of experiences. The target of an
        experience is its reward, plus the discounted best target-network
        Q-value of the following state unless that state is an end state.
        """
        inputs = np.array([np.concatenate((exp[0], exp[1])) for exp in experiences])
        rewards = np.array([exp[2] for exp in experiences])

        targets = []
        for reward, exp in zip(rewards, experiences):
            if exp[4]:
                # end state: nothing follows, the reward is the whole target
                targets.append(reward)
            else:
                # add gamma*maxQ(s_t+1, a_t+1)
                future = self.get_max_q_value_target(exp[3], exp[5])
                targets.append(reward + self.discount * future)

        self.q_network.fit(inputs, np.array(targets))

    def get_max_q_value_target(self, state, possible_actions):
        """Returns the largest target-network Q-value over possible_actions in state."""
        pairs = [np.concatenate((state, candidate)) for candidate in possible_actions]
        q_values = self.target_network.predict(np.array(pairs))
        return max(row[0] for row in q_values)

    def update_target_network(self):
        """Copies the weights of the online network into the target network."""
        current_weights = self.q_network.get_weights()
        self.target_network.set_weights(current_weights)
            
        
class Epsilon_Greedy_Strat:
    """
    Epsilon-greedy schedule: the exploration probability starts at `start`,
    drops by `decay` per iteration and never falls below `_min`.
    """
    def __init__(self, start, _min, decay):
        self.start, self._min, self.decay = start, _min, decay

    def explore(self, interration):
        """
        returns true if the agent should explore and false if the agent should exploit
        """
        # exploration probability for this iteration, floored at the minimum
        epsilon = max(self.start - self.decay * interration, self._min)
        draw = random.random()
        return draw < epsilon

In [5]:
class ReplayMemory:
    """
    stores the experience of agent of the form:
        (state, taken action, reward, new state,
        if new state is end state, possible actions in new state)

    Holds at most `capacity` entries; once full, the oldest entry is
    overwritten first.
    """
    def __init__(self, capacity):
        """Raises ValueError if capacity is smaller than 1."""
        if capacity < 1:
            raise ValueError("capacity must be at least 1")
        self.capacity = capacity
        self.memory = []
        # position that the next pushed element is written to
        self.index = 0

    def __len__(self):
        return len(self.memory)

    def push(self, elem):
        """Stores a deep copy of elem, replacing the oldest entry when full."""
        # copy so later changes to the caller's objects cannot alter the memory
        entry = copy.deepcopy(elem)
        if len(self.memory) < self.capacity:
            self.memory.append(entry)
        else:
            self.memory[self.index] = entry

        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        """
        Returns batch_size distinct stored entries chosen at random.
        random.sample raises ValueError if fewer entries are stored.
        """
        return random.sample(self.memory, batch_size)

    def can_provide(self, size):
        """Returns True if at least `size` entries are stored."""
        return len(self.memory) >= size

In [6]:
class LAL_RL:
    """
    Learns a Q-function for picking which unlabeled sample to label next.

    The data is split into an evaluation part, an unlabeled pool and an
    initially labeled part. In every episode a copy of `model` is retrained
    after each newly labeled sample until its accuracy on the evaluation part
    reaches the target quality or the unlabeled pool gets too small. Every
    step is stored in a replay memory with reward -1 and used to train the
    agent's DQN.
    """
    # TODO: support multiple Datasets, own replay memory size, other target quality besides accuracy
    def __init__(self, X, y, eval_ratio, candidate_size, initially_labeled, model, target_quality, distance,
                measure="zero_probability", batch_size=32, update_rate=100):
        """
        X, y:              features and labels; stored column-stacked, label last
        eval_ratio:        fraction of the samples used for evaluation
        candidate_size:    number of unlabeled samples offered per step
        initially_labeled: number of samples that start out labeled
        model:             estimator offering fit, predict and predict_proba;
                           it is deep-copied and never trained itself
        target_quality:    fraction of the all-data accuracy that ends an episode
        distance:          callable returning the distance between two samples
        measure:           score passed on to Environment
        batch_size:        replay batch size used to train the DQN
        update_rate:       the target network is synced every update_rate episodes

        Raises ValueError if the data cannot be split as requested.
        """
        self.all_data = np.column_stack((X, y))

        n_samples = len(self.all_data)
        self.eval_data_ind = round(n_samples * eval_ratio)
        self.unlab_data_ind = n_samples - initially_labeled
        if self.unlab_data_ind <= self.eval_data_ind:
            raise ValueError("Not enough samples to split the data properly")
        # an episode labels one sample and then still has to draw a full
        # candidate set, so the pool must be strictly larger than that set
        if self.unlab_data_ind - self.eval_data_ind <= candidate_size:
            raise ValueError("Not enough unlabeled samples for the given candidate size")

        self.split_data()

        self.environment = Environment(distance, measure)
        self.agent = Agent(DQN(candidate_size))
        self.model = model
        self.candidate_size = candidate_size

        # find out the accuracy when all data is labeled
        tmp_model = copy.deepcopy(model)
        tmp_model.fit(X[self.eval_data_ind:], y[self.eval_data_ind:])
        accuracy = self._accuracy(tmp_model, X[:self.eval_data_ind], y[:self.eval_data_ind])

        # set target quality to be a portion of the accuracy when all data is labeled
        self.target_quality = target_quality * accuracy

        self.replay_memory = ReplayMemory(10000)
        self.batch_size = batch_size
        self.update_rate = update_rate

    def learn_q_function(self, episodes):
        """
        Runs the given episodes. `episodes` is either the number of episodes
        or an iterable of episode indices.
        """
        if isinstance(episodes, (int, np.integer)):
            episodes = range(episodes)

        for i in episodes:
            self.learn_episode()

            if i % self.update_rate == 0:
                # copy the weight of the q_network to the target network
                self.agent.update_target_network()

            self.split_data()
            self.agent.reset()

    def learn_episode(self):
        """
        Plays one episode on the current data split and trains the DQN on
        replayed experiences after every step.
        """
        model = copy.deepcopy(self.model)
        self._fit_on_labeled(model)

        state, possible_actions, idx = self._observe(model)
        self.agent.set_starting_state(state)

        # the evaluation split does not change during an episode
        X_test, y_test = self.get_X_y(self.eval_data)

        done = False
        while not done:
            # choose an action
            action_idx = self.agent.select_action(possible_actions)
            action = possible_actions[action_idx]

            # update labeled and unlabeled dataset
            self.label_sample(idx[action_idx])

            # retrain the model
            self._fit_on_labeled(model)

            # create the next state and its possible actions
            new_state, possible_actions, idx = self._observe(model)

            # calculate accuracy for the newly trained model
            accuracy = self._accuracy(model, X_test, y_test)

            # test if target quality has been reached or if there is not enough unlabeled data
            done = bool(accuracy >= self.target_quality) or len(self.unlab_data) <= self.candidate_size

            # save the experience
            self.replay_memory.push((state, action, -1, new_state, done, possible_actions))

            # update state
            state = new_state
            self.agent.current_state = state

            # sample a batch (if possible) and train the DQN
            # TODO: consider the TD-Error when sampling
            if self.replay_memory.can_provide(self.batch_size):
                self.agent.train(self.replay_memory.sample(self.batch_size))

    def _fit_on_labeled(self, model):
        """Fits model on the currently labeled data."""
        X, y = self.get_X_y(self.lab_data)
        # pass 1-D labels, the same form __init__ uses for the reference model
        model.fit(X, np.ravel(y))

    def _observe(self, model):
        """
        Draws a candidate set from the unlabeled pool and returns
        (state, possible actions, indices of the candidates in the pool).
        """
        candidates, idx = self.random_2D_sample(self.unlab_data, self.candidate_size)
        cand_X, _ = self.get_X_y(candidates)
        # distances are taken between feature vectors, so strip the label column
        lab_X, _ = self.get_X_y(self.lab_data)
        unlab_X, _ = self.get_X_y(self.unlab_data)

        state = self.environment.create_state(cand_X, model)
        actions = [self.environment.create_action(x, model, lab_X, unlab_X) for x in cand_X]
        return state, actions, idx

    @staticmethod
    def _accuracy(model, X, y):
        """Fraction of samples in X for which model.predict equals y."""
        # flatten both sides: comparing (n,) with (n, 1) would broadcast to (n, n)
        predictions = np.ravel(model.predict(X))
        return float(np.mean(predictions == np.ravel(y)))

    @staticmethod
    def random_2D_sample(array, size):
        """
        sample a total of 'size' distinct random 1-D arrays out of a 2-D array

        Returns the sampled rows and their indices in array.
        Raises ValueError if size exceeds the number of rows.
        """
        if size > len(array):
            raise ValueError("Tried to sample more data than existed")
        # without replacement, so no row is offered twice
        idx = np.random.choice(len(array), size=size, replace=False)
        return array[idx, :], idx

    def get_X_y(self, data):
        """
        if X and y are united in data, return X and y separated

        y is returned as a column of shape (n_samples, 1).
        """
        n_features = np.asarray(data).shape[1] - 1
        return np.delete(data, n_features, 1), np.delete(data, range(n_features), 1)

    def label_sample(self, idx):
        """
        moves the sample self.unlab_data[idx] to self.lab_data
        """
        sample = self.unlab_data[idx]
        self.unlab_data = np.delete(self.unlab_data, idx, 0)
        self.lab_data = np.concatenate((self.lab_data, np.expand_dims(sample, axis=0)))

    def split_data(self):
        """
        distributes the data among evaluation, unlabeled and labeled data

        Shuffles self.all_data in place before slicing it.
        """
        np.random.shuffle(self.all_data)
        self.eval_data = self.all_data[:self.eval_data_ind]
        self.unlab_data = self.all_data[self.eval_data_ind:self.unlab_data_ind]
        self.lab_data = self.all_data[self.unlab_data_ind:]

# Only for testing purposes

In [27]:
# Scratch check of the "conditional expression inside a comprehension" pattern:
# every element of a is increased by 1 where the matching flag in b is True
# and left unchanged otherwise.
a = np.array([1,2,3,4])
b = np.array([True, False, False, True])

# the value of this last expression is what the notebook cell displays
np.array([a[i]+1 if b[i] else a[i] for i in range(len(a))])

array([2, 2, 3, 5])

In [28]:
# Toy data for manual experiments: 10 samples with 3 features each,
# one label per sample in y and one hand-written prediction per sample in pred.
X = np.array([[2,3,4],
              [1,5,3],
              [5,3,3],
              [5,3,3],
              [5,3,4],
              [2,7,9],
              [5,3,1],
              [7,1,4],
              [3,2,9],
              [2,2,1]])
y = np.array([0,0,1,1,0,0,1,1,1,1])
pred = np.array([0,1,1,1,0,0,1,1,0,1])

# quick check of the modulo operator; this is the value the cell displays
5%100

# disabled experiments: appending a row to X, and constructing LAL_RL on the
# toy data (note that the call passes None as the model)
# np.concatenate((X, np.expand_dims(np.array([1,1,1]), axis=0)))
# test = LAL_RL(X,y,0.5,3,2,None,0.98, lambda x,y:(x-y)**2)

5

In [26]:
# Scratch check of the output shape of a Keras network with a single output
# unit: 3 inputs, one hidden layer of 5 sigmoid units, 1 linear output.
# The network is neither compiled nor trained here, so the printed numbers
# come from its initial weights.
network = Sequential(
            [
                layers.Dense(5, input_shape=(3,), activation="sigmoid", name="layer1"),
#                 layers.Dense(80, activation="sigmoid", name="layer2"),
                layers.Dense(1, name="layer3"),
            ]
        )
# print(network.get_weights())
# plot_model(network, show_shapes=True, show_layer_names=True)
# predict on three 3-feature rows; the result has one single-element row per input
pred = network.predict(np.array([np.array([1,2,3]),np.array([3,2,1]),np.array([5,2,4])]))
print(pred)
# largest prediction, taken the same way Agent.get_max_q_value_target does
max([x[0] for x in pred])

[[-0.5097586 ]
 [-0.07278678]
 [-0.1229156 ]]


-0.07278678