# Fully modularized experience object

- The purpose of this object is to make a flexible environment that can consist of many stages.
- Each stage contains a list of tasks with a smooth transition of task sampling probability.
- Each task contains a progress counter that determines the speed of corpus opening.

In [None]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np

In [None]:
class Task:
    """One training task together with its linear corpus-opening schedule."""

    def __init__(self, name: str, progress_start: float, progress_slope: float):
        """
        name: identifier of the task
        progress_start: progress (in %) at sample 0
        progress_slope: progress gained per sample (in %/sample)
        """
        self.name, self.progress_start, self.progress_slope = (
            name,
            progress_start,
            progress_slope,
        )

    def get_progress(self, sample: float) -> float:
        """Return the progress (in %) reached after `sample` samples, capped at 100."""
        gained = sample * self.progress_slope
        uncapped = self.progress_start + gained
        if uncapped > 100:
            return 100
        return uncapped

In [None]:
# Shared corpus-opening schedule used by every Task below.
p_start = 2  # progress (%) a task has when it is first introduced
p_slope = 1/40000  # progress gained per sample (%/sample)

# Number of samples in the oral, transition and reading stages respectively.
n1 = 1_800_000
n2 = 800_000
n3 = 52_000_000  # not used by the offsets below

# Starting progress of the oral tasks in stage 2: where they end after n1 samples.
o_start2 = p_start + n1 * p_slope
# Starting progress of the oral tasks in stage 3: a further n2 samples later.
o_start3 = o_start2 + n2 * p_slope
# Starting progress of the reading tasks in stage 3: they start at p_start in
# stage 2 and have advanced for n2 samples.
r_start3 = p_start + n2 * p_slope

In [None]:

# One Task object per (task, stage) pair. Tasks that recur in a later stage
# reuse the same name and slope but start from the progress carried over from
# the previous stage (o_start2 / o_start3 / r_start3).

# Stage 1: Oral
# All four oral tasks start at p_start.
pho_sem_1 = Task(name="pho_sem", progress_start=p_start, progress_slope=p_slope)
sem_pho_1 = Task(name="sem_pho", progress_start=p_start, progress_slope=p_slope)
sem_sem_1 = Task(name="sem_sem", progress_start=p_start, progress_slope=p_slope)
pho_pho_1 = Task(name="pho_pho", progress_start=p_start, progress_slope=p_slope)

# Stage 2: Transition
# Oral tasks continue from o_start2; the newly introduced ort_* and triangle
# tasks start at p_start.
pho_sem_2 = Task(name="pho_sem", progress_start=o_start2, progress_slope=p_slope)
sem_pho_2 = Task(name="sem_pho", progress_start=o_start2, progress_slope=p_slope)
sem_sem_2 = Task(name="sem_sem", progress_start=o_start2, progress_slope=p_slope)
pho_pho_2 = Task(name="pho_pho", progress_start=o_start2, progress_slope=p_slope)
ort_pho_2 = Task(name="ort_pho", progress_start=p_start, progress_slope=p_slope)
ort_sem_2 = Task(name="ort_sem", progress_start=p_start, progress_slope=p_slope)
triangle_2 = Task(name="triangle", progress_start=p_start, progress_slope=p_slope)

# Stage 3: Reading
# Oral tasks continue from o_start3; ort_* and triangle continue from r_start3.
pho_sem_3 = Task(name="pho_sem", progress_start=o_start3, progress_slope=p_slope)
sem_pho_3 = Task(name="sem_pho", progress_start=o_start3, progress_slope=p_slope)
sem_sem_3 = Task(name="sem_sem", progress_start=o_start3, progress_slope=p_slope)
pho_pho_3 = Task(name="pho_pho", progress_start=o_start3, progress_slope=p_slope)
ort_pho_3 = Task(name="ort_pho", progress_start=r_start3, progress_slope=p_slope)
ort_sem_3 = Task(name="ort_sem", progress_start=r_start3, progress_slope=p_slope)
triangle_3 = Task(name="triangle", progress_start=r_start3, progress_slope=p_slope)


In [None]:

class Stage:
    """A training stage: a set of tasks whose sampling probabilities change
    linearly from a start mix to an end mix over `stage_sample` samples.
    """

    def __init__(self, name: str, tasks: List["Task"], stage_sample: int, task_probability_start: List[float], task_probability_end: List[float]):
        """
        name: stage identifier (used in plot titles)
        tasks: tasks that can be drawn during this stage
        stage_sample: number of samples the stage lasts
        task_probability_start: per-task sampling probability at sample 0
        task_probability_end: per-task sampling probability at sample `stage_sample`

        Raises ValueError if the two probability lists do not have exactly one
        entry per task.
        """
        # Fail early: zip() would otherwise silently drop the unmatched entries
        # and the mismatch would only surface later, at draw time.
        if not (len(tasks) == len(task_probability_start) == len(task_probability_end)):
            raise ValueError(
                f"Stage '{name}': expected one start and one end probability per task, got "
                f"{len(tasks)} tasks, {len(task_probability_start)} start and "
                f"{len(task_probability_end)} end probabilities"
            )

        self.name = name
        self.tasks = tasks
        self.stage_sample = stage_sample
        self.task_probability_start = task_probability_start
        self.task_probability_end = task_probability_end

    def get_task_probability(self, sample: int) -> List[float]:
        """Return the sampling probability of every task at a given sample.

        sample: number of samples counted from the start of this stage
        Returns one probability per task, in the same order as `self.tasks`.
        Each value is linearly interpolated between its start and end
        probability; np.interp holds the boundary value outside
        [0, stage_sample].
        """
        xp = [0, self.stage_sample]
        return [
            np.interp(sample, xp=xp, fp=[start, end])
            for start, end in zip(self.task_probability_start, self.task_probability_end)
        ]

    def draw_task(self, sample: int) -> "Task":
        """Randomly draw the task to run at a given within-stage sample."""
        p = self.get_task_probability(sample)
        # Draw an index rather than the object itself so numpy never has to
        # coerce the task list into an array.
        idx = np.random.choice(len(self.tasks), p=p)
        return self.tasks[idx]





In [None]:
# Stage lengths reuse n1 / n2 / n3 so they cannot drift away from the progress
# offsets (o_start2, o_start3, r_start3) that were derived from the same numbers.

# Oral stage: constant task mix.
oral_stage = Stage(
    name="oral",
    tasks=[pho_sem_1, sem_pho_1, sem_sem_1, pho_pho_1],
    stage_sample=n1,
    task_probability_start=[0.4, 0.4, 0.1, 0.1],
    task_probability_end=[0.4, 0.4, 0.1, 0.1],
)

# Transition stage: starts from the oral mix and linearly shifts probability
# towards the ort_* and triangle tasks.
transition_stage = Stage(
    name="transition",
    tasks=[pho_sem_2, sem_pho_2, sem_sem_2, pho_pho_2, ort_pho_2, ort_sem_2, triangle_2],
    stage_sample=n2,
    task_probability_start=[0.4, 0.4, 0.1, 0.1, 0, 0, 0],
    task_probability_end=[0.2, 0.2, 0.05, 0.05, 0.1, 0.1, 0.3],
)

# Reading stage: constant mix equal to the end of the transition stage.
reading_stage = Stage(
    name="reading",
    tasks=[pho_sem_3, sem_pho_3, sem_sem_3, pho_pho_3, ort_pho_3, ort_sem_3, triangle_3],
    stage_sample=n3,
    task_probability_start=[0.2, 0.2, 0.05, 0.05, 0.1, 0.1, 0.3],
    task_probability_end=[0.2, 0.2, 0.05, 0.05, 0.1, 0.1, 0.3],
)

In [None]:

class Experience:
    """An ordered sequence of stages that together make up the whole training run."""

    def __init__(self, stages: List["Stage"]):
        """
        stages: stages in the order they are experienced
        """
        self.stages = stages
        self.n_stages = len(stages)
        self.total_sample = sum(stage.stage_sample for stage in self.stages)

    def _stage_axes(self):
        """Create one subplot per stage and return them as a 1-D array of axes."""
        # squeeze=False keeps the result indexable even when there is only one stage.
        _, axes = plt.subplots(1, self.n_stages, figsize=(7*self.n_stages, 5), squeeze=False)
        return axes[0]

    def plot_corpus(self, scale_x=1000):
        """Plot the corpus opening progress in each stage

        scale_x: plot one point every `scale_x` samples
        """
        axes = self._stage_axes()

        for i, stage in enumerate(self.stages):
            n_points = int(stage.stage_sample/scale_x)
            for task in stage.tasks:
                y = [task.get_progress(step*scale_x) for step in range(n_points)]
                axes[i].plot(y, label=task.name)

            # Decorate each subplot once, after all of its curves are drawn.
            # The title is prefixed with the stage index.
            axes[i].legend(loc="lower right")
            axes[i].set_title(f'{i}: Corpus opening progression (%) in {stage.name}')
            axes[i].set_xlabel(f'sample x {scale_x}')
            axes[i].set_ylabel('Progress (%)')
            axes[i].set_ylim([0,101])

    def plot_task_probability(self, scale_x=1000):
        """Plot task probability in each stage

        scale_x: plot one point every `scale_x` samples
        """
        axes = self._stage_axes()

        for i, stage in enumerate(self.stages):
            n_points = int(stage.stage_sample/scale_x)
            # One probability vector (one entry per task) for every plotted point.
            ps = [stage.get_task_probability(step*scale_x) for step in range(n_points)]

            for j, task in enumerate(stage.tasks):
                axes[i].plot([p[j] for p in ps], label=task.name)

            axes[i].legend(loc="lower right")
            axes[i].set_title(f'{i}: Sampling probability in {stage.name}')
            axes[i].set_xlabel(f'sample x {scale_x}')
            axes[i].set_ylabel('Probability')
            axes[i].set_ylim([0,1])

    def get_stage(self, sample: int) -> Tuple["Stage", int]:
        """Get the current stage and sample_at_stage by no. sample ingested

        sample: total number of samples ingested so far
        Returns (stage, sample_at_stage):
            stage: Stage object that `sample` falls in
            sample_at_stage: the no. of sample counted from the start of that stage
        A sample that lands exactly on a stage boundary belongs to the earlier stage.

        Raises ValueError if `sample` lies beyond the last stage.
        """
        stage_start = 0

        for stage in self.stages:
            stage_end = stage_start + stage.stage_sample
            if sample <= stage_end:
                return stage, sample - stage_start
            stage_start = stage_end

        # Make running off the end explicit instead of returning None, which
        # callers would otherwise hit as an unpacking TypeError.
        raise ValueError(
            f"sample {sample} is beyond the end of the experience "
            f"(total_sample={self.total_sample})"
        )



In [None]:
# Full experience: oral -> transition -> reading, in that order.
exp = Experience([oral_stage, transition_stage, reading_stage])

In [None]:
# Visual check: corpus opening progress of every task, one subplot per stage.
exp.plot_corpus()

In [None]:
# Visual check: task sampling probabilities, one subplot per stage.
exp.plot_task_probability()

In [None]:

class Sampler:
    """v3 Sampler for modularized environment
    Features: Fully modularized environment staging

    Draws a task from the current stage of an Experience, then draws a batch of
    training items, weighted by compressed word frequency and restricted to the
    part of the corpus that the task's progress has opened so far.
    """

    def __init__(self, cfg, data, experience):
        """
        cfg: config object; the attributes read here are wf_clip_low,
            wf_clip_high, wf_compression, batch_size, n_timesteps,
            inject_error_ticks and rng_seed
        data: data object; this class reads data.df_train (columns `word`
            and `wf`) and data.np_representations
        experience: Experience object that maps a sample count to a stage
        """

        # Get necessary environment config from cfg object
        self.cfg = cfg
        self.wf_clip_low = cfg.wf_clip_low
        self.wf_clip_high = cfg.wf_clip_high
        self.wf_compression = cfg.wf_compression
        self.batch_size = cfg.batch_size
        self.n_timesteps = cfg.n_timesteps
        self.inject_error_ticks = cfg.inject_error_ticks

        # NOTE: seeds numpy's global random state, which Stage.draw_task and
        # generator() both draw from.
        np.random.seed(cfg.rng_seed)

        self.data = data
        self.current_sample = 0

        self.experience = experience

        # Basic convenient variables
        self._calculate_aux_variables()

    def _calculate_aux_variables(self):
        """Precompute the word-frequency rank and compressed frequency of every training item."""

        # Word frequency related: a missing clip bound means "no clipping" on that side
        if self.wf_clip_low is None:
            self.wf_clip_low = 0

        if self.wf_clip_high is None:
            self.wf_clip_high = 999999999

        clip_wf = self.data.df_train.wf.clip(self.wf_clip_low, self.wf_clip_high).copy()

        # Fractional rank in (0, 1]; ascending=False gives the most frequent
        # words the smallest values.
        self.rank_pct_wf = clip_wf.rank(pct=True, ascending=False)
        self.rank_pct_wf_dict = dict(zip(self.data.df_train.word, self.rank_pct_wf))

        assert self.wf_compression in ('log', 'root')
        self.compressed_wf = np.log(clip_wf + 1) if self.wf_compression == 'log' else np.sqrt(clip_wf)

    def wf_to_ps(self, wf):
        """convert squashed compressed word frequency to probability

        Returns wf normalised to sum to 1, as a float32 array.
        """
        return np.array(wf/np.sum(wf), dtype="float32")

    def sample_to_batch(self, sample):
        """Convert sample to batch in 0-indexing format"""
        return int(sample/self.batch_size)

    def get_sampling_p(self, task, stage_sample):
        """Get sampling probability for a task

        task: Task whose progress decides how much of the corpus is open
        stage_sample: number of samples counted from the start of the current stage
        Returns one probability per training item; items outside the opened
        part of the corpus get probability 0.
        """
        progress = task.get_progress(stage_sample)
        print(f"Current progress is: {progress}")

        # Create selection mask.
        # Task.get_progress reports a percentage (0-100) while rank_pct_wf holds
        # fractions (0-1), so bring progress onto the same scale before
        # comparing. `<=` makes 100 % progress open the whole corpus.
        # NOTE: if the mask selects nothing, the normalisation in wf_to_ps
        # divides by zero.
        mask = self.rank_pct_wf <= progress / 100
        return self.wf_to_ps(self.compressed_wf * mask)

    def generator(self):
        """ Generator that draw task and sample idx

        Yields (task name, idx, batch_x, batch_y) forever, advancing
        self.current_sample by batch_size after each batch.
        batch_x is the input representation repeated n_timesteps times.
        batch_y is the output representation repeated inject_error_ticks times,
        or a dict of such lists keyed by output name when the task has
        several outputs.
        """
        x_ticks = self.n_timesteps
        y_ticks = self.inject_error_ticks

        while True:
            stage, stage_sample = self.experience.get_stage(self.current_sample)
            task = stage.draw_task(stage_sample)
            idx = np.random.choice(self.data.df_train.index, self.batch_size, p=self.get_sampling_p(task, stage_sample))

            # Input / output representation names for this task
            x, y = modeling.IN_OUT[task.name]
            batch_x = [self.data.np_representations[x][idx]] * x_ticks

            # Check if multiple output or not
            if isinstance(y, list):
                batch_y = {yi: [self.data.np_representations[yi][idx]] * y_ticks for yi in y}
            else:
                batch_y = [self.data.np_representations[y][idx]] * y_ticks

            self.current_sample += self.batch_size
            yield task.name, idx, batch_x, batch_y
     


In [None]:
# Project-local modules. `modeling` must be imported before the sampler's
# generator runs, because Sampler.generator looks up modeling.IN_OUT.
import meta, data_wrangling, modeling
data = data_wrangling.MyData()
# Model configuration loaded from a saved JSON file.
cfg = meta.ModelConfig.from_json("./models/triangle_hope1/model_config.json")

In [None]:
# Build the sampler over the staged experience defined above.
sampler = Sampler(cfg, data, exp)

In [None]:
# Create the batch generator; nothing is drawn until next() is called.
g = sampler.generator()

In [None]:
# Smoke test: draw one batch -> (task name, idx, batch_x, batch_y).
next(g)