In [1]:
import sys
import os
import pandas as pd

notebook_dir = os.getcwd()
sys.path.append(os.path.join(notebook_dir, "..", "code"))
import utils

# Import DataFrames and preproc

In [2]:
df_im = pd.read_parquet("../sourcedata/clips_metadata_from_im.parquet")
df_base = pd.read_parquet("../sourcedata/clips_metadata_with_patterns.parquet")
df_im = df_im.rename(columns={"StateClipCode": "ClipCode"})

df_base['Cleared'] = df_base['Cleared'].map({'True': 1, 'False': 0})
df = pd.concat([df_base, df_im], ignore_index=True)
df['Scene'] = df['Scene'].astype(int)
df['Average_speed'] = df['Average_speed'].astype(float)
df['Duration'] = df['Duration'].astype(float)
df['X_Traveled'] = df['X_Traveled'].astype(float)

df_variables_hum = pd.read_parquet("../sourcedata/df_variables_hum.parquet")
df_variable_ppo= pd.read_parquet("../sourcedata/df_variables_ppo.parquet")

df_variables = pd.concat([df_variables_hum, df_variable_ppo], ignore_index=True)



### Complet DataFrames

In [3]:
mask_im = df['Model'].str.startswith('sub-0')
df.loc[mask_im, 'Learning_Phase'] = df.loc[mask_im, 'Model'].str[:]
df.loc[mask_im, 'Subject'] = "im_"+df.loc[mask_im, 'Model'].str[:6]
df.loc[mask_im, 'SceneFullName'] = df.loc[mask_im, 'World'].astype(str)+'-'+df.loc[mask_im, 'Level'].astype(str)+'-'+df.loc[mask_im, 'Scene'].astype(str)

mask_ppo = df['Model'].str.startswith('ep')
df.loc[mask_ppo, 'Learning_Phase'] = df.loc[mask_ppo, 'Model'].str[:]
df.loc[mask_ppo, 'Subject'] = "ppo"
df.loc[mask_ppo, 'Average_speed'] = df.loc[mask_ppo, 'X_Traveled'] / df.loc[mask_ppo, 'Duration']

df_variables['player_x_pos'] = df_variables['player_x_posHi']*255 + df_variables['player_x_posLo']
df_variables['Scene'] = df_variables['Scene'].astype(int)

### Keep scenes where every player has done at least one attempt

In [4]:
df_meta = df[df['Subject'].str.startswith('sub-')].groupby(['SceneFullName'])

full_scenes = []
scenes_to_drop = []
for scene, df_scene in df_meta:
    if df_scene['Subject'].nunique() < 5:
        scenes_to_drop.append(scene if isinstance(scene, str) else scene[0])
    else:
        full_scenes.append(scene if isinstance(scene, str) else scene[0])

df = df[~df['SceneFullName'].isin(scenes_to_drop)]
df_variables = df_variables[~df_variables['SceneFullName'].isin(scenes_to_drop)]
print(f"Number of full scenes: {df['SceneFullName'].nunique()}")

Number of full scenes: 290


In [13]:
def mean_delta_by_scene(df, metrics, sub_types):

    df_metrics = pd.DataFrame(index=df['SceneFullName'].sort_values().unique())

    for sub_type in sub_types:

        if sub_type == 'hum':
            df_meta = df[df['Model']=='human'].groupby(['SceneFullName'])
            order_phases = ["Early discovery", "Late discovery", "Early practice", "Late practice"]
            phase_to_idx = {p: i % 4 for i, p in enumerate(order_phases)}

        elif sub_type == 'ppo':
            df_meta = df[df['Model'].str.startswith('ep')].groupby(['SceneFullName'])
            order_phases = ['ep-20', 'ep-2000', 'ep-4000', 'ep-6000', 'ep-8000']
            phase_to_idx = {p: i % 5 for i, p in enumerate(order_phases)}

        elif sub_type == 'im':
            df_meta = df[df['Subject'].str.startswith('im')].groupby(['SceneFullName'])
            ppos_phases = [['sub-01_epoch=0-step=500', 'sub-01_epoch=0-step=2000', 'sub-01_epoch=0-step=3500', 'sub-01_epoch=0-step=5000', 'sub-01_epoch=0-step=6500'],
                            ['sub-02_epoch=0-step=500', 'sub-02_epoch=0-step=3000', 'sub-02_epoch=0-step=5500', 'sub-02_epoch=0-step=8000', 'sub-02_epoch=0-step=10000'],
                            ['sub-03_epoch=0-step=500', 'sub-03_epoch=0-step=4000', 'sub-03_epoch=0-step=7500', 'sub-03_epoch=1-step=11408', 'sub-03_epoch=1-step=14908'],
                            ['sub-05_epoch=0-step=500', 'sub-05_epoch=0-step=1500', 'sub-05_epoch=0-step=3000', 'sub-05_epoch=0-step=4000', 'sub-05_epoch=0-step=5000'],
                            ['sub-06_epoch=0-step=500', 'sub-06_epoch=0-step=2000', 'sub-06_epoch=0-step=4000', 'sub-06_epoch=0-step=5500', 'sub-06_epoch=0-step=7000']]
            phase_to_idx = {p: i % 5 for i, p in enumerate([phase for subset in ppos_phases for phase in subset])}

        columns = [f"delta_mean_{sub_type}_{metric}" for metric in metrics]
        indexs = list(df_meta.groups.keys())
        df_deltas_tot = pd.DataFrame(columns=columns, index=indexs)
        
        for metric in metrics:
            for scene, df_scene in df_meta:
                df_clearance = df_scene.groupby(["Subject", "Learning_Phase"])[metric].mean().reset_index(name=metric)
                df_delta_tot = utils.compute_delta_tot(df_clearance, phase_to_idx, metric)
                scene_name = scene[0]
                col = f"delta_mean_{sub_type}_{metric}"
                df_deltas_tot.loc[scene_name, col] = df_delta_tot['delta_tot'].mean().item()

        df_metrics = pd.concat([df_metrics, df_deltas_tot], axis=1)
    return df_metrics

def variance_by_scene_var(df, sub_types):
    
    df_metrics = pd.DataFrame(index=df['SceneFullName'].sort_values().unique())

    for sub_type in sub_types:
        if sub_type == 'hum':
            df_var = df_variables[df_variables['Subject'].str.startswith('sub-')].groupby(['SceneFullName'])
        elif sub_type == 'ppo':
            df_var = df_variables[df_variables['Subject'].str.startswith('ppo')].groupby(['SceneFullName'])
        elif sub_type == 'im':
            df_var = df_variables[df_variables['Subject'].str.startswith('im')].groupby(['SceneFullName'])
    
        col = [f"MAD_mean_{sub_type}"]
        indexs = list(df_var.groups.keys())
        df_deltas_tot = pd.DataFrame(columns=col, index=indexs)

        for scene, df_scene in df_var:
            df_delta_tot = utils.get_mads(df_scene).groupby('Subject')["MAD_mean"].mean().reset_index(name='MAD_mean')
            scene_name = scene[0]
            df_deltas_tot.loc[scene_name, col[0]] = df_delta_tot[f"MAD_mean"].mean().item()

        df_metrics = pd.concat([df_metrics, df_deltas_tot], axis=1)

    return df_metrics

In [14]:
df_meta_mean = mean_delta_by_scene(df, 
                              metrics=['Cleared', 'Average_speed'], 
                              sub_types=['hum', 'ppo', 'im'])
df_test = variance_by_scene_var(df_variables, 
                              sub_types=['hum', 'ppo', 'im'])
df_test.head()

Unnamed: 0,MAD_mean_hum,MAD_mean_ppo,MAD_mean_im
1-1-0,1.099498,0.598918,
1-1-1,9.668016,18.252898,
1-1-10,5.553712,7.66834,
1-1-11,4.566335,22.358091,
1-1-13,7.227495,8.968284,


## Top Mean MAD for Humans

In [None]:


df_deltas_tot = df_deltas_tot.sort_values('MAD_mean', ascending=False)
hum_top_mad = df_deltas_tot.index[:range].tolist()
#df_deltas_tot.head(5)

## Top Mean MAD for the PPO

In [None]:


df_deltas_tot = df_deltas_tot.sort_values('MAD_mean', ascending=False)
ppo_top_mad = df_deltas_tot.index[:range].tolist()
#df_deltas_tot.head(5)