In [1]:
import sys
import os
import pandas as pd

# Make the sibling "code" directory importable so the project-local `utils` module resolves.
# NOTE(review): this relies on the kernel's working directory being the notebook's own folder — confirm.
notebook_dir = os.getcwd()
sys.path.append(os.path.join(notebook_dir, "..", "code"))
import utils

# Import DataFrames and preprocess them

In [None]:
# Clip-level metadata: one file for the imitation-model clips, one for the base clips.
df_im = pd.read_parquet("../sourcedata/clips_metadata_from_im.parquet")
df_base = pd.read_parquet("../sourcedata/clips_metadata_with_patterns.parquet")
# Presumably renamed so the clip identifier column matches df_base before concatenating — confirm.
df_im = df_im.rename(columns={"StateClipCode": "ClipCode"})

# NOTE(review): this mapping only recognises the strings 'True'/'False'; any other value
# (including real booleans) becomes NaN — confirm against the parquet schema.
df_base['Cleared'] = df_base['Cleared'].map({'True': 1, 'False': 0})
df = pd.concat([df_base, df_im], ignore_index=True)
# Enforce numeric dtypes on the columns used in later computations.
df = df.astype({
    'Scene': int,
    'Average_speed': float,
    'Duration': float,
    'X_Traveled': float,
})

# Frame-level game variables for the human and PPO runs, stacked into one frame.
df_variables_hum = pd.read_parquet("../sourcedata/df_variables_hum.parquet")
df_variable_ppo = pd.read_parquet("../sourcedata/df_variables_ppo.parquet")

df_variables = pd.concat([df_variables_hum, df_variable_ppo], ignore_index=True)

# `read_parquet` is a pandas module-level function, not a DataFrame method:
# `df.read_parquet(...)` raises AttributeError.
# NOTE(review): this rebinds `df`, replacing the concatenated frame built above —
# confirm that df_metrics.parquet is the intended source for the rest of the notebook.
df = pd.read_parquet("../sourcedata/df_metrics.parquet")


### Complete the DataFrames

In [3]:
# Imitation-model rows: the Model string starts with 'sub-0'.
mask_im = df['Model'].str.startswith('sub-0')
# The full Model string doubles as the learning-phase label.
df.loc[mask_im, 'Learning_Phase'] = df.loc[mask_im, 'Model']
# First six characters of Model (e.g. 'sub-01'), prefixed with 'im_'.
df.loc[mask_im, 'Subject'] = "im_" + df.loc[mask_im, 'Model'].str[:6]
df.loc[mask_im, 'SceneFullName'] = (
    df.loc[mask_im, 'World'].astype(str)
    + '-' + df.loc[mask_im, 'Level'].astype(str)
    + '-' + df.loc[mask_im, 'Scene'].astype(str)
)

# PPO rows: the Model string starts with 'ep'.
mask_ppo = df['Model'].str.startswith('ep')
df.loc[mask_ppo, 'Learning_Phase'] = df.loc[mask_ppo, 'Model']
df.loc[mask_ppo, 'Subject'] = "ppo"
# Average speed for PPO rows is derived as distance travelled over duration.
df.loc[mask_ppo, 'Average_speed'] = df.loc[mask_ppo, 'X_Traveled'] / df.loc[mask_ppo, 'Duration']

# Combine the high/low position bytes into one coordinate. A high byte has a weight
# of 256 (not 255); multiplying by 255 shifts every position by `player_x_posHi` units.
df_variables['player_x_pos'] = df_variables['player_x_posHi'] * 256 + df_variables['player_x_posLo']
df_variables['Scene'] = df_variables['Scene'].astype(int)

### Keep scenes where every player has done at least one attempt

In [4]:
# Group the rows whose Subject starts with 'sub-' by scene.
df_meta = df[df['Subject'].str.startswith('sub-')].groupby(['SceneFullName'])

# Number of distinct subjects seen in each scene (indexed by scene name, sorted).
subjects_per_scene = df_meta['Subject'].nunique()
is_incomplete = subjects_per_scene < 5

# Scenes covered by fewer than 5 subjects are dropped; the others are kept as "full".
scenes_to_drop = subjects_per_scene.index[is_incomplete].tolist()
full_scenes = subjects_per_scene.index[~is_incomplete].tolist()

# Apply the same scene filter to both frames.
keep_rows_df = ~df['SceneFullName'].isin(scenes_to_drop)
df = df[keep_rows_df]
keep_rows_variables = ~df_variables['SceneFullName'].isin(scenes_to_drop)
df_variables = df_variables[keep_rows_variables]
print(f"Number of full scenes: {df['SceneFullName'].nunique()}")

Number of full scenes: 290


In [5]:
def mean_delta_by_scene(df, metrics, sub_types):
    """Average, per scene, the total learning delta of each metric for each agent type.

    For every requested agent type the matching rows of ``df`` are grouped by
    ``SceneFullName``. For each scene and metric, the metric is averaged per
    (``Subject``, ``Learning_Phase``), handed to ``utils.compute_delta_tot``
    together with the phase-to-index mapping of that agent type, and the mean of
    the returned ``delta_tot`` column is stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SceneFullName``, ``Model``, ``Subject``, ``Learning_Phase``
        and every column named in ``metrics``.
    metrics : list of str
        Metric columns to process.
    sub_types : list of str
        Agent types to process; each must be ``'hum'``, ``'ppo'`` or ``'im'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by scene name, with one ``delta_mean_{sub_type}_{metric}`` column per
        combination. Scenes without rows for an agent type hold NaN in its columns.

    Raises
    ------
    ValueError
        If ``sub_types`` contains an unknown agent type.
    """
    # Checkpoint names used as learning phases, one list per imitation-model subject.
    im_phases_by_subject = [
        ['sub-01_epoch=0-step=500', 'sub-01_epoch=0-step=2000', 'sub-01_epoch=0-step=3500', 'sub-01_epoch=0-step=5000', 'sub-01_epoch=0-step=6500'],
        ['sub-02_epoch=0-step=500', 'sub-02_epoch=0-step=3000', 'sub-02_epoch=0-step=5500', 'sub-02_epoch=0-step=8000', 'sub-02_epoch=0-step=10000'],
        ['sub-03_epoch=0-step=500', 'sub-03_epoch=0-step=4000', 'sub-03_epoch=0-step=7500', 'sub-03_epoch=1-step=11408', 'sub-03_epoch=1-step=14908'],
        ['sub-05_epoch=0-step=500', 'sub-05_epoch=0-step=1500', 'sub-05_epoch=0-step=3000', 'sub-05_epoch=0-step=4000', 'sub-05_epoch=0-step=5000'],
        ['sub-06_epoch=0-step=500', 'sub-06_epoch=0-step=2000', 'sub-06_epoch=0-step=4000', 'sub-06_epoch=0-step=5500', 'sub-06_epoch=0-step=7000'],
    ]

    df_metrics = pd.DataFrame(index=df['SceneFullName'].sort_values().unique())

    for sub_type in sub_types:

        if sub_type == 'hum':
            rows = df['Model'] == 'human'
            order_phases = ["Early discovery", "Late discovery", "Early practice", "Late practice"]
            phase_to_idx = {phase: idx for idx, phase in enumerate(order_phases)}

        elif sub_type == 'ppo':
            rows = df['Model'].str.startswith('ep', na=False)
            order_phases = ['ep-20', 'ep-2000', 'ep-4000', 'ep-6000', 'ep-8000']
            phase_to_idx = {phase: idx for idx, phase in enumerate(order_phases)}

        elif sub_type == 'im':
            rows = df['Subject'].str.startswith('im', na=False)
            # Each checkpoint is indexed by its position within its own subject's list.
            phase_to_idx = {
                phase: idx
                for subject_phases in im_phases_by_subject
                for idx, phase in enumerate(subject_phases)
            }

        else:
            # Without this guard an unknown type would reuse the previous iteration's
            # grouping (or fail on an unbound name on the first iteration).
            raise ValueError(f"Unknown sub_type {sub_type!r}; expected 'hum', 'ppo' or 'im'")

        # Grouping by the scalar key (not a one-element list) makes iteration yield the
        # scene name itself on every pandas version, so it never has to be unpacked with
        # `scene[0]` (which truncates the name to one character when the key is a string).
        df_meta = df[rows].groupby('SceneFullName')

        columns = [f"delta_mean_{sub_type}_{metric}" for metric in metrics]
        df_deltas_tot = pd.DataFrame(columns=columns, index=list(df_meta.groups.keys()))

        for metric in metrics:
            col = f"delta_mean_{sub_type}_{metric}"
            for scene_name, df_scene in df_meta:
                df_clearance = df_scene.groupby(["Subject", "Learning_Phase"])[metric].mean().reset_index(name=metric)
                df_delta_tot = utils.compute_delta_tot(df_clearance, phase_to_idx, metric)
                df_deltas_tot.loc[scene_name, col] = float(df_delta_tot['delta_tot'].mean())

        df_metrics = pd.concat([df_metrics, df_deltas_tot], axis=1)
    return df_metrics

def variance_by_scene_var(df, sub_types):
    """Average, per scene, the ``MAD_mean`` values of each agent type.

    For every requested agent type the rows of ``df`` whose ``Subject`` starts with
    that type's prefix are grouped by ``SceneFullName``. Each scene's rows are passed
    to ``utils.get_mads``; the returned ``MAD_mean`` values are averaged per subject
    and then across subjects.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SceneFullName`` and ``Subject``, plus whatever
        ``utils.get_mads`` reads.
    sub_types : list of str
        Agent types to process; each must be ``'hum'``, ``'ppo'`` or ``'im'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by scene name, with one ``MAD_mean_{sub_type}`` column per agent
        type. Scenes without rows for an agent type hold NaN in its column.

    Raises
    ------
    ValueError
        If ``sub_types`` contains an unknown agent type.
    """
    # Prefix of the Subject label that identifies each agent type.
    subject_prefixes = {'hum': 'sub-', 'ppo': 'ppo', 'im': 'im'}

    df_metrics = pd.DataFrame(index=df['SceneFullName'].sort_values().unique())

    for sub_type in sub_types:
        if sub_type not in subject_prefixes:
            raise ValueError(f"Unknown sub_type {sub_type!r}; expected 'hum', 'ppo' or 'im'")

        # Use the frame passed in, not a notebook-level global, so the function
        # works on whatever data the caller supplies.
        rows = df['Subject'].str.startswith(subject_prefixes[sub_type], na=False)
        # Scalar group key: iteration yields the scene name itself on every pandas version.
        df_var = df[rows].groupby('SceneFullName')

        col = f"MAD_mean_{sub_type}"
        df_deltas_tot = pd.DataFrame(columns=[col], index=list(df_var.groups.keys()))

        for scene_name, df_scene in df_var:
            mad_by_subject = utils.get_mads(df_scene).groupby('Subject')["MAD_mean"].mean()
            df_deltas_tot.loc[scene_name, col] = float(mad_by_subject.mean())

        df_metrics = pd.concat([df_metrics, df_deltas_tot], axis=1)

    return df_metrics

In [None]:
# Per-scene learning deltas (clip metadata) for the three agent types.
df_meta_mean = mean_delta_by_scene(
    df,
    metrics=['Cleared', 'Average_speed'],
    sub_types=['hum', 'ppo', 'im'],
)
# Per-scene MAD summary (frame-level variables) for the same agent types.
df_test = variance_by_scene_var(df_variables, sub_types=['hum', 'ppo', 'im'])
# NOTE(review): the saved output under this cell lists MAD_mean_* columns, which does not
# match this expression — the cell looks like it needs a re-run.
df_meta_mean.head(10)

Unnamed: 0,MAD_mean_hum,MAD_mean_ppo,MAD_mean_im
1-1-0,1.099498,0.598918,
1-1-1,9.668016,18.252898,
1-1-10,5.553712,7.66834,
1-1-11,4.566335,22.358091,
1-1-13,7.227495,8.968284,


In [7]:
# Put the delta columns and the MAD columns side by side, aligned on the scene index.
df_total = pd.concat((df_meta_mean, df_test), axis="columns")
df_total.head()

Unnamed: 0,delta_mean_hum_Cleared,delta_mean_hum_Average_speed,delta_mean_ppo_Cleared,delta_mean_ppo_Average_speed,delta_mean_im_Cleared,delta_mean_im_Average_speed,MAD_mean_hum,MAD_mean_ppo,MAD_mean_im
1-1-0,0.0,-0.400554,0.0,14.225168,-0.767514,-42.861067,1.099498,0.598918,
1-1-1,0.023333,5.380366,0.341317,50.466879,-0.120981,-3.099172,9.668016,18.252898,
1-1-10,0.0,15.512265,-0.025,18.316488,0.195431,6.402998,5.553712,7.66834,
1-1-11,0.013333,27.685205,0.058442,21.99656,0.229667,32.466397,4.566335,22.358091,
1-1-13,0.166667,9.901358,0.120805,25.013137,0.048643,-8.13141,7.227495,8.968284,


In [10]:
# Number of scenes kept at each extreme of a ranking.
# NOTE(review): 58 is presumably 20% of the 290 retained scenes — confirm.
N_EXTREME_SCENES = 58


def _extreme_scenes(frame, column, ascending, n=N_EXTREME_SCENES):
    """Return the index labels of the first ``n`` rows of ``frame`` sorted by ``column``.

    ``ascending=False`` selects the largest values and ``ascending=True`` the smallest.
    The result is a set, so the ranking order itself is not retained.
    """
    return set(frame.sort_values(by=column, ascending=ascending).index[:n])


# Scenes with the largest clearance delta for each agent type.
best_s_hum_clr = _extreme_scenes(df_total, 'delta_mean_hum_Cleared', ascending=False)
best_s_ppo_clr = _extreme_scenes(df_total, 'delta_mean_ppo_Cleared', ascending=False)
best_s_im_clr = _extreme_scenes(df_total, 'delta_mean_im_Cleared', ascending=False)
# Scenes with the smallest clearance delta.
worst_s_hum_clr = _extreme_scenes(df_total, 'delta_mean_hum_Cleared', ascending=True)
worst_s_ppo_clr = _extreme_scenes(df_total, 'delta_mean_ppo_Cleared', ascending=True)
# Scenes with the smallest human MAD.
best_s_hum_mad = _extreme_scenes(df_total, 'MAD_mean_hum', ascending=True)


In [9]:
# `best_s_hum_clr` is a set, so the printed order carries no ranking information.
print("Best scenes by human clearance:", best_s_hum_clr)

Best scenes by human clearance: {'4-1-7', '5-2-6', '8-1-17', '4-3-2', '4-1-1', '2-3-12', '5-2-12', '5-2-14', '1-1-9', '5-3-3', '8-3-4', '6-3-8', '3-1-7', '5-2-4', '1-3-4', '1-3-5', '6-1-5', '4-3-4', '1-1-13', '1-3-6', '2-3-11', '5-1-4', '7-3-11', '3-2-9', '8-2-7', '6-2-11', '3-2-3', '5-2-10', '5-2-2', '2-3-10', '5-3-7', '7-1-4', '2-3-8', '2-1-13', '8-1-16', '6-3-3', '1-3-3', '3-3-2', '6-3-4', '6-3-7', '6-1-10', '2-3-7', '3-3-3', '3-1-8', '3-1-1', '7-3-12', '4-3-3', '6-2-9', '6-2-13', '8-2-12', '8-2-13', '6-2-12', '3-2-5', '5-1-12', '5-3-6', '4-3-1', '6-3-9', '5-2-3'}


In [11]:
# `best_s_hum_mad` holds the scenes with the smallest human MAD, so the label must say
# MAD rather than clearance. It is a set: the printed order carries no ranking information.
print("Best scenes by human MAD (lowest):", best_s_hum_mad)

Best scenes by human clearance: {'4-1-0', '3-1-0', '5-3-8', '1-3-0', '2-3-6', '2-3-12', '8-1-14', '8-3-8', '5-1-9', '5-3-5', '1-1-0', '4-3-0', '8-1-3', '8-2-0', '6-1-6', '5-2-0', '6-1-9', '4-2-2', '5-1-0', '4-2-16', '5-3-0', '8-1-24', '8-3-9', '8-1-10', '1-3-1', '2-3-10', '2-3-3', '5-1-5', '4-2-1', '4-1-9', '8-1-22', '3-3-9', '8-1-25', '3-2-8', '6-1-0', '2-1-0', '8-3-0', '1-3-8', '7-1-0', '4-1-2', '2-3-2', '8-1-16', '3-2-0', '8-3-2', '6-3-0', '8-1-21', '7-3-1', '8-3-10', '6-2-0', '7-1-10', '3-3-0', '2-3-0', '2-3-1', '7-3-12', '8-1-0', '5-1-2', '7-3-0', '7-1-6'}
