# prepare

In [1]:
# packages
import sys
sys.path.append("/home/xuyue/mvit_clean/")
# print(sys.path)
import os
import os.path as osp
from collections import defaultdict, Counter
import multiprocessing as mproc
import tqdm
import json
import glob
import pickle
import numpy as np
import torch
import cv2
from PIL import Image
import matplotlib.pyplot as plt
# local code
from configs.defaults import get_cfg
from datasets.build import build_dataset

In [2]:
# data_dir = [
#     ["salads", "FiftySalads"],
#     ["egtea", "EgteaGaze"],
#     ["charades", "CharadesEgo"],
#     ["ego4d", "Ego4dAction"],
#     ["egoclip", "EgoClip_EgoMCQ"],

#     ["Epic", "/ssd/FAST_DATA/epic-kitchens"],
#     ["FiftySalads", "/hdd/DATA/50salad"],
#     ["Breakfast", "/hdd/DATA/breakfast"],
#     ["IKEA", "/hdd/DATA/ikea_asm_dataset_public"],
#     ["EgteaGaze", "/ssd/FAST_DATA/EGTEA_GAZE+"],
#     ["CharadesEgo", "/hdd/DATA/charades-ego"],
#     ["Ego4dAction", "/hdd/DATA/Ego4d/ego4d-fho/"],
#     ["SthElse", "/hdd/DATA/something-else/"],
#     ["EgoClip_EgoMCQ", "/hdd/DATA/Ego4d/ego4d-fho/"],
# ]

# Lookup table scanned by get_dataset_by_name. Each row is [key, value]:
#   * value starts with "/": `key` is the dataset name and `value` is its data root;
#   * otherwise:             `key` is an alias and `value` is the dataset name it maps to.
# The table is scanned once from top to bottom, so an alias row has to come
# before the row of the dataset it points to for the root to be picked up.
data_dir = [
    ["egtea", "EgteaGaze"],
    ["EgteaGaze", "/ssd/FAST_DATA/EGTEA_GAZE+"],
]

def get_dataset_by_name(ds_name, split, num_frame=8, frame_gap=24):
    """Build one split of a dataset identified by name or alias.

    `data_dir` is scanned once, case-insensitively. A matching alias row
    replaces `ds_name` by the dataset name it maps to; a matching directory
    row (value starting with "/") fixes the data root and the canonical
    dataset name. The scan deliberately does not stop at the first match so
    that an alias can be resolved and then matched against its directory row.

    Args:
        ds_name: dataset name or alias listed in `data_dir`.
        split: split identifier forwarded to `build_dataset`.
        num_frame: value written to `DATA.NUM_FRAMES`.
        frame_gap: value written to `DATA.FPS`.

    Returns:
        Whatever `build_dataset` returns for the resolved name, config and split.
    """
    root = None
    for key, target in data_dir:
        if key.lower() != ds_name.lower():
            continue
        if target[0] == "/":  # directory row: remember root and canonical name
            root, ds_name = target, key
        else:  # alias row: keep scanning with the resolved name
            ds_name = target

    cfg = get_cfg()
    overrides = [
        "TRAIN.DATASET", ds_name,
        "TEST.DATASET", ds_name,
        "DATA.PATH_TO_DATA_DIR", root,
        "DATA.NUM_FRAMES", str(num_frame),
        "DATA.FPS", str(frame_gap),
    ]
    cfg.merge_from_list(overrides)
    return build_dataset(ds_name, cfg, split)

# Read dataset

In [3]:
# Datasets to load; re-enable an entry here (and its rows in data_dir) to add it.
enabled_names = [
    # "Epic",
    # "FiftySalads",
    # "Breakfast",
    # "IKEA",
    "EgteaGaze",
    # "CharadesEgo",
    # "Ego4dAction",
    # "SthElse",
]

# Training split of every enabled dataset, keyed by dataset name.
train_datasets = {}
for name in enabled_names:
    print("loading", name)
    dset = get_dataset_by_name(name, "train")
    train_datasets[name] = dset
    print("length =", len(dset))

loading EgteaGaze
length = 8300


# Count

In [4]:
# idx_by_verb[dataset][verb] -> positions in dset.video_records carrying that verb.
idx_by_verb = {}

for name, dset in train_datasets.items():
    records_per_verb = defaultdict(list)
    for position, record in enumerate(dset.video_records):
        label = record.verb_str
        # A list-valued verb is collapsed into one comma-joined key.
        key = ",".join(label) if type(label) == list else label
        records_per_verb[key].append(position)
    idx_by_verb[name] = records_per_verb
    print(name, len(records_per_verb))

# print(idx_by_verb)

EgteaGaze 106


# BERT verb embeddings

In [5]:
from pytorch_pretrained_bert import BertTokenizer, BertModel
import os
import torch
import tqdm
from collections import defaultdict


# Restrict this process to a single GPU and record where the BERT weights live.
# NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set before CUDA
# is first initialised in this kernel — confirm no earlier cell touches the GPU.
os.environ['CUDA_VISIBLE_DEVICES']='3'
os.environ['BERT_BASE_DIR']='/home/xuyue/pretrain_models/'

# Use the GPU when one is visible, otherwise fall back to CPU instead of
# crashing on `.to("cuda")` (e.g. on a machine without a device at index 3).
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Weights are read from a local archive rather than the 'bert-base-uncased' shortcut name.
bert_model = BertModel.from_pretrained('/home/xuyue/pretrain_models/bert-base-uncased.tar.gz')
bert_model.eval().to(device)

print("loaded")

def extract_bert_feature(sentence):
    """Embed `sentence` with the module-level BERT model.

    The sentence is wrapped as "[CLS] <sentence> [SEP] ", tokenized with the
    module-level `tokenizer`, and run through `bert_model` as a batch of one
    with all segment ids set to 0 (single-sentence input).

    Args:
        sentence: text to embed.

    Returns:
        A 1-D tensor on `device`: the vector at token position 0 of the first
        entry in the list of per-layer outputs.
    """
    wordpieces = tokenizer.tokenize(f"[CLS] {sentence} [SEP] ")
    vocab_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # Batch of one; every token belongs to segment 0.
    token_batch = torch.tensor([vocab_ids]).to(device)
    segment_batch = torch.tensor([[0] * len(vocab_ids)]).to(device)

    with torch.no_grad():
        layer_outputs, _ = bert_model(token_batch, segment_batch)
    # One hidden-state tensor per layer is expected for bert-base-uncased.
    assert len(layer_outputs) == 12

    # NOTE(review): index 0 picks the FIRST entry of the layer list, not the
    # last one — confirm this is the intended layer for the [CLS] embedding.
    chosen_layer = layer_outputs[0]
    return chosen_layer[0, 0, :]   # [1, n_token, dim] -> [dim], token 0 ([CLS])

# Smoke test: embed a short sentence and show the size of the returned vector.
feat = extract_bert_feature("hello world")
print(feat.size())

loaded
torch.Size([768])


In [6]:
# verb_stat[dataset][verb] -> {"count": number of records, "bert": embedding as a numpy array}
verb_stat = {}

for name, cnt in idx_by_verb.items():
    print(name)
    verb_stat[name] = {
        verb: {
            "count": len(idx),
            "bert": extract_bert_feature(verb).cpu().numpy(),
        }
        for verb, idx in cnt.items()
    }

EgteaGaze


# Semantic distance

In [7]:
import numpy as np
import math
import json
from tqdm import tqdm
from scipy.stats import scoreatpercentile
from copy import deepcopy
import os

def _select_sigma(x, percentile=25):
    """Return a spread estimate for the 1-D sample `x`.

    The estimate is the smaller of the sample standard deviation (ddof=1) and
    the interquartile range divided by 1.349; when the scaled IQR is not
    positive, the standard deviation alone is returned.

    Args:
        x: 1-D sequence of numbers.
        percentile: accepted for signature compatibility; not used — the
            25th/75th percentiles are always taken.
    """
    iqr_scale = 1.349
    upper_quartile = scoreatpercentile(x, 75)
    lower_quartile = scoreatpercentile(x, 25)
    scaled_iqr = (upper_quartile - lower_quartile) / iqr_scale
    sample_std = np.std(x, axis=0, ddof=1)
    return np.minimum(sample_std, scaled_iqr) if scaled_iqr > 0 else sample_std

def bw_scott(x, kernel=None):
    """Bandwidth for the 1-D sample `x`: 1.059 * spread * n^(-1/5).

    `spread` comes from `_select_sigma`; `kernel` is accepted but not used.
    """
    spread = _select_sigma(x)
    return 1.059 * spread * len(x) ** (-0.2)

def bw_silverman(x, kernel=None):
    """Bandwidth for the 1-D sample `x`: 0.9 * spread * n^(-1/5).

    `spread` comes from `_select_sigma`; `kernel` is accepted but not used.
    """
    spread = _select_sigma(x)
    return 0.9 * spread * len(x) ** (-0.2)

def logsumexp(x, axis=-1):
    """Compute log(sum(exp(x))) along `axis` without overflow.

    The maximum along `axis` is subtracted before exponentiating. When that
    maximum is not finite (e.g. every entry is -inf) the shift is replaced by
    0, so the result is -inf / +inf / nan as appropriate instead of the NaN
    that `-inf - (-inf)` would produce.

    Args:
        x: array-like of log-values.
        axis: axis to reduce over.

    Returns:
        Array with `axis` removed (a numpy scalar for 1-D input).
    """
    mx = np.max(x, axis=axis)
    # Shifting by a non-finite maximum would yield inf - inf = nan.
    shift = np.where(np.isfinite(mx), mx, 0.0)
    with np.errstate(divide="ignore"):  # log(0) -> -inf is the intended result
        return np.log(np.sum(np.exp(x - np.expand_dims(shift, axis)), axis)) + shift

class KdeDensity(object):
    """Weighted Gaussian kernel density estimate with a per-dimension bandwidth.

    Args:
        y: base samples, array-like of shape (n_base_sample, dim).
        weight: optional per-sample weights (any array-like of length
            n_base_sample); normalised to sum to 1. Defaults to uniform.
        sigma: bandwidth. Either the name of a rule ("scott" or "silverman",
            applied independently to every dimension), a scalar (Python or
            numpy) used for every dimension, or an array-like with one value
            per dimension.

    Attributes:
        y, weight, sigma: the normalised inputs as numpy arrays.
        log_const: sum over dimensions of log(sigma * sqrt(2*pi)).
    """

    def __init__(self, y, weight=None, sigma="scott"):
        y = np.array(y)

        if weight is None:
            weight = np.ones((y.shape[0],))
        else:
            # asarray also covers tuples and integer arrays, not just lists.
            weight = np.asarray(weight, dtype=float)
        weight = weight * 1.0 / weight.sum()

        if isinstance(sigma, str):
            bw_func = {
                "scott": bw_scott,
                "silverman": bw_silverman,
            }[sigma]

            sigma = np.array([bw_func(y[:, i].tolist()) for i in range(y.shape[1])])
        elif np.ndim(sigma) == 0:
            # Any scalar (int, float, numpy scalar) is broadcast to every dimension.
            sigma = np.ones((y.shape[1],)) * sigma
        else:
            sigma = np.asarray(sigma, dtype=float)

        self.y = y
        self.weight = weight
        self.sigma = sigma
        self.log_const = np.log(sigma * math.sqrt(2*math.pi)).sum()

    def __logsumexp(self, x, axis=-1):
        """log(sum(exp(x))) along `axis`, shifted by the maximum for stability."""
        mx = np.max(x, axis=axis)
        # A non-finite maximum would turn the shift into inf - inf = nan.
        shift = np.where(np.isfinite(mx), mx, 0.0)
        with np.errstate(divide="ignore"):
            return np.log(np.sum(np.exp(x - np.expand_dims(shift, axis)), axis)) + shift

    def __call__(self, x):
        """Estimate the log-likelihood of each query point.

        Args:
            x: query points, array-like of shape (n_query, dim).

        Returns:
            Array of shape (n_query,) with the log density at each query.
        """
        x = np.array(x)
        dist = x[:, np.newaxis, :] - self.y[np.newaxis, :, :] # n_query, n_base_sample, dim
        dist = dist / self.sigma[np.newaxis, np.newaxis, :]
        with np.errstate(divide="ignore"):  # zero-weight samples contribute log(0) = -inf
            log_weight = np.log(self.weight)
        dist = log_weight[np.newaxis, :] - 0.5 * np.square(dist).sum(-1)
        dist = self.__logsumexp(dist, axis=1) - self.log_const
        return dist



# EGTEA: drop tail classes (keep the highest-density verbs)

In [8]:
# Rank verb classes by their log-density under a count-weighted KDE fitted on
# the BERT verb embeddings, then keep whole classes from the highest density
# downwards until (100 - percent)% of the samples are selected. The classes
# that are left out are therefore the lowest-density ("tail") ones.

# Reference distribution the KDE is fitted on.
datasetA = [
    {"verb": v, "count": x["count"], "bert": x["bert"]}
    for v, x in verb_stat["EgteaGaze"].items()
]

# Candidate classes that samples are drawn from.
datasetB = [
    {"verb": v, "count": x["count"], "bert": x["bert"], "dataset": name}
    for name, stat in verb_stat.items() if name in ["EgteaGaze"]
    for v, x in stat.items()
]

print(len(datasetA))

bert_a = np.array([x['bert'] for x in datasetA])
count_a = np.array([x['count'] for x in datasetA], dtype=float)

# The ranking below indexes into datasetB, so the scored features must come
# from datasetB as well (they were previously taken from datasetA).
bert_b = np.array([x['bert'] for x in datasetB])
count_b = np.array([x['count'] for x in datasetB], dtype=float)

kde = KdeDensity(bert_a, weight=count_a, sigma=0.5)
logp_b = kde(bert_b)
# Highest log-density first.
cls_id_ordered = sorted(range(len(logp_b)), key=lambda i: logp_b[i], reverse=True)

for percent in [5, 10, 20]:
    print(percent)

    num_to_select = int(np.sum(count_a)*0.01*(100-percent))

    # Fixed seed per output file so that re-running the cell reproduces it.
    rng = np.random.RandomState(0)
    selected_sample = []
    num_per_ds = defaultdict(int)

    # Iterating the ranking directly also stops cleanly if the candidates run
    # out before the quota is met.
    for cls_id in cls_id_ordered:
        remaining = num_to_select - len(selected_sample)
        if remaining <= 0:
            break

        dsname = datasetB[cls_id]["dataset"]
        verb = datasetB[cls_id]["verb"]
        all_indices = idx_by_verb[dsname][verb]

        num = min(datasetB[cls_id]["count"], remaining)
        num_per_ds[dsname] += num
        # permutation() returns a shuffled copy, leaving idx_by_verb untouched;
        # tolist() turns numpy integers into plain ints for json.dump.
        idx = rng.permutation(all_indices)[:num].tolist()
        selected_sample += [[dsname, x] for x in idx]

    path = f"./reduction/Egtea_reduction_{percent}p.json"
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, "w") as fp:
        json.dump(selected_sample, fp)
    print(len(selected_sample), [(k,v) for k,v in num_per_ds.items()])

106
5
7885 [('EgteaGaze', 7885)]
10
7470 [('EgteaGaze', 7470)]
20
6640 [('EgteaGaze', 6640)]


# EGTEA: drop head classes (keep the lowest-density verbs)

In [9]:
# Rank verb classes by their log-density under a count-weighted KDE fitted on
# the BERT verb embeddings, then keep whole classes from the lowest density
# upwards until (100 - percent)% of the samples are selected. The classes
# that are left out are therefore the highest-density ("head") ones.

# Reference distribution the KDE is fitted on.
datasetA = [
    {"verb": v, "count": x["count"], "bert": x["bert"]}
    for v, x in verb_stat["EgteaGaze"].items()
]

# Candidate classes that samples are drawn from.
datasetB = [
    {"verb": v, "count": x["count"], "bert": x["bert"], "dataset": name}
    for name, stat in verb_stat.items() if name in ["EgteaGaze"]
    for v, x in stat.items()
]

print(len(datasetA))

bert_a = np.array([x['bert'] for x in datasetA])
count_a = np.array([x['count'] for x in datasetA], dtype=float)

# The ranking below indexes into datasetB, so the scored features must come
# from datasetB as well (they were previously taken from datasetA).
bert_b = np.array([x['bert'] for x in datasetB])
count_b = np.array([x['count'] for x in datasetB], dtype=float)

kde = KdeDensity(bert_a, weight=count_a, sigma=0.5)
logp_b = kde(bert_b)
# Lowest log-density first.
cls_id_ordered = sorted(range(len(logp_b)), key=lambda i: logp_b[i], reverse=False)

for percent in [5, 10, 20]:
    print(percent)

    num_to_select = int(np.sum(count_a)*0.01*(100-percent))

    # Fixed seed per output file so that re-running the cell reproduces it.
    rng = np.random.RandomState(0)
    selected_sample = []
    num_per_ds = defaultdict(int)

    # Iterating the ranking directly also stops cleanly if the candidates run
    # out before the quota is met.
    for cls_id in cls_id_ordered:
        remaining = num_to_select - len(selected_sample)
        if remaining <= 0:
            break

        dsname = datasetB[cls_id]["dataset"]
        verb = datasetB[cls_id]["verb"]
        all_indices = idx_by_verb[dsname][verb]

        num = min(datasetB[cls_id]["count"], remaining)
        num_per_ds[dsname] += num
        # permutation() returns a shuffled copy, leaving idx_by_verb untouched;
        # tolist() turns numpy integers into plain ints for json.dump.
        idx = rng.permutation(all_indices)[:num].tolist()
        selected_sample += [[dsname, x] for x in idx]

    path = f"./reduction/Egtea_reduction_{percent}p_reverse.json"
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, "w") as fp:
        json.dump(selected_sample, fp)
    print(len(selected_sample), [(k,v) for k,v in num_per_ds.items()])

106
5
7885 [('EgteaGaze', 7885)]
10
7470 [('EgteaGaze', 7470)]
20
6640 [('EgteaGaze', 6640)]
