# Submit for AU

## imports & load data

In [1]:
import os
import pickle
from tqdm import tqdm
from copy import deepcopy

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset

In [2]:
data_dir = 'fea_notebooks/features_newvf2.pickle'
test_dir = '/home/HDD6TB/datasets/emotions/ABAW/ABAW_5/VA_AU_FER/test_set/CVPR_5th_ABAW_AU_test_set_sample.txt'
cropped_data = '/home/avsavchenko/src/emotions-multimodal/faces/ABAW/abaw5/enet_b0_8_best_vgaf_cropped.pickle'

In [3]:
with open(data_dir, 'rb') as handle:
    data=pickle.load(handle)
print(len(data))

2941546


In [4]:
print(f"Torch: {torch.__version__}")
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

Torch: 2.0.1+cu118
cuda:0


## load model & dataloader

In [5]:
from tcn import TemporalConvNet
from trans_encoder import TransEncoder


class Model(nn.Module):
    def __init__(self, modality=['frames', 'w2v2large', 'openl3', 'w2v2hub'],
                 embedding_dim={'frames': 1280, 'w2v2large': 1024, 'openl3': 512,
                                'w2v2hub': 256},
                 tcn_channel={
                     'frames': [1280, 512, 256, 128],
                     'w2v2large': [1024, 512, 256, 128],
                     'openl3': [512, 256, 128],
                     'w2v2hub': [256, 128]
    }):
        super(Model, self).__init__()
        self.modality = modality

        self.temporal, self.fusion = nn.ModuleDict(), None

        for modal in self.modality:
            self.temporal[modal] = TemporalConvNet(num_inputs=embedding_dim[modal],
                                                   num_channels=tcn_channel[modal], dropout=0.3, attention=False)

        conv_dim = 0
        for m in self.modality:
            conv_dim += tcn_channel[m][-1]
            
        self.encoder = TransEncoder(
            inc=conv_dim, outc=256, dropout=0.3, nheads=4, 
            nlayer=8)
            
        self.head = nn.Sequential(
            nn.Linear(256, 256//2),
            nn.BatchNorm1d(256//2),
            nn.Linear(256//2, 12),
        )

    def forward(self, x):
        
        bs, seq_len, _ = x[self.modality[0]].shape
#         print(bs, seq_len)
        for m in self.modality:
            x[m] = x[m].transpose(1, 2)
            x[m] = self.temporal[m](x[m])

        feat_list = []
        for m in self.modality:
            feat_list.append(x[m])
        out = torch.cat(feat_list, dim=1)
        out = self.encoder(out)

        out = torch.transpose(out, 1, 0)
        out = torch.reshape(out, (bs*seq_len, -1))
#         print(out.shape)

        out = self.head(out)
        return F.sigmoid(out)

model = Model(modality=['frames'], embedding_dim={'frames': 1280},
              tcn_channel={
                     'frames': [1280, 512, 256, 128]})
model.to(device);
model.load_state_dict(torch.load('fau_ovid_52.pt'))

<All keys matched successfully>

In [6]:
from math import ceil


class audioDataset(Dataset):
    def __init__(self, names, values, window=300, step=200):
        self.data = values
        self.lenghts_of_seq = len(self.data[0])
#         self.y = y
        
        self.names = names
        self.window = window
        self.step = step
        
        self.len = ceil((self.lenghts_of_seq - self.window) / self.step) + 1
        
    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        if idx == self.len-1:
            stride = [torch.tensor(i[-self.window::]) for i in self.data]
#             y = self.y[-self.window::]
        
        else:
            stride = [torch.tensor(i[idx*self.step:idx*self.step + self.window]) for i in self.data]
#             y = self.y[idx*self.step:idx*self.step + self.window]
            
        r_dict = dict([(i, j.to(device)) for i, j in zip(self.names, stride)])
        
        return r_dict, len(self.data[0])

## Find threshold

In [7]:
DATA_DIR = '/home/HDD6TB/datasets/emotions/ABAW/ABAW_6/6th_ABAW_Annotations'

In [8]:
def predict(data, model):
    pred_labels_val = []

    a_dataset = audioDataset(['frames'],
                     data, window=300, step=300)
    training_loader = DataLoader(a_dataset, batch_size=64, shuffle=False)

    model.eval()
    with torch.no_grad():
        for i, vdata in enumerate(training_loader):
            vinputs, d_len = vdata
            voutputs = model(vinputs)
            pred_labels_val += voutputs.tolist()
    
    d_len = d_len[0]

    s = 300 - len(pred_labels_val) + d_len
    return pred_labels_val[:-300]+pred_labels_val[-s:]

In [9]:
data_dir=os.path.join(DATA_DIR,'faces')
dirpath=os.path.join(DATA_DIR,'AU_Detection_Challenge/Validation_Set')
test_videos={}
for filename in tqdm(os.listdir(dirpath)):
    fn, ext = os.path.splitext(os.path.basename(filename))
    if ext.lower()=='.txt':
        X,indices,expressions=[],[],[]
        w2v2large_t, openl3_t, w2v2hub_t = [], [], []
        with open(os.path.join(dirpath,filename)) as f:
            lines = f.read().splitlines()
            prev_val=None
            for i,line in enumerate(lines):
                if i>0:
                    splitted_line=line.split(',')
                    aus=list(map(int,splitted_line))
                    if min(aus)>=0:
                        imagename=fn+'/'+str(i).zfill(5)+'.jpg'
                        if imagename in data:
                            X.append(data[imagename]['frame'][0])
#                             w2v2large_t.append(data[imagename]['w2v2large']) 
#                             openl3_t.append(data[imagename]['openl3']) 
#                             w2v2hub_t.append(data[imagename]['w2v2hub'])

                            indices.append(i)
                            expressions.append(aus)
                        
        test_videos[fn]=(predict([X], model),indices,np.array(expressions))
print(len(test_videos))

  stride = [torch.tensor(i[idx*self.step:idx*self.step + self.window]) for i in self.data]
100%|██████████| 105/105 [00:59<00:00,  1.77it/s]

105





In [10]:
thresh = np.array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])

In [11]:
for i in tqdm(range(12)):
    best_j, best_f1 = 0, 0
    for j in np.linspace(0,1,11):
        thresh[i] = j
        predicts = []
        y_true = []
        for filename in os.listdir(dirpath):
            fn, ext = os.path.splitext(os.path.basename(filename))
            predicts+=(np.array(test_videos[fn][0]) >= thresh[None,:]).tolist()
            y_true += test_videos[fn][2].tolist()
            
        f1 = f1_score(y_true,
                        predicts,
                        average='macro')
        
        if f1 > best_f1:
            best_f1 = f1
            best_j = j
            
    thresh[i] = best_j 

100%|██████████| 12/12 [07:24<00:00, 37.07s/it]


In [13]:
thresh[thresh <= 0.1] = 0.2
thresh

array([0.4, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.2, 0.3, 0.4, 0.2])

In [14]:
predicts = []
y_true = []
for filename in os.listdir(dirpath):
    fn, ext = os.path.splitext(os.path.basename(filename))
    predicts+=(np.array(test_videos[fn][0]) >= thresh[None,:]).tolist()
    y_true += test_videos[fn][2].tolist()
    
f1_score(y_true,
         predicts,
         average='macro')

0.5495948264673347

## Smoothing on validation

In [15]:
deltas=[0,3,5,7,10,15,20,25,50]
total_true=[]
total_preds=[[] for _ in range(len(deltas))]
for videoname,(y_pred_aus,indices,y_true) in test_videos.items():
    for i,ind in enumerate(indices):
        if min(y_true[i])>=0:
            total_true.append(y_true[i])
    cur_ind=0
    preds=[]
    for i in range(indices[-1]):
        if indices[cur_ind]-1==i:
            preds.append(y_pred_aus[cur_ind])
            cur_ind+=1
        else:
            if cur_ind==0:
                preds.append(y_pred_aus[cur_ind])
            else:
                w=(i-indices[cur_ind-1]+1)/(indices[cur_ind]-indices[cur_ind-1])
                y_pred_aus = np.array(y_pred_aus)
                pred=w*y_pred_aus[cur_ind-1]+(1-w)*y_pred_aus[cur_ind]
                preds.append(pred)
    
    preds=np.array(preds)
    for hInd,delta in enumerate(deltas):
        cur_preds=[]
        for i in range(len(preds)):
            i1=max(i-delta,0)
            pred=np.mean(preds[i1:i+delta+1],axis=0)
            aus=(pred>=thresh)*1
            #aus=(pred>=0.5)*1
            cur_preds.append(aus)
        for i,ind in enumerate(indices):
            if min(y_true[i])>=0:
                total_preds[hInd].append(cur_preds[ind-1])
    

total_true=np.array(total_true)
for hInd,delta in enumerate(deltas):
    preds=np.array(total_preds[hInd])
    f1scores=[f1_score(y_true=total_true[:,i],y_pred=preds[:,i]) for i in range(preds.shape[1])]
    print(delta,np.mean(f1scores),f1scores)

0 0.5495948264673347 [0.581114237110704, 0.4666320863327556, 0.6041993147228347, 0.6352552654421376, 0.7495249497490196, 0.7534327962096743, 0.7448095188107877, 0.30390911129381865, 0.21175821577744502, 0.2488874054294615, 0.8423092968775072, 0.45330571985187007]
3 0.5506883034396232 [0.582698033570003, 0.46840849139555335, 0.6043185365179083, 0.6373722511631607, 0.7497643164079627, 0.7534411035542563, 0.7460587655867706, 0.3091184400044318, 0.20933788074797127, 0.2503534107603182, 0.8416181457546869, 0.4557702658124555]
5 0.5504932707588317 [0.5828912412637429, 0.46990330076709724, 0.6043566329507241, 0.6384761451469765, 0.7493180182719116, 0.7522121246567391, 0.7464266185847466, 0.30894428152492664, 0.20498593310409502, 0.2517351030001383, 0.8404647145505083, 0.45620513528437323]
7 0.550203256585771 [0.5833292497672368, 0.4714679651206121, 0.6044538429406852, 0.639046700783703, 0.7490246434621058, 0.7510542734609397, 0.7461824073448998, 0.3103122730573711, 0.20026597825236644, 0.2524

## test predict

In [16]:
test_set = []

def get_names(dirname):
    
    names = []
    with open(os.path.join(dirname)) as f:
        lines = f.read().splitlines()
        
        for i,line in enumerate(lines):
            if i>0:
                name = line[:-1]
                names.append(name)
                
    print(len(names))
    return names

test_set = get_names(test_dir)

729736


In [17]:
keys = data.keys()
c = 0
missed = []

for k in test_set:
    if k in keys:
        c+=1
        
    else:
        missed.append(k)
        
c

718153

In [18]:
with open(os.path.join(test_dir),'r') as f:
    test_set_sample=f.read().splitlines()
print(len(test_set_sample),test_set_sample[:5])

test_set_videos={}
for s in test_set_sample[1:]:
    videoname,img_name=s[:-1].split('/')
    if videoname not in test_set_videos:
        test_set_videos[videoname]=[]
    test_set_videos[videoname].append(img_name)
    
print(len(test_set_videos))

729737 ['image_location,AU1,AU2,AU4,AU6,AU7,AU10,AU12,AU15,AU23,AU24,AU25,AU26', '2-30-640x360/00001.jpg,', '2-30-640x360/00002.jpg,', '2-30-640x360/00003.jpg,', '2-30-640x360/00004.jpg,']
141


In [19]:
datasets = {}
for k in tqdm(test_set_videos.keys()):
    w2v2large_t, openl3_t, w2v2hub_t = [], [], []
    X_t = []
    for images in test_set_videos[k]:
        key = f'{k}/{images}'
        if key in missed: continue
        X_t.append(data[key]['frame'][0])
#         w2v2large_t.append(data[key]['w2v2large'])
#         openl3_t.append(data[key]['openl3'])
#         w2v2hub_t.append(data[key]['w2v2hub'])
    
    a_dataset = audioDataset(['frames'],
                     [X_t], window=300, step=300)
    training_loader = DataLoader(a_dataset, batch_size=64, shuffle=False)
    datasets[k] = training_loader

100%|██████████| 141/141 [01:07<00:00,  2.09it/s]


In [20]:
def predict(loader, model):
    pred_labels_val = []
#     predicts = []

    model.eval()
    with torch.no_grad():
        for i, vdata in enumerate(loader):
            vinputs, d_len = vdata
            voutputs = model(vinputs)
            pred_labels_val += voutputs.data.tolist()
            
#             _, predicted = torch.max(voutputs.data, 1)
#             predicts += predicted.tolist()
    
    d_len = d_len[0]
    s = 300 - len(pred_labels_val) + d_len
    return pred_labels_val[:-300]+pred_labels_val[-s:]

In [21]:
test_videos={}
test_videos_num_frames={}
for videoname,img_files in tqdm(test_set_videos.items()):
    X,indices,filenames,scores=[],[],[],[]
    num_present=num_missed=0
    for img_name in img_files:
        k=videoname+'/'+img_name
        if k in data:
            indices.append(int(img_name[:-4]))
#             print()
            filenames.append(k)
            num_present+=1
        else:
            num_missed+=1
    test_videos[videoname]=(predict(datasets[videoname], model),indices,filenames)
    test_videos_num_frames[videoname]=(num_present,num_missed)
    
#     del datasets[videoname]
#     torch.cuda.empty_cache()
    
print(len(test_videos))

100%|██████████| 141/141 [01:29<00:00,  1.57it/s]

141





In [22]:
def write_au_results(res_filename):
    with open(os.path.join(res_filename), 'w') as f:
        f.write(test_set_sample[0]+'\n')
        for videoname,(y_pred_au,indices,filenames) in test_videos.items():
            cur_ind=0
            preds=[]
            for i in range(indices[-1]):
                if indices[cur_ind]-1==i:
                    preds.append(y_pred_au[cur_ind])
                    cur_ind+=1
                else:
                    if cur_ind==0:
                        preds.append(y_pred_au[cur_ind])
                    else:
                        w=(i-indices[cur_ind-1]+1)/(indices[cur_ind]-indices[cur_ind-1])
                        y_pred_au = np.array(y_pred_au)
                        pred=w*y_pred_au[cur_ind-1]+(1-w)*y_pred_au[cur_ind]
                        preds.append(pred)

            pred=y_pred_au[cur_ind-1]
            for _ in range(indices[-1],len(test_set_videos[videoname])):
                preds.append(pred)

            preds=np.array(preds)
            for i,img_name in enumerate(test_set_videos[videoname]):
                i1=max(i-delta,0)
                pred=np.mean(preds[i1:i+delta+1],axis=0)
                aus=(pred>=thresh)*1
                f.write(videoname+'/'+img_name+','+','.join(map(str,aus))+'\n')
delta = 3
res_filename='au_predictions/predictions_au_tcn_only_video.txt'
write_au_results(res_filename)