### run.py

In [79]:

import argparse
import train
import test
import evaluate
import os
import json

if __name__ == '__main__':
    
    #? IDK what to use these for. Were mentioned in run_experiment.sh
    GPU_IDX=0
    CUDA_DEVICE_ORDER="PCI_BUS_ID"
    CUDA_VISIBLE_DEVICES=GPU_IDX
    
    
    #? setup arg parser to pass in config info
    parser = argparse.ArgumentParser()
    parser.add_argument('--logdir', type=str, default='./log/', help='path to model save dir')
    parser.add_argument('--loglevel', type=str, default='info', help='set level of logger')
    parser.add_argument('--identifier', type=str, default='debug', help='unique run identifier')
    parser.add_argument('--config', type=str, default='./configs/dfaust/config_dfaust.yaml', help='path to yaml config file')
    parser.add_argument('--model_ckpt', type=str, default='000000.pt', help='checkpoint to load')
    parser.add_argument('--fix_random_seed', action='store_true', default=False, help='fix random seed')
    args = parser.parse_args([])


    #? Manually set arg parser values here:
    #? Comment out if you want to use terminal flags above instead (I just got tired of manually entering them in)
    
    args.logdir = './log/'
    args.loglevel = 'debug'
    args.identifier = f'config_msr_action3d_temporal_1'
    args.config = f'configs\msr-action3d\{args.identifier}.yaml'
    args.model_ckpt = '000000.pt'
    args.fix_random_seed = True
    print('args', args)


args Namespace(config='configs\\msr-action3d\\config_msr_action3d_temporal_1.yaml', fix_random_seed=True, identifier='config_msr_action3d_temporal_1', logdir='./log/', loglevel='debug', model_ckpt='000000.pt')


### train

In [80]:

# Author: Yizhak Ben-Shabat (Itzik), 2022
# train 3DInAction

import os
import yaml
import argparse
import i3d_utils as utils
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import json

from models.pointnet import feature_transform_regularizer
from models import build_model
from datasets import build_dataloader

#import wandb
from tqdm import tqdm

import logging

def create_basic_logger(logdir, level = 'info'):
    print(f'Using logging level {level} for train.py')
    global logger
    logger = logging.getLogger('train_logger')
    
    #? set logging level
    if level.lower() == 'debug':
        logger.setLevel(logging.DEBUG)
    elif level.lower() == 'info':
        logger.setLevel(logging.INFO)
    elif level.lower() == 'warning':
        logger.setLevel(logging.WARNING)
    elif level.lower() == 'error':
        logger.setLevel(logging.ERROR)
    elif level.lower() == 'critical':
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)
    
    #? create handlers
    file_handler = logging.FileHandler(os.path.join(logdir, "log_train.log"))
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    
    stream_handler = logging.StreamHandler()
    #stream_handler.setLevel(logging.INFO)
    #stream_handler.setFormatter(stream_handler)
    logger.addHandler(stream_handler)
    return logger

In [81]:
cfg = yaml.safe_load(open(args.config))
logdir = os.path.join(args.logdir, args.identifier)
os.makedirs(logdir, exist_ok=True)

logger = create_basic_logger(logdir = logdir, level = args.loglevel)

# TODO: move to cfg project_name, entity
if cfg['DATA'].get('name') == 'DFAUST':
    project_name = 'DFAUST'
elif cfg['DATA'].get('name') == 'IKEA_EGO':
    project_name = 'IKEA EGO'
elif cfg['DATA'].get('name') == 'IKEA_ASM':
    project_name = 'IKEA ASM'
elif cfg['DATA'].get('name') == 'MSR-Action3D':
    project_name = 'MSR-Action3D'
else:
    raise NotImplementedError

logger.info(f'=================== Starting training run for {args.identifier} with data {project_name}')
logger.info(cfg)


#wandb_run = wandb.init(project=project_name, entity='mkjohn', save_code=True)
#cfg['WANDB'] = {'id': wandb_run.id, 'project': wandb_run.project, 'entity': wandb_run.entity}

with open(os.path.join(logdir, 'config.yaml'), 'w') as outfile:
    yaml.dump(cfg, outfile, default_flow_style=False)
    
logger.info(f'saving outputs for this run too: {logdir}')

#wandb_run.name = args.identifier
#wandb.config.update(cfg)  # adds all the arguments as config variables
#wandb.run.log_code(".")
# define our custom x axis metric
#wandb.define_metric("train/step")
#wandb.define_metric("train/*", step_metric="train/step")
#wandb.define_metric("test/*", step_metric="train/step")

# need to add argparse
#run(cfg, logdir, args)



{'DATA': {'data_sampler': 'weighted', 'dataset_path': 'datasets\\data\\MSRAction3D_fps_new\\train', 'dataset_path_test': 'datasets\\data\\MSRAction3D_fps_new\\test', 'frame_skip': 1, 'frames_per_clip': 64, 'gender': 'all', 'n_points': 128, 'name': 'MSR-Action3D', 'noisy_data': {'test': False, 'train': False}, 'shuffle_points': 'fps_each_frame'}, 'MODEL': {'3DMFV': {'n_gaussians': 8}, 'P4TRANSFORMER': {'depth': 10, 'dim': 1024, 'dim_head': 256, 'dropout1': 0.05, 'dropout2': 0.5, 'emb_relu': False, 'heads': 8, 'mlp_dim': 2048, 'nsamples': 32, 'radius': 0.1, 'spatial_stride': 32, 'temporal_kernel_size': 3, 'temporal_stride': 2}, 'PSTNET': {'nsamples': 9, 'radius': 0.1}, 'PST_TRANSFORMER': {'depth': 5, 'dim': 1024, 'dim_head': 256, 'dropout1': 0.0, 'dropout2': 0.5, 'heads': 8, 'mlp_dim': 2048, 'nsamples': 32, 'radius': 0.1, 'spatial_stride': 32, 'temporal_kernel_size': 3, 'temporal_stride': 2}, 'SET_TRANSFORMER': {'dim_hidden': 1024, 'dim_input': 3, 'ln': False, 'num_heads': 8, 'num_inds':

Using logging level debug for train.py


In [82]:
n_epochs = cfg['TRAINING']['n_epochs']
lr = cfg['TRAINING']['lr']
batch_size = cfg['TRAINING']['batch_size']
refine, refine_epoch = cfg['TRAINING']['refine'], cfg['TRAINING']['refine_epoch']
pretrained_model = cfg['TRAINING']['pretrained_model']
pc_model = cfg['MODEL']['pc_model']
frames_per_clip = cfg['DATA']['frames_per_clip']
num_steps_per_update = cfg['TRAINING']['steps_per_update']
save_every = cfg['save_every']

if args.fix_random_seed:
    seed = cfg['seed']
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

back_file = os.path.join(logdir, 'train.py')
models_backup_path = os.path.join(logdir, 'models')
os.makedirs(models_backup_path, exist_ok=True)

__file__ = r'F:\Classes\COMPSCI 674 - Intelligent Visual Computing\Project\CS-674-Final-Project-3dInAction\train.ipynb'

if os.name == 'nt':
    os.system(f'copy "{__file__}" "{back_file}"') # backup the current training file
    for file in os.listdir('models'):
        if file.endswith('.py'):
            current_file = os.path.join('models', file)
            new_file=os.path.join(models_backup_path, file)
            os.system(f'copy "{current_file}" "{new_file}"')
else:
    os.system(f'cp "{__file__}" "{back_file}"') # backup the current training file
    os.system(f'cp "models/*.py" "{models_backup_path}"')  # backup the models files
    
logger.debug(f'backed up the current training file: {(__file__, back_file)}')
logger.debug(f'backup the models files: models/*.py, {models_backup_path}')

# build dataloader and dataset
train_dataloader, train_dataset = build_dataloader(config=cfg, training=True, shuffle=False, logger=logger) # should be unshuffled because of sampler
test_dataloader, test_dataset = build_dataloader(config=cfg, training=False, shuffle=True, logger=logger)
num_classes = train_dataset.num_classes

# build model
model = build_model(cfg['MODEL'], num_classes, frames_per_clip)

if pretrained_model is not None:
    logger.info('Loading pretrained model')
    checkpoints = torch.load(pretrained_model)
    model.load_state_dict(checkpoints["model_state_dict"])  # load trained model
    model.replace_logits(num_classes)

if refine:
    if refine_epoch == 0:
        raise ValueError("You set the refine epoch to 0. No need to refine, just retrain.")
    logger.info('Refining model')
    refine_model_filename = os.path.join(logdir, str(refine_epoch).zfill(6)+'.pt')
    checkpoint = torch.load(refine_model_filename)
    model.load_state_dict(checkpoint["model_state_dict"])

model.cuda()
model = nn.DataParallel(model)
#best_model_trained_yet_and_its_accuracy = [None, 51.25]

# define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1E-6)
lr_sched = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.5)

if refine:
    lr_sched.load_state_dict(checkpoint["lr_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])



backed up the current training file: ('F:\\Classes\\COMPSCI 674 - Intelligent Visual Computing\\Project\\CS-674-Final-Project-3dInAction\\train.ipynb', './log/config_msr_action3d_temporal_1\\train.py')
backed up the current training file: ('F:\\Classes\\COMPSCI 674 - Intelligent Visual Computing\\Project\\CS-674-Final-Project-3dInAction\\train.ipynb', './log/config_msr_action3d_temporal_1\\train.py')
backed up the current training file: ('F:\\Classes\\COMPSCI 674 - Intelligent Visual Computing\\Project\\CS-674-Final-Project-3dInAction\\train.ipynb', './log/config_msr_action3d_temporal_1\\train.py')
backed up the current training file: ('F:\\Classes\\COMPSCI 674 - Intelligent Visual Computing\\Project\\CS-674-Final-Project-3dInAction\\train.ipynb', './log/config_msr_action3d_temporal_1\\train.py')
backed up the current training file: ('F:\\Classes\\COMPSCI 674 - Intelligent Visual Computing\\Project\\CS-674-Final-Project-3dInAction\\train.ipynb', './log/config_msr_action3d_temporal_1\\t

In [83]:
steps = 0
n_examples = 0
train_num_batch = len(train_dataloader)
test_num_batch = len(test_dataloader)
refine_flag = True

train_log_dict = {}
test_log_dict = {}

pbar = tqdm(total=n_epochs, desc='Training', dynamic_ncols=True)

if refine:
    pbar.update(refine_epoch)

best_acc = 0
best_results = None

train_log_dict = {}
test_log_dict = {}
train_result_list = []
test_result_list = []
best_model_list = []

while steps <= n_epochs:
    if steps <= refine_epoch and refine and refine_flag:
        # lr_sched.step()
        steps += 1
        n_examples += len(train_dataset.clip_set)
        continue
    else:
        refine_flag = False
    # Each epoch has a training and
    
    test_batchind = -1
    test_fraction_done = 0.0
    test_enum = enumerate(test_dataloader, 0)
    tot_loss = 0.0
    tot_loc_loss = 0.0
    tot_cls_loss = 0.0
    num_iter = 0
    optimizer.zero_grad()

    # Iterate over data.
    avg_acc = []
        
    print('START TRAIN FOR EPOCH') 
    
    for train_batchind, data in enumerate(train_dataloader):
        num_iter += 1
        print(train_batchind, num_iter)
        
        train_fraction_done = (train_batchind + 1) / train_num_batch
        if num_iter == num_steps_per_update or train_batchind == len(train_dataloader)-1:
            print(f'----- num_iter ({num_iter}) == num_steps_per_update ({num_steps_per_update}) or train_batchind ({train_batchind}) == len(train_dataloader)-1 ({len(train_dataloader)-1})')
            n_steps = num_steps_per_update
            print(f'----- n_steps ({n_steps})')
            if train_batchind == len(train_dataloader)-1:
                print(f'-----++++++ train_batchind ({train_batchind}) == len(train_dataloader)-1 ({len(train_dataloader)-1})')
                n_steps = num_iter
                
            if test_fraction_done <= train_fraction_done and test_batchind + 1 < test_num_batch:
                print(f'>>>>> test_fraction_done ({test_fraction_done}) <= train_fraction_done ({train_fraction_done}) and test_batchind ({test_batchind + 1}) + 1 < test_num_batch ({test_num_batch})')
                test_batchind, data = next(test_enum)
                print(f'>>>>>+++++++ test_batchind ({test_batchind})')
                print(f'>>>>>+++++++ data ({len(data)})')
        
    print('FINISH TRAIN FOR EPOCH') 
    #print(steps, n_epochs)
    steps += 1
    

Training:   0%|          | 0/800 [00:28<?, ?it/s]


START TRAIN FOR EPOCH
0 1
----- num_iter (1) == num_steps_per_update (32) or train_batchind (0) == len(train_dataloader)-1 (0)
----- n_steps (32)
-----++++++ train_batchind (0) == len(train_dataloader)-1 (0)
>>>>> test_fraction_done (0.0) <= train_fraction_done (1.0) and test_batchind (0) + 1 < test_num_batch (26)
>>>>>+++++++ test_batchind (0)
>>>>>+++++++ data (4)
FINISH TRAIN FOR EPOCH
START TRAIN FOR EPOCH
0 1
----- num_iter (1) == num_steps_per_update (32) or train_batchind (0) == len(train_dataloader)-1 (0)
----- n_steps (32)
-----++++++ train_batchind (0) == len(train_dataloader)-1 (0)
>>>>> test_fraction_done (0.0) <= train_fraction_done (1.0) and test_batchind (0) + 1 < test_num_batch (26)
>>>>>+++++++ test_batchind (0)
>>>>>+++++++ data (4)
FINISH TRAIN FOR EPOCH
START TRAIN FOR EPOCH
0 1
----- num_iter (1) == num_steps_per_update (32) or train_batchind (0) == len(train_dataloader)-1 (0)
----- n_steps (32)
-----++++++ train_batchind (0) == len(train_dataloader)-1 (0)
>>>>> te