In [1]:
import torch

import numpy as np
import torch
import os
import argparse

from torch.utils.data import DataLoader, Dataset, Subset

from utils import seed_torch, seed_worker
from train import create_parser
from models import *
from modules import *

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
parser = create_parser()

# In a notebook, always call parser.parse_args([]) with an explicit empty list.
# A bare parser.parse_args() falls back to sys.argv, which holds the Jupyter
# kernel's own command line rather than this project's options, so argparse
# aborts with an error.
# Details: https://stackoverflow.com/questions/50360012/python-argparse-error-error-argument-count-invalid-int-value
# Six independent namespaces are parsed so each one can be adjusted on its own later.
args_1, args_2, args_3, args_4, args_enc, args_dec = (
    parser.parse_args([]) for _ in range(6)
)

# Give every namespace the same overrides on top of the parser defaults.
for args in (args_1, args_2, args_3, args_4, args_enc, args_dec):
    vars(args).update(
        dim_hidden=1024,
        LR_pos_weight=0.2442,
        dim_in=4099,
        dim_out=583,
        batch_size=256,
        device=torch.device("cuda" if torch.cuda.is_available() else 'cpu'),
    )

### 1. Module Parameters

In [6]:
# The standalone encoder/decoder modules measured in this section use a hidden
# size of 448 instead of the 1024 assigned to every namespace earlier.
args_enc.dim_hidden = args_dec.dim_hidden = 448

In [7]:
# Build the recurrent encoder and two attention decoders.
# Both decoders are constructed from the same args_dec namespace.
enc_rnn = EncoderRNN(args_enc)
dec_rnn_LR = AttnDecoderRNN(args_dec)
dec_rnn_noLR = AttnDecoderRNN(args_dec)

# Total number of scalar parameters per module.
num_total_enc_rnn, num_total_dec_rnn_LR, num_total_dec_rnn_noLR = (
    sum(weight.numel() for weight in module.parameters())
    for module in (enc_rnn, dec_rnn_LR, dec_rnn_noLR)
)

# Same count restricted to parameters that require gradients.
num_trainable_enc_rnn, num_trainable_dec_rnn_LR, num_trainable_dec_rnn_noLR = (
    sum(weight.numel() for weight in module.parameters() if weight.requires_grad)
    for module in (enc_rnn, dec_rnn_LR, dec_rnn_noLR)
)

In [8]:
# Total parameter counts: RNN encoder, attention decoder (LR), attention decoder (noLR).
num_total_enc_rnn, num_total_dec_rnn_LR, num_total_dec_rnn_noLR

(4251520, 4007535, 4007535)

In [10]:
# Trainable (requires_grad) parameter counts for the same three RNN modules.
num_trainable_enc_rnn, num_trainable_dec_rnn_LR, num_trainable_dec_rnn_noLR

(4251520, 4007535, 4007535)

In [11]:
# Build the Transformer encoder and two Transformer decoders.
# Both decoders are constructed from the same args_dec namespace.
enc_trans = EncoderTrans(args_enc)
dec_trans_LR = DecoderTrans(args_dec)
dec_trans_noLR = DecoderTrans(args_dec)

# Total number of scalar parameters per module.
num_total_enc_trans, num_total_dec_trans_LR, num_total_dec_trans_noLR = (
    sum(weight.numel() for weight in module.parameters())
    for module in (enc_trans, dec_trans_LR, dec_trans_noLR)
)

# Same count restricted to parameters that require gradients.
num_trainable_enc_trans, num_trainable_dec_trans_LR, num_trainable_dec_trans_noLR = (
    sum(weight.numel() for weight in module.parameters() if weight.requires_grad)
    for module in (enc_trans, dec_trans_LR, dec_trans_noLR)
)

In [12]:
# Total parameter counts: Transformer encoder, decoder (LR), decoder (noLR).
num_total_enc_trans, num_total_dec_trans_LR, num_total_dec_trans_noLR

(9769408, 11414143, 11414143)

### 2. Model Parameters

In [13]:
# Check the hidden size carried by the four full-model namespaces before building the models.
args_1.dim_hidden, args_2.dim_hidden, args_3.dim_hidden, args_4.dim_hidden

(1024, 1024, 1024, 1024)

In [14]:
# Build the four Seq2Seq variants, one per namespace.
# NOTE(review): the following cell repeats these four constructions verbatim and
# rebinds the same names, so one of the two builds appears to be redundant.
seq2seq_no_fact = Seq2SeqNoFact(args_1)
seq2seq_fact = Seq2SeqFact(args_2)
seq2seq_naive_2enc = Seq2SeqFactNaive_2enc(args_3)
seq2seq_naive = Seq2SeqFactNaive(args_4)

In [15]:
# Build the four Seq2Seq variants, one per namespace.
seq2seq_no_fact = Seq2SeqNoFact(args_1)
seq2seq_fact = Seq2SeqFact(args_2)
seq2seq_naive_2enc = Seq2SeqFactNaive_2enc(args_3)
seq2seq_naive = Seq2SeqFactNaive(args_4)

# Total number of parameters of each model.
(
    num_total_seq2seq_no_fact,
    num_total_seq2seq_fact,
    num_total_seq2seq_naive_2enc,
    num_total_seq2seq_naive,
) = (
    sum(weight.numel() for weight in model.parameters())
    for model in (seq2seq_no_fact, seq2seq_fact, seq2seq_naive_2enc, seq2seq_naive)
)

# Number of trainable (requires_grad) parameters of each model.
(
    num_trainable_seq2seq_no_fact,
    num_trainable_seq2seq_fact,
    num_trainable_seq2seq_naive_2enc,
    num_trainable_seq2seq_naive,
) = (
    sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
    for model in (seq2seq_no_fact, seq2seq_fact, seq2seq_naive_2enc, seq2seq_naive)
)

In [16]:
# Total parameter counts: non-factorized, factorized, naive (2 encoders), naive.
num_total_seq2seq_no_fact, num_total_seq2seq_fact, num_total_seq2seq_naive_2enc, num_total_seq2seq_naive

(30972463, 59326094, 76121741, 92917389)

In [17]:
# Trainable (requires_grad) parameter counts for the same four Seq2Seq models.
num_trainable_seq2seq_no_fact, num_trainable_seq2seq_fact, num_trainable_seq2seq_naive_2enc, num_trainable_seq2seq_naive

(30972463, 59326094, 76121741, 92917389)

In [18]:
# Build the three Transformer variants from the first three namespaces.
trans_no_fact = TransNoFact(args_1)
trans_fact = TransFact(args_2)
trans_naive = TransFactNaive(args_3)

# Total number of parameters of each model.
num_total_trans_no_fact, num_total_trans_fact, num_total_trans_naive = (
    sum(weight.numel() for weight in model.parameters())
    for model in (trans_no_fact, trans_fact, trans_naive)
)

# Number of trainable (requires_grad) parameters of each model.
num_trainable_trans_no_fact, num_trainable_trans_fact, num_trainable_trans_naive = (
    sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
    for model in (trans_no_fact, trans_fact, trans_naive)
)

In [19]:
# Total parameter counts: non-factorized, factorized, naive Transformer models.
num_total_trans_no_fact, num_total_trans_fact, num_total_trans_naive

(69788735, 150566078, 209366205)

### 3. Tune the hidden dimension of the non-factorized models to match the parameter count of the factorized models

In [50]:
# Widen the hidden size used for the non-factorized Seq2Seq model.
# NOTE(review): this mutates args_1 in place. The earlier cells that built
# Seq2SeqNoFact / TransNoFact from args_1 ran with dim_hidden = 1024, so re-running
# them after this cell gives different counts. Also, the recorded execution counts
# in this section are out of order (50, 52, 53, then 44, 45, 46) - verify with
# Restart & Run All.
args_1.dim_hidden = 1536

In [52]:
# Rebuild the non-factorized Seq2Seq model with the current args_1 settings.
new_seq2seq_no_fact = Seq2SeqNoFact(args_1)

# Total number of parameters of the rebuilt model.
new_num_total_seq2seq_no_fact = sum(
    weight.numel() for weight in new_seq2seq_no_fact.parameters()
)

# Number of trainable (requires_grad) parameters of the rebuilt model.
new_num_trainable_seq2seq_no_fact = sum(
    weight.numel()
    for weight in new_seq2seq_no_fact.parameters()
    if weight.requires_grad
)

In [53]:
# Rebuilt non-factorized Seq2Seq count first, then the four counts computed earlier for comparison.
new_num_total_seq2seq_no_fact, num_total_seq2seq_no_fact, num_total_seq2seq_fact, num_total_seq2seq_naive_2enc, num_total_seq2seq_naive

(63417391, 30972463, 59326094, 76121741, 92917389)

In [44]:
# Widen the hidden size used for the non-factorized Transformer model.
# NOTE(review): this reuses args_1 and overwrites the 1536 set for the Seq2Seq
# experiment above; re-running the Seq2Seq rebuild cell after this one would
# construct it with dim_hidden = 1664.
args_1.dim_hidden = 1664

In [45]:
# Rebuild the non-factorized Transformer model with the current args_1 settings.
new_trans_no_fact = TransNoFact(args_1)

# Total number of parameters of the rebuilt model.
new_num_total_trans_no_fact = sum(
    weight.numel() for weight in new_trans_no_fact.parameters()
)

# Number of trainable (requires_grad) parameters of the rebuilt model.
new_num_trainable_trans_no_fact = sum(
    weight.numel()
    for weight in new_trans_no_fact.parameters()
    if weight.requires_grad
)

In [46]:
# Rebuilt non-factorized Transformer count first, then the three counts computed earlier for comparison.
new_num_total_trans_no_fact, num_total_trans_no_fact, num_total_trans_fact, num_total_trans_naive

(152589375, 69788735, 150566078, 209366205)