In [1]:
import config
from model import MSQL
from dataset import CustomDataset, collate_fn
from utils import read_data, read_tables

import os 
import datetime
from shutil import copyfile
from time import time 

import torch
from torch.utils.data import DataLoader
import transformers

from torch.utils.tensorboard import SummaryWriter


os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES      # specify GPU usage  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()
transformers.logging.set_verbosity_error()

In [2]:
### Load data
print('Loading data...')
train_tables = read_tables(config.train_table_file)
train_data = read_data(config.train_data_file, train_tables)
val_tables = read_tables(config.val_table_file)
val_data = read_data(config.val_data_file, val_tables)
print(f'{len(train_data)} samples and {len(train_tables)} tables in the training set')
print(f'{len(val_data)} samples and {len(val_tables)} tables in the validation set')
print('Loading finished.')

### Create dataset
print('Creating dataset...')
train_set = CustomDataset(train_data)
val_set = CustomDataset(val_data)
train_dataloader = DataLoader(
    dataset=train_set, 
    batch_size=config.BATCH_SIZE, 
    shuffle=True, 
    num_workers=config.NUM_WORKERS,
    pin_memory=True,
    collate_fn=collate_fn
)
val_dataloader = DataLoader(
    dataset=val_set, 
    batch_size=config.BATCH_SIZE, 
    shuffle=False, 
    num_workers=config.NUM_WORKERS,
    pin_memory=True,
    collate_fn=collate_fn
)

Loading data...
41522 samples and 5013 tables in the training set
4396 samples and 1197 tables in the validation set
Loading finished.
Creating dataset...


In [9]:
S_num = {0:0, 1:0, 2:0}
for sample in train_set:
    label = sample['S_num'].item()
    S_num[label]+=1
S_num

{0: 36912, 1: 4607, 2: 3}

In [8]:
W_num_op = dict()
for sample in train_set:
    label = sample['W_num_op'].item()
    if label in W_num_op.keys():
        W_num_op[label]+=1
    else:
        W_num_op[label]=1
W_num_op

{1: 8557, 2: 15542, 0: 16429, 4: 754, 3: 201, 6: 36, 5: 3}

In [10]:
config.W_num_op_id2label, config.S_num_id2label

({0: 'NULL-1',
  1: 'OR-2',
  2: 'AND-2',
  3: 'OR-3',
  4: 'AND-3',
  5: 'OR-4',
  6: 'AND-4'},
 {0: 1, 1: 2, 2: 3})

In [18]:
new_train_data = []

for idx, sample in enumerate(train_data):
    W_num_op_label = sample.sql.conn_sql_dict[sample.sql.cond_conn_op] + '-' + str(len(sample.sql.conds)) 
    S_num_label = len(sample.sql.sel)
    if S_num_label!=2 and W_num_op_label!='OR-4' and W_num_op_label!='AND-4' :
        new_train_data.append(sample)

print(len(new_train_data))

36876


In [9]:
train_data[0].sql

In [3]:
import torch
x = torch.tensor([1, 2, 3])
x.repeat(4,1)

tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])

In [2]:
import torch

a = torch.randn(4, 4)

torch.sum(a)

tensor(0.3024)

In [2]:
def mask_acc(logits, target, mask):
    """
    Args:
        logits: (batch, max_headers, num_classes),  not yet passed to Softmax 
        target: (batch, max_headers) 
        mask  : (batch_size, max_headers)

    """
    logits_softmax = torch.softmax(logits, dim = -1)        # (batch, max_len, num_classes)
    # print(logits_softmax)
    _, y_pred_tags = torch.max(logits_softmax, dim = -1)    # (batch, max_len)
    # print(y_pred_tags)
    correct_pred = (y_pred_tags == target).float()          # (batch, max_len)
    correct_pred = correct_pred * mask.float()              # (batch, max_len)
    acc = correct_pred.sum() / torch.sum(mask)          
    return acc


In [3]:
import torch
logits = [[[0.1, 0.2, 0.9], [0.1, 0.2, 0.9],[0.1, 0.2, 0.9]],[[0.1, 0.2, 0.9], [0.1, 0.2, 0.9],[0.1, 0.2, 0.9]]]
logits = torch.tensor(logits)
print(logits)
print(logits.shape)

tensor([[[0.1000, 0.2000, 0.9000],
         [0.1000, 0.2000, 0.9000],
         [0.1000, 0.2000, 0.9000]],

        [[0.1000, 0.2000, 0.9000],
         [0.1000, 0.2000, 0.9000],
         [0.1000, 0.2000, 0.9000]]])
torch.Size([2, 3, 3])


In [4]:
# pred = [[2,2,2],[2,2,2]]

target = [[2,0,2],[2,0,2]]
target = torch.tensor(target)
print(target)
print(target.shape)

tensor([[2, 0, 2],
        [2, 0, 2]])
torch.Size([2, 3])


In [5]:
mask = [[1,1,0],[0,0,0]]
mask = torch.tensor(mask)
print(mask)
print(mask.shape)

tensor([[1, 1, 0],
        [0, 0, 0]])
torch.Size([2, 3])


In [6]:
mask_acc(logits, target, mask)

tensor(0.5000)

In [7]:

def masked_ce_loss(logits, target, mask):
    """
    Args:
        logits: (batch, max_len, num_classes),  not yet passed to Softmax 
        target: (batch, max_len) 
        mask:   (batch_size, max_len)
    Returns:
        loss: An average loss value masked by the length.
    """

    logits_softmax = torch.softmax(logits, dim = -1)        # (batch, max_len, num_classes)
    logits_log_softmax = torch.log(logits_softmax)          # (batch, max_len, num_classes)
    target = torch.nn.functional.one_hot(target)            # (batch, max_len, num_classes)
    multi = -torch.multiply(logits_log_softmax, target)     # (batch, max_len, num_classes)
    sum_multi = multi.sum(-1)                               # (batch, max_len)
    sum_multi_masked = sum_multi * mask                     # (batch, max_len)
    sum_mask = mask.sum()                                   # (1,)
    sum_multi_masked = sum_multi_masked.sum()               # (1,)
    
    return sum_multi_masked/(sum_mask + 1e-9)
    
masked_ce_loss(logits, target, mask)

tensor(1.0657)

In [18]:
import psutil
def checkIfProcessRunning(processName):
    '''
    Check if there is any running process that contains the given name processName.
    '''
    #Iterate over the all the running process
    for proc in psutil.process_iter():
        try:
            for cmd in proc.cmdline():
            # Check if process name contains the given name string.
                if processName in cmd:
                    return True
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return False

In [20]:

checkIfProcessRunning('ccccc')


False