In [None]:
!pip install torch, sklearn
!pip install tensorboard
!pip install keras tensorflow
!pip install transformers == 3.1.0
!pip install multimodal-transformers
!pip install --no-cache-dir tensorflow-gpu

## Load model

In [1]:
import os
import random
import pandas as pd
import numpy as np
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional
import tensorflow as tf
import seaborn as sns
from datetime import datetime
from tensorflow import keras
from transformers.training_args import TrainingArguments
from transformers import AutoTokenizer, AutoConfig
from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig
from transformers import AutoConfig
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## Load test data

In [11]:
text_cols = ['text']
cat_cols = ['nationality', 'marital_status', 'income_group','employment_group','age_decade']
num_cols = ['avg_res_realtor']

column_info = {
    'text_cols': text_cols,
    'num_cols': num_cols,
    'cat_cols': cat_cols,
    'label_col': 'feedback_realtor',
    'label_list': ['dismiss','reply']
}

tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-german-cased'
)


In [None]:
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    '../MMBT/Datasets/',
    column_info['text_cols'],
    tokenizer,
    label_col=column_info['label_col'],
    label_list=column_info['label_list'],
    categorical_cols=column_info['cat_cols'],
    numerical_cols=column_info['num_cols'],
    #sep_text_token_str=',',
)

In [13]:
print('train_dataset: ', train_dataset.df.shape)
print('val_dataset: ', val_dataset.df.shape)
print('test_dataset: ', test_dataset.df.shape)
del train_dataset
del val_dataset
# del test_dataset
# print(train_dataset.df.size)


train_dataset:  (625410, 38)
val_dataset:  (78176, 38)
test_dataset:  (78177, 38)


## Load the Multimodal DistilBERT and Predict from test

In [14]:
from multimodal_transformers.model import (
    DistilBertWithTabular,
)

In [15]:
@dataclass
class ModelArguments:
  """
  Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
  """

  model_name_or_path: str = field(
      metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
  )
  config_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
  )
  tokenizer_name: Optional[str] = field(
      default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
  )
  cache_dir: Optional[str] = field(
      default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
  )
    
    
@dataclass
class MultimodalDataTrainingArguments:
  """
  Arguments pertaining to how we combine tabular features
  Using `HfArgumentParser` we can turn this class
  into argparse arguments to be able to specify them on
  the command line.
  """

  data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
  column_info_path: str = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
  })

  column_info: dict = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
  })

  categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
  numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
  task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

  mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
  combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
  mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
  numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
  use_simple_classifier: str = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
  mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
  gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

  def __post_init__(self):
      assert self.column_info != self.column_info_path
      if self.column_info is None and self.column_info_path:
          with open(self.column_info_path, 'r') as f:
              self.column_info = json.load(f)

In [16]:
column_info_dict = {
    'text_cols': text_cols,
    'num_cols': num_cols,
    'cat_cols': cat_cols,
    'label_col': 'feedback_realtor',
    'label_list': ['dismiss','reply']
}

model_args = ModelArguments(
    model_name_or_path='../MMBT/logs/model_name/saved_model_200seq_weights_2epochs/',
    tokenizer_name ='distilbert-base-german-cased'
)

tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
)

num_labels = len(np.unique(test_dataset.labels))

config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
tabular_config = TabularConfig(num_labels=num_labels,
                               cat_feat_dim=test_dataset.cat_feats.shape[1],
                               numerical_feat_dim=test_dataset.numerical_feats.shape[1])
config.tabular_config = tabular_config


data_args = MultimodalDataTrainingArguments(
    data_path='../MMBT/Datasets/',
    combine_feat_method='weighted_feature_sum_on_transformer_cat_and_numerical_feats',
    column_info=column_info_dict,
    task='classification'
)

training_args = TrainingArguments(
    output_dir="./logs/model_name/mmbt_attention_2epoch/",
    logging_dir="./logs/runs/mmbt_attention_2epoch/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=32,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps= 10000,
    save_steps=1500,
    eval_steps=10000,
    save_total_limit = 3
)

# set_seed(training_args.seed)



model = AutoModelWithTabular.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        config=config,
#         from_tf=True,
        cache_dir=model_args.cache_dir
    )

Specified tokenizer:  distilbert-base-german-cased


Some weights of the model checkpoint at ../MMBT/logs/model_name/saved_model_200seq_weights_2epochs/ were not used when initializing DistilBertWithTabular: ['tabular_combiner.weight_cat', 'tabular_combiner.weight_num', 'tabular_combiner.cat_layer.weight', 'tabular_combiner.cat_layer.bias', 'tabular_combiner.num_layer.weight', 'tabular_combiner.num_layer.bias', 'tabular_combiner.layer_norm.weight', 'tabular_combiner.layer_norm.bias']
- This IS expected if you are initializing DistilBertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
import torch

In [18]:
test_dataset.df.shape

(78177, 38)

In [19]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-german-cased")

def create_model_inputs(df, length, start=0):
    texts = [p for p in df.df.text[start:length]]
    model_inputs = tokenizer(texts, padding=True,truncation=True,max_length=200)
    for i in model_inputs.data.keys():
        model_inputs[i] = torch.LongTensor(model_inputs[i])
        
    numerical_feats = torch.from_numpy(df.numerical_feats[start:length]).type(torch.FloatTensor)
    categorical_feat = torch.tensor(df.cat_feats[start:length,:])
    labels = torch.tensor(df.labels[start:length])
    
    model_inputs['cat_feats'] = categorical_feat
    model_inputs['num_feats'] = numerical_feats
    model_inputs['labels'] = labels
    return model_inputs

In [20]:
# added extra
import torch
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
from torch.autograd import Variable
from tqdm import tqdm
n_inputs = test_dataset.df.shape[0]

prediction_batches = []
try:
    del loss
except:
    print("no loss")
try:
    del logits, logits_n
except:
    print("no logits")
try:
    del layer_outs
except:
    print("no layer_outs")
prefix = '../MMBT/Datasets/'
test_df = pd.read_csv(prefix + 'test_for_db.tsv', sep='\t')
batches_size = 50

for i in tqdm(range((n_inputs//batches_size)+1)):
    model_inputs = create_model_inputs(test_dataset, (i+1)*batches_size, i*batches_size )
    _, logits, _= model(
        model_inputs['input_ids'],
        labels=model_inputs['labels'],
        cat_feats=model_inputs['cat_feats'],
        numerical_feats=model_inputs['num_feats']
    )
    logits_n = Variable(logits, requires_grad=True)
    predictions = tf.nn.softmax(logits_n.detach().numpy())
    predictions = pd.DataFrame(data=np.concatenate([x.numpy() for x in predictions]).reshape(-1, 2),
                          columns=['y_0', 'y_1'],
                          index=test_dataset.df.index[i*batches_size:(i+1)*batches_size].to_numpy())

    predictions.to_csv('./prediction_weighted_feats_2epochs/predictions_'+str(i)+'.csv', sep =';')
    
print(type(prediction_batches[0]))

no loss
no logits
no layer_outs


100%|██████████| 1564/1564 [3:09:21<00:00,  7.26s/it]  


IndexError: list index out of range

In [None]:
predictions = pd.DataFrame(data=np.concatenate([x.numpy() for x in prediction_batches]).reshape(-1, 2),
                          columns=['y_0', 'y_1'],
                          index=test_dataset.df.index[:np.sum(prediction_batches)].to_numpy())

predictions.to_csv('./predictions_mmbt.csv', sep =';')