# Multimodal German DistilBERT

In [None]:
!pip install torch
!pip install tensorboard
!pip install keras tensorflow
!pip install transformers == 3.1.0
!pip install multimodal-transformers
!pip install --no-cache-dir tensorflow-gpu

In [1]:
import torch
import pandas as pd
# Set the pandas display options for max rows, max columns and width to None
# (i.e. no fixed limit configured) so DataFrames shown in the notebook are not cut off.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
from multimodal_transformers.data import load_data
from transformers import AutoTokenizer, AutoConfig
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig

In [2]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

# Emit INFO-level log records from all libraries (this is what produces the INFO: lines in the cell outputs).
logging.basicConfig(level=logging.INFO)
# NOTE(review): presumably turns off the Comet.ml experiment-tracking integration — confirm against the
# installed transformers version, since this is set after `transformers` names were already imported above.
os.environ['COMET_MODE'] = 'DISABLED'
import transformers

In [3]:
# Free Python garbage and cached CUDA memory, then choose the torch device.
import gc
# NOTE(review): the result of this call is discarded (it is not the cell's last expression,
# so the notebook does not display it either); the check is repeated when `device` is built.
torch.cuda.is_available()
gc.collect()
torch.cuda.empty_cache()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Read the two tab-separated source files and stack them into a single DataFrame `df`.
df_given = pd.read_csv('df_given_labels_200k.tsv',
                    sep='\t')
df_predicted  = pd.read_csv('test_overwritten_labels_600k.tsv',
                    sep='\t')
# Keep only the text, tabular feature and label columns from the second file.
df_predicted = df_predicted[["text","income_group","employment_group","marital_status","nationality","age_decade",
                            "avg_res_realtor","feedback_realtor"]]
# Drop rows whose text is the literal string "None".
df_predicted= df_predicted[df_predicted['text'] != "None"]
# Row-wise concatenation; the original row indices of both frames are kept (no ignore_index).
# NOTE(review): `df_given` is not restricted to the same column subset — confirm its columns match.
df = pd.concat([df_given, df_predicted], axis=0)

In [None]:
# Replace the 'feedback_realtor' label column with integer codes produced by sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
# Fit the encoder on the label column and overwrite that same column with the encoded values
# (no separate column is created).
df['feedback_realtor'] = labelencoder.fit_transform(df['feedback_realtor'])
# Show the first three rows as the cell output.
df.head(3)

In [None]:
# Convert the label column and every categorical feature column to pandas Categorical,
# in the same order as before.
for categorical_column in (
    'feedback_realtor',
    'nationality',
    'marital_status',
    'income_group',
    'employment_group',
    'age_decade',
):
    df[categorical_column] = pd.Categorical(df[categorical_column])

In [None]:
# Shuffle the full DataFrame and write an 80/10/10 train/val/test split to ./Datasets/.
import os

# A fixed random_state makes the shuffle (and therefore the split) reproducible across runs.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled_df))
val_end = int(.9 * len(shuffled_df))
# Positional slicing yields the same three partitions as np.split(df, [train_end, val_end])
# without routing a DataFrame through numpy's array-splitting path.
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]
print('Num examples train-val-test')
print(len(train_df), len(val_df), len(test_df))
# Create the output folder if needed so the writes below do not fail on a fresh checkout.
os.makedirs('./Datasets', exist_ok=True)
train_df.to_csv('./Datasets/train.csv')
val_df.to_csv('./Datasets/val.csv')
test_df.to_csv('./Datasets/test.csv')

In [4]:
@dataclass
class ModelArguments:
    """
    Selects the pretrained model, and optionally a separate config,
    tokenizer and download cache directory, used for fine-tuning.
    """

    # Required: there is no default for the model identifier/path.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models",
        },
    )
    config_name: Optional[str] = field(
        metadata={
            "help": "Pretrained config name or path if not the same as model_name",
        },
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name",
        },
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from s3",
        },
        default=None,
    )

In [5]:
@dataclass
class MultimodalDataTrainingArguments:
    """
    Arguments pertaining to how we combine tabular features
    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    The column layout must be supplied either directly through ``column_info``
    or as a JSON file through ``column_info_path``; construction fails with
    ``AssertionError`` when the two compare equal (e.g. both left as ``None``).
    """

    data_path: str = field(metadata={
        'help': 'the path to the csv file containing the dataset'
    })
    # Optional: only read when `column_info` itself is not supplied.
    column_info_path: Optional[str] = field(
        default=None,
        metadata={
            'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
        })

    # Defaults to None and is then filled from `column_info_path` in __post_init__.
    column_info: dict = field(
        default=None,
        metadata={
            'help': 'a dict referencing the text, categorical, numerical, and label columns'
                    'its keys are text_cols, num_cols, cat_cols, and label_col'
        })

    categorical_encode_type: str = field(
        default='ohe',
        metadata={
            'help': 'sklearn encoder to use for categorical data',
            'choices': ['ohe', 'binary', 'label', 'none']
        })
    numerical_transformer_method: str = field(
        default='yeo_johnson',
        metadata={
            'help': 'sklearn numerical transformer to preprocess numerical data',
            'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
        })
    task: str = field(
        default="classification",
        metadata={
            "help": "The downstream training task",
            "choices": ["classification", "regression"]
        })

    mlp_division: int = field(
        default=4,
        metadata={
            'help': 'the ratio of the number of '
                    'hidden dims in a current layer to the next MLP layer'
        })
    combine_feat_method: str = field(
        default='individual_mlps_on_cat_and_numerical_feats_then_concat',
        metadata={
            'help': 'method to combine categorical and numerical features, '
                    'see README for all the method'
        })
    mlp_dropout: float = field(
        default=0.1,
        metadata={
            'help': 'dropout ratio used for MLP layers'
        })
    numerical_bn: bool = field(
        default=True,
        metadata={
            'help': 'whether to use batchnorm on numerical features'
        })
    # Boolean flag (was annotated `str`, which made any non-empty command-line
    # value, including "False", truthy).
    use_simple_classifier: bool = field(
        default=True,
        metadata={
            'help': 'whether to use single layer or MLP as final classifier'
        })
    mlp_act: str = field(
        default='relu',
        metadata={
            'help': 'the activation function to use for finetuning layers',
            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
        })
    gating_beta: float = field(
        default=0.2,
        metadata={
            'help': "the beta hyperparameters used for gating tabular data "
                    "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
        })

    def __post_init__(self):
        """Validate the column spec and load it from JSON when only a path was given.

        Raises:
            AssertionError: if ``column_info`` and ``column_info_path`` compare
                equal, which in practice means neither was provided.
        """
        # Raised explicitly instead of via `assert` so the check is not stripped
        # under `python -O`; the exception type is kept for existing callers.
        if self.column_info == self.column_info_path:
            raise AssertionError(
                'provide either `column_info` or `column_info_path`'
            )
        if self.column_info is None and self.column_info_path:
            with open(self.column_info_path, 'r', encoding='utf-8') as f:
                self.column_info = json.load(f)

In [6]:
# Column roles used when building the multimodal dataset.
text_cols = [
    'text',
]
cat_cols = [
    'nationality',
    'marital_status',
    'income_group',
    'employment_group',
    'age_decade',
]
num_cols = [
    'avg_res_realtor',
]

In [7]:
# Column roles and label definition handed to the data loader.
column_info_dict = {
    'text_cols': text_cols,
    'cat_cols': cat_cols,
    'num_cols' : num_cols,
    'label_col': 'feedback_realtor',
    'label_list': ['0','1']
}

# Data/tabular-combination settings. Built once: the original cell constructed
# the same object twice with identical arguments.
data_args = MultimodalDataTrainingArguments(
    data_path='./Datasets/',
    combine_feat_method='weighted_feature_sum_on_transformer_cat_and_numerical_feats',
    column_info=column_info_dict,
    # Recorded to match the 'quantile_normal' value that is passed to
    # load_data_from_folder when the datasets are built.
    numerical_transformer_method='quantile_normal',
    task='classification'
)

model_args = ModelArguments(
    model_name_or_path='distilbert-base-german-cased'
)

# NOTE(review): eval_steps (104235) is larger than the 19546 total steps reported by the
# training output, so periodic evaluation presumably never triggers — confirm and lower if intended.
training_args = TrainingArguments(
    output_dir="./logs/model_name/mmbt_2epoch_64/",
    logging_dir="./logs/runs/mmbt_2epoch_64/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=64,
    num_train_epochs=2,
    evaluate_during_training=True,
    logging_steps= 10000,
    save_steps=15000,
    eval_steps=104235,
    save_total_limit = 4
)

# Seed the RNGs with the seed carried by the training arguments.
set_seed(training_args.seed)

In [8]:
# Prefer an explicitly configured tokenizer; otherwise reuse the model identifier.
tokenizer_path_or_name = model_args.tokenizer_name or model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path_or_name, cache_dir=model_args.cache_dir)

Specified tokenizer:  distilbert-base-german-cased


INFO:filelock:Lock 140092409603520 acquired on /home/ec2-user/.cache/torch/transformers/91dc05ca5527bfe4665fbe48c61a27f70a2f95564557a0553110430301b6a4e3.9e9be6c7ed31b75b3c39946d76205fe34192bbd3b7a97e205ebf28fa13825de2.lock


Downloading:   0%|          | 0.00/464 [00:00<?, ?B/s]

INFO:filelock:Lock 140092409603520 released on /home/ec2-user/.cache/torch/transformers/91dc05ca5527bfe4665fbe48c61a27f70a2f95564557a0553110430301b6a4e3.9e9be6c7ed31b75b3c39946d76205fe34192bbd3b7a97e205ebf28fa13825de2.lock
INFO:filelock:Lock 140095794614224 acquired on /home/ec2-user/.cache/torch/transformers/221b1368d82380622e22df082e9bfdafece5b7d5c42a0cd73b57d89247d2f28d.bac90776f6fa34759f05c7387e9124c4d626300e981b5786b82f674e08f99d72.lock


Downloading:   0%|          | 0.00/240k [00:00<?, ?B/s]

INFO:filelock:Lock 140095794614224 released on /home/ec2-user/.cache/torch/transformers/221b1368d82380622e22df082e9bfdafece5b7d5c42a0cd73b57d89247d2f28d.bac90776f6fa34759f05c7387e9124c4d626300e981b5786b82f674e08f99d72.lock


In [9]:
# Get Datasets
# Build the train/val/test datasets from the folder at `data_args.data_path`, passing the tokenizer
# together with the text, label, categorical and numerical column names from `data_args.column_info`.
# NOTE(review): the numerical transformer is hard-coded to 'quantile_normal' here instead of being read
# from `data_args.numerical_transformer_method` (class default 'yeo_johnson') — confirm which is intended.
# NOTE(review): `data_args.categorical_encode_type` is not passed either — TODO confirm the library default matches.
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info['text_cols'],
    tokenizer,
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    categorical_cols=data_args.column_info['cat_cols'],
    numerical_cols = data_args.column_info['num_cols'],
    numerical_transformer_method = 'quantile_normal',
    max_token_length = 200
)

INFO:multimodal_transformers.data.data_utils:1 numerical columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
INFO:multimodal_transformers.data.data_utils:35 categorical columns
INFO:multimodal_transformers.data.data_utils:1 numerical columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
INFO:multimodal_transformers.data.load_data:Text columns: ['text']
INFO:multimodal_transformers.data.load_data:Raw text example: Sehr geehrter Herr Römer , Mein derzeitiger Mitbewohner m 24 und ich m 23 wohnen zur Zeit in Düsseldo

In [10]:
# Derive the class count from the configured label list (['0', '1'] -> 2) so the
# two cannot drift apart if the label set changes.
num_labels = len(data_args.column_info['label_list'])

In [11]:
# Load the base transformer config, preferring an explicit config name over the model identifier.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
# Describe the tabular side: feature widths come from the training dataset's feature arrays,
# and every remaining setting is forwarded from `data_args`.
tabular_config = TabularConfig(
    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
    cat_feat_dim=train_dataset.cat_feats.shape[1],
    num_labels=num_labels,
    **vars(data_args),
)
config.tabular_config = tabular_config

In [12]:
# Load the pretrained weights and wrap them with the tabular-combining head described by `config`.
# The weights always come from `model_name_or_path`: the original fell back through `config_name`,
# which would point the weight loader at the config location whenever a separate config was set.
model = AutoModelWithTabular.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir
    )

INFO:filelock:Lock 140092292705472 acquired on /home/ec2-user/.cache/torch/transformers/5d61179239b4209e055881d93e51e5a69f1901f76f964d7bc92c9a62a7fc5026.59e237bd040884abec825b1267b5b13223e1400bfcef5a4a3fee71c4fc21a989.lock


Downloading:   0%|          | 0.00/270M [00:00<?, ?B/s]

INFO:filelock:Lock 140092292705472 released on /home/ec2-user/.cache/torch/transformers/5d61179239b4209e055881d93e51e5a69f1901f76f964d7bc92c9a62a7fc5026.59e237bd040884abec825b1267b5b13223e1400bfcef5a4a3fee71c4fc21a989.lock
Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertWithTabular: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilB

In [13]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)

In [14]:
def calc_classification_metrics(p: EvalPrediction):
    """Compute evaluation metrics from a Trainer ``EvalPrediction``.

    Args:
        p: object exposing ``predictions`` (per-class scores, argmax-ed along
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict: when exactly two distinct labels are present, ROC-AUC, PR-AUC,
        the F1-maximising threshold with its precision/recall/F1, and the
        confusion-matrix counts (tn, fp, fn, tp) of the argmax predictions.
        Otherwise accuracy, F1, their mean, and the Matthews correlation
        coefficient.
    """
    pred_labels = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    if len(np.unique(labels)) == 2:  # binary classification
        # Probability assigned to class index 1; only needed for the binary metrics.
        pred_scores = softmax(p.predictions, axis=1)[:, 1]
        roc_auc_pred_score = roc_auc_score(labels, pred_scores)
        precisions, recalls, thresholds = precision_recall_curve(labels,
                                                                 pred_scores)
        # F1 per threshold; where precision + recall == 0 the score is defined
        # as 0 directly instead of producing NaN (and a warning) and patching it.
        pr_sum = precisions + recalls
        fscore = np.divide(2 * precisions * recalls, pr_sum,
                           out=np.zeros_like(pr_sum, dtype=float),
                           where=pr_sum > 0)
        ix = np.argmax(fscore)
        threshold = thresholds[ix].item()
        pr_auc = auc(recalls, precisions)
        tn, fp, fn, tp = confusion_matrix(labels, pred_labels, labels=[0, 1]).ravel()
        result = {'roc_auc': roc_auc_pred_score,
                  'threshold': threshold,
                  'pr_auc': pr_auc,
                  'recall': recalls[ix].item(),
                  'precision': precisions[ix].item(), 'f1': fscore[ix].item(),
                  'tn': tn.item(), 'fp': fp.item(), 'fn': fn.item(), 'tp': tp.item()
                  }
    else:
        acc = (pred_labels == labels).mean()
        # f1_score defaults to average='binary', which raises on multiclass
        # targets; switch to macro averaging when there are more than two
        # score columns. NOTE(review): macro is an assumption — confirm the
        # averaging wanted for multiclass runs.
        f1_average = 'binary' if np.shape(p.predictions)[1] <= 2 else 'macro'
        f1 = f1_score(y_true=labels, y_pred=pred_labels, average=f1_average)
        result = {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
            "mcc": matthews_corrcoef(labels, pred_labels)
        }

    return result

In [15]:
# Wire the multimodal model, training arguments, train/validation datasets and the
# metrics callback into a Hugging Face Trainer.
trainer_distilbert = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_classification_metrics
)

In [None]:
%%time
# Run fine-tuning; the %%time cell magic reports CPU and wall-clock time for the whole cell.
trainer_distilbert.train()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9773 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9773 [00:00<?, ?it/s]

{'loss': 0.61130146484375, 'learning_rate': 2.4419318530645658e-05, 'epoch': 1.0232272587741738, 'step': 10000}




CPU times: user 7h 37min 40s, sys: 2h 17min 5s, total: 9h 54min 46s
Wall time: 9h 54min 59s


TrainOutput(global_step=19546, training_loss=0.593411355744142)

In [17]:
from transformers.configuration_utils import PretrainedConfig

In [18]:
# Persist the fine-tuned model to disk.
# NOTE(review): the tokenizer is not saved alongside it in this notebook — confirm whether it should be.
model.save_pretrained('./logs/model_name/saved_model_2epochs_64batches/')