In [37]:
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from scipy.special import softmax
from os.path import join
from torch import cuda

import numpy as np
import torch as th
import pandas as pd
import sys
import emoji_resources as er

In [41]:
try:
    lang = sys.argv[1]
    if lang == "-f":
        print("running in notebook, setting testing defaults")
        lang = "hi"
        emoji = "🤌"
        test = True
    elif lang in er.languages:
        try:
            emoji = int(sys.argv[2])
        except IndexError:
            print("no emoji supplied!")
        try:
            test = int(sys.argv[3])
            if test == "test":
                test = True
        except IndexError:
            test = False
    else:
        print(f"unknown language")
        
except IndexError:
    print("no language supplied!")

running in notebook, setting testing defaults


In [46]:
class InferenceDataset(Dataset):

    def __init__(self, data, tokenizer, max_token_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row.text
        encoding = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True)

        return dict(input_ids=th.tensor(encoding["input_ids"], dtype=th.long),
                    attention_mask=th.tensor(encoding["attention_mask"], dtype=th.long),
                    token_type_ids=th.tensor(encoding["token_type_ids"], dtype=th.long))

In [48]:
checkpoint = "models/twitter-xlm-roberta-base-sentiment" 
model = AutoModelForSequenceClassification\
        .from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#config = AutoConfig.from_pretrained(checkpoint)

loading configuration file models/twitter-xlm-roberta-base-sentiment/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/xlm-twitter/local-twitter-xlm-roberta-base-sentiment/",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Negative",
    "1": "Neutral",
    "2": "Positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Negative": 0,
    "Neutral": 1,
    "Positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": 

In [49]:
src = "../data/tweets"
file = f"tweets_language-{lang}_emoji-{emoji}_2019-01-01-to-2021-11-28.parquet.gzip"

if test:
    df = pd.read_parquet(join(src, file))
    df = df[0:10]
else:
    df = pd.read_parquet(join(src, file))

df = df.dropna(subset=['text'])

if test:
    batch_size = 10
else:
    batch_size = 4096

inference_set = InferenceDataset(df, tokenizer, max_token_len=128)
inference_params = {'batch_size': batch_size, 'shuffle': False}
inference_loader = DataLoader(inference_set, **inference_params)

In [53]:
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch"
)

trainer = Trainer(
        model,
        training_args,
        tokenizer = tokenizer,
)

device = 'cuda' if cuda.is_available() else 'cpu'
print(f"running on device: {device}")

raw_pred, _, _ = trainer.prediction_loop(inference_loader, description="prediction")
scores = softmax(raw_pred)

df['negative'] = scores[0:, 0]
df['neutral'] = scores[0:, 1]
df['positive'] = scores[0:, 2]


if test:
    print(df[['negative', 'neutral', 'positive',]])
else:
    dst = "../data/sentiment"
    resname = f"sentiment_language-{lang}_emoji-{emoji}_2019-01-01-to-2021-11-28.csv.gzip"
    df[["id", "negative", "neutral", "positive"]]\
        .to_csv(join(dst, resname), index=False, compression="gzip")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running prediction *****
  Num examples = 10
  Batch size = 10


running on device: cpu


   negative   neutral  positive
0  0.032637  0.028273  0.036788
1  0.039159  0.029295  0.027809
2  0.028249  0.039377  0.030254
3  0.030703  0.031591  0.034855
4  0.033584  0.030449  0.031835
5  0.062768  0.034475  0.016023
6  0.041121  0.031654  0.023742
7  0.065914  0.026992  0.015719
8  0.036633  0.029535  0.028600
9  0.050708  0.029019  0.022240
