In [1]:
import pandas as pd
import torch

from tqdm import tqdm
from transformers import pipeline

In [None]:
# Choose the compute backend for inference, in order of preference:
# CUDA GPU, then Apple-silicon MPS, then plain CPU as the fallback.
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)

In [2]:
# Load the pre-processed splits from disk (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/processed/train.csv", "data/processed/test.csv")
)

# Baseline: zero-shot classification with RoBERTa

In [15]:
# Work on a reproducible 1% subsample of the test split (fixed seed 42),
# presumably to keep the zero-shot baseline quick to run.
#
# The sample is drawn from a fresh read of the CSV instead of from `test`
# itself: the previous `test = test.sample(...)` form was not idempotent, so
# re-running this cell took 1% of the already-sampled frame each time.
test = pd.read_csv("data/processed/test.csv").sample(frac=0.01, random_state=42)

In [16]:
# Zero-shot classification pipeline backed by the `roberta-large-mnli`
# checkpoint, placed on the `device` selected earlier in the notebook.
classifier = pipeline(
    task="zero-shot-classification",
    model="roberta-large-mnli",
    framework="pt",  # Using PyTorch to avoid conflicts with Keras
    device=device,
)

# Candidate labels handed to the classifier for every text.
labels = ["positive", "negative"]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


In [17]:
# Register tqdm's pandas integration so `progress_apply` is available.
tqdm.pandas()


def top_label(text):
    """Classify one text and return the first entry of the result's labels.

    The zero-shot pipeline is documented to return `labels` ordered by
    descending score, so the first entry is the predicted class.
    """
    prediction = classifier(text, candidate_labels=labels)
    return prediction["labels"][0]


# One classifier call per row of the `text` column, with a progress bar.
test["RoBERTa_base"] = test["text"].progress_apply(top_label)

100%|██████████| 4/4 [00:00<00:00, 12.65it/s]


In [19]:
# Export the row ids together with the baseline predictions.
# NOTE(review): `to_csv` is called without `index=False`, so the DataFrame
# index is written as an extra leading column — confirm this is intended.
predictions = test[["id", "RoBERTa_base"]]
predictions.to_csv("output/RoBERTa_base.csv")