In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ModuleNotFoundError: No module named 'torch'

In [3]:
class Dataset:
    def __init__(self, text, tokenizer, max_len):
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        inputs = self.tokenizer(
            text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }

In [5]:
def generate_predictions(model_path, max_len):
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to("cuda")
    model.eval()
    
    df = pd.read_csv("../input/learn-ai-bbc/BBC News Test.csv")
    
    dataset = Dataset(text=df.Text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=32, num_workers=4, pin_memory=True, shuffle=False
    )

    final_output = []

    for b_idx, data in enumerate(data_loader):
        with torch.no_grad():
            for key, value in data.items():
                data[key] = value.to("cuda")
            output = model(**data)
            output = output.logits.detach().cpu().numpy()
            final_output.extend(output)
    
    torch.cuda.empty_cache()
    return np.vstack(final_output)

In [6]:
preds = generate_predictions("../input/autonlp-bbc-news/", max_len=128)

NameError: name 'AutoModelForSequenceClassification' is not defined

In [7]:
id_category_mapping = {
  0: "business",
  1: "entertainment",
  2: "politics",
  3: "sport",
  4: "tech"
}

In [8]:
sub = pd.read_csv("../input/learn-ai-bbc/BBC News Sample Solution.csv")
sub["Category"] = np.argmax(preds, axis=1)
sub["Category"] = sub["Category"].map(id_category_mapping)
sub.to_csv("submission.csv", index=False)

NameError: name 'pd' is not defined

In [10]:
sub.head()

NameError: name 'sub' is not defined

In [9]:
id_category_mapping

{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}