#### Method1 - AutoNLP based tagging
##### Author : Kunal Kalwankar

In [None]:
# The code below is inspired by implementations of Hugging Face models on Kaggle.

# Import 
import torch
import pandas as pd
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Import dataset
# Load the CNN/DailyMail training CSV from the Kaggle input directory into a DataFrame.
# NOTE(review): `data` is not referenced by the other cells visible in this file;
# the prediction function and the submission cell read the same CSV themselves.
data = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')

In [None]:
# Dataset class that tokenizes each text into fixed-length model inputs
class Dataset:
    """Indexable collection that tokenizes one text per item into tensors.

    The object implements ``__len__`` and ``__getitem__`` so it can be handed
    to ``torch.utils.data.DataLoader``.

    Args:
        text: Indexable sequence of texts; each item is passed through ``str()``
            before tokenization.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``
            that returns a mapping with at least ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, text, tokenizer, max_len):
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at position ``item``.

        Returns:
            A dict of ``torch.long`` tensors keyed ``"input_ids"`` and
            ``"attention_mask"``, plus ``"token_type_ids"`` when the tokenizer
            provides it.
        """
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        features = {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(
                encoded["attention_mask"], dtype=torch.long
            ),
        }
        # Not every tokenizer emits segment ids; indexing the key
        # unconditionally would raise KeyError for those that do not.
        if "token_type_ids" in encoded:
            features["token_type_ids"] = torch.tensor(
                encoded["token_type_ids"], dtype=torch.long
            )
        return features

In [None]:
# Function to generate predictions
def generate_predictions(
    model_path,
    max_len,
    data_path='/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv',
):
    """Classify every article in a CSV and return the raw logits.

    Loads the sequence-classification model and tokenizer from ``model_path``,
    tokenizes the ``article`` column of the CSV at ``data_path`` and runs the
    model over it in batches of 32 without gradient tracking.

    Args:
        model_path: Identifier or path passed to ``from_pretrained`` for both
            the model and the tokenizer.
        max_len: Token length every article is padded/truncated to.
        data_path: CSV file containing an ``article`` column. Defaults to the
            CNN/DailyMail training split in the Kaggle input directory.

    Returns:
        A 2-D NumPy array with one row of logits per article, in file order
        (the loader does not shuffle). Empty input yields a ``(0, num_labels)``
        array.
    """
    # Fall back to CPU so the function also works on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_gpu = device.type == "cuda"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.to(device)
    model.eval()

    df = pd.read_csv(data_path)

    dataset = Dataset(text=df.article.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        num_workers=4,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=on_gpu,
        shuffle=False,
    )

    batch_logits = []
    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits
            batch_logits.append(logits.cpu().numpy())

    if on_gpu:
        torch.cuda.empty_cache()

    if not batch_logits:
        # np.vstack rejects an empty list; keep the 2-D shape callers expect.
        return np.empty((0, model.config.num_labels), dtype=np.float32)
    return np.vstack(batch_logits)
# Run inference with the AutoNLP model named below; each article is
# padded/truncated to 128 tokens. `preds` holds the stacked logits returned
# by generate_predictions.
preds = generate_predictions("abhishek/autonlp-bbc-news-classification-37229289", max_len=128)

In [None]:
# Map each predicted class index to its category name.
# The position of a name in the tuple is its class index.
id_category_mapping = dict(
    enumerate(("business", "entertainment", "politics", "sport", "tech"))
)

In [None]:
# Attach the predicted category name to every row and write the result file.
sub = (
    pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")
    # Highest-scoring class index per row of logits.
    .assign(Category=np.argmax(preds, axis=1))
    # Translate class indices into category names.
    .assign(Category=lambda frame: frame["Category"].map(id_category_mapping))
)
sub.to_csv("submission.csv", index=False)