In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/sample_submission.csv
/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/train.csv
/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/test.csv
/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/cats


In [2]:
# Load test.csv: these are the rows inference is run on, and predictions are attached to this frame later.
infer = pd.read_csv("/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/test.csv")

In [3]:
import torch

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
device

device(type='cuda')

In [6]:
import pickle

In [7]:
# Load the pickled list of category names; predicted class indices are looked up in it later.
# NOTE(review): pickle.load can execute arbitrary code — only unpickle files from a trusted source.
with open("/kaggle/input/fibe-hack-to-vibe-2-0-iab-dataset/cats","rb") as fp:
    cats = pickle.load(fp)

In [8]:
cats

['academic interests',
 'books and literature',
 'healthy living',
 'careers',
 'news and politics',
 'shopping',
 'style and fashion',
 'family and relationships',
 'business and finance',
 'automotives',
 'pharmaceuticals, conditions, and symptoms',
 'arts and culture',
 'sports',
 'pets',
 'hobbies and interests',
 'real estate',
 'food and drinks',
 'home and garden',
 'video gaming',
 'movies',
 'travel',
 'personal finance',
 'technology and computing',
 'music and audio',
 'television',
 'health']

In [9]:
infer_text = infer['text']

In [10]:
# Hugging Face Hub identifiers.
# NOTE(review): `model_name` is not the repo id passed to
# DistilBertForSequenceClassification.from_pretrained further down — confirm which checkpoint is intended.
model_name = "LakshitKava/Fibe_IAB_DB_v2"
tokenizer_name = "distilbert-base-uncased"

In [11]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [12]:
# Fetch the sequence-classification checkpoint (config + weights) by its Hub repo id.
model = DistilBertForSequenceClassification.from_pretrained("LakshitKava/FibeVibeToHack2.0-IAB-DistillBert")

config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [13]:
# Load the stock DistilBERT tokenizer. `num_labels` is a model-config option,
# not a tokenizer option, so it is not passed here.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [14]:
model.device

device(type='cpu')

In [16]:
# nn.Module.to() moves the parameters in place and returns the same module,
# so `model_gpu` and `model` refer to one object.
model_gpu = model.to(device)

In [18]:
model_gpu.device

device(type='cuda', index=0)

In [20]:
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Args:
        texts: Iterable of strings (list, pandas Series, ...). It is copied
            into a list so items are always looked up by position, whatever
            index a pandas Series carries.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention; it must return a mapping of tensors that have a
            leading batch dimension of size 1.
        max_length: Every item is truncated/padded to this many tokens.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        # Indexing a Series with [] is label-based: after filtering, sampling
        # or sorting, `texts[idx]` would raise KeyError or return another row.
        # A plain list makes `idx` strictly positional, as DataLoader expects.
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx`` and return a dict of 1-D tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The tokenizer returns tensors shaped (1, max_length); drop the batch
        # dimension so the DataLoader can stack items into (batch, max_length).
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        return item

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

In [21]:
infer_dataset = NewsDataset(infer_text, tokenizer)

In [22]:
# shuffle=False keeps batches in dataset order, so the collected predictions line up with the rows of `infer`.
infer_loader = torch.utils.data.DataLoader(infer_dataset, batch_size=32, shuffle=False)


In [23]:
from tqdm import tqdm


In [26]:
# Batched inference: collect the arg-max class index of every row, in loader order.
model_gpu.eval()  # evaluation mode on the module that is actually called below
predictions = []
with torch.no_grad():  # no autograd bookkeeping is needed for inference
    for batch in tqdm(infer_loader, desc="Inference Progress"):
        # Send the batch to the device selected earlier rather than a
        # hard-coded 'cuda', so the cell also runs on CPU-only sessions.
        batch_on_device = {key: val.to(device) for key, val in batch.items()}

        # Forward pass; only the logits are needed.
        logits = model_gpu(**batch_on_device).logits

        # Highest-scoring class per row; move to CPU before storing.
        batch_predictions = torch.argmax(logits, dim=1)
        predictions.extend(batch_predictions.cpu().numpy())


Inference Progress: 100%|██████████| 5450/5450 [51:51<00:00,  1.75it/s]


In [27]:
len(predictions)

174382

In [28]:
predictions_series = pd.Series(predictions, name="predictions")

In [29]:
# Column assignment aligns on index; this presumes both `infer` and `predictions_series`
# carry the default 0..n-1 index so row i receives prediction i — verify if `infer` is ever filtered or re-indexed.
infer["predictions"] = predictions_series

In [30]:
infer.head(5)

Unnamed: 0.1,Unnamed: 0,text,Word Count,Index,predictions
0,0,"equl offers enzyme assay kits, reagent mixture...",353,Article_0,0
1,1,gauthmath: instant math questions solver for f...,112,Article_1,0
2,2,Whats the No. 1 cause of blindness in older ad...,340,Article_2,25
3,3,Surfers will ride a wave in the Amazon this we...,465,Article_3,0
4,4,"Why is the top of a leaf the most colorful, so...",269,Article_4,0


In [32]:
# Translate each predicted class index into its category name from `cats`.
infer['target'] = infer['predictions'].map(cats.__getitem__)

In [33]:
infer.head(5)

Unnamed: 0.1,Unnamed: 0,text,Word Count,Index,predictions,target
0,0,"equl offers enzyme assay kits, reagent mixture...",353,Article_0,0,academic interests
1,1,gauthmath: instant math questions solver for f...,112,Article_1,0,academic interests
2,2,Whats the No. 1 cause of blindness in older ad...,340,Article_2,25,health
3,3,Surfers will ride a wave in the Amazon this we...,465,Article_3,0,academic interests
4,4,"Why is the top of a leaf the most colorful, so...",269,Article_4,0,academic interests


In [34]:
# Write only the article identifier and the predicted category name, without the DataFrame row index.
infer.to_csv("distillbert.csv", columns=["Index","target"], index=False)