In [1]:
!pip install pandas numpy transformers torch scikit-learn kaggle

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import json
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
import ast

In [3]:
with open('arxivData.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)

In [4]:
df = pd.DataFrame(data)

def parse_stringified_list(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else x
    except:
        return []

for col in ['author', 'tag', 'link']:
    df[col] = df[col].apply(parse_stringified_list)

def get_primary_category(tags):
    if tags and isinstance(tags, list):
        return tags[0]['term'].split('.')[0]
    return None

df['primary_category'] = df['tag'].apply(get_primary_category)

df = df.rename(columns={
    'summary': 'abstract',
    'title': 'title',
})

final_df = df[['title', 'abstract', 'primary_category']]

In [5]:
from sklearn.preprocessing import LabelEncoder

CATEGORY_MAPPING = {
    'cs': 'Computer Science',
    'stat': 'Statistics',
    'astro-ph': 'Astrophysics',
    'q-bio': 'Quantitative Biology',
    'eess': 'Electrical Engineering',
    'cond-mat': 'Condensed Matter',
    'math': 'Mathematics',
    'physics': 'Physics',
    'quant-ph': 'Quantum Physics',
    'q-fin': 'Quantitative Finance',
    'gr-qc': 'General Relativity',
    'nlin': 'Nonlinear Sciences',
    'cmp-lg': 'Computational Linguistics',
    'econ': 'Economics',
    'hep-ex': 'High Energy Physics - Experiment',
    'hep-th': 'High Energy Physics - Theory',
    'nucl-th': 'Nuclear Theory',
    'hep-ph': 'High Energy Physics - Phenomenology',
    'hep-lat': 'High Energy Physics - Lattice',
    'adap-org': 'Adaptation and Self-Organizing Systems'
}

def map_category_names(df, category_col='primary_category'):
    """Преобразует коды категорий в читаемые названия"""
    df['category_name'] = df[category_col].map(CATEGORY_MAPPING)
    return df

def encode_categories(df, category_col='primary_category'):
    """Кодирует категории в числовые метки"""
    le = LabelEncoder()
    df['category_encoded'] = le.fit_transform(df[category_col])

    import pickle
    pickle.dump(le, open('label_encoder.pkl', 'wb'))

    return df, le

final_df = map_category_names(final_df)
final_df, label_encoder = encode_categories(final_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_name'] = df[category_col].map(CATEGORY_MAPPING)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category_encoded'] = le.fit_transform(df[category_col])


In [6]:
def clean_text(text):
    text = re.sub(r'\$.*?\$', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower().strip()
    return text

final_df['title'] = final_df['title'].apply(clean_text)
final_df['abstract'] = final_df['abstract'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['title'] = final_df['title'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['abstract'] = final_df['abstract'].apply(clean_text)


In [7]:
no_abstract_df = final_df.copy()
no_abstract_df['abstract'] = ''
extended_df = pd.concat([final_df, no_abstract_df])

extended_df.head()

Unnamed: 0,title,abstract,primary_category,category_name,category_encoded
0,dual recurrent attention units for visual ques...,we propose an architecture for vqa which utili...,cs,Computer Science,4
1,sequential shorttext classification with recur...,recent approaches based on artificial neural n...,cs,Computer Science,4
2,multiresolution recurrent neural networks an a...,we introduce the multiresolution recurrent neu...,cs,Computer Science,4
3,learning what to share between loosely related...,multitask learning is motivated by the observa...,stat,Statistics,19
4,a deep reinforcement learning chatbot,we present milabot a deep reinforcement learni...,cs,Computer Science,4


In [8]:
train_df, temp_df = train_test_split(extended_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 65600, Val: 8200, Test: 8200


In [9]:
from torch.utils.data import Dataset

class ArxivDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        text = row['title']
        if pd.notna(row['abstract']) and row['abstract'].strip():
            text += " [SEP] " + row['abstract']

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        inputs = {k: v.squeeze(0) for k, v in inputs.items()}

        label = torch.tensor(row['category_encoded'])

        return inputs, label

In [10]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(text, max_length=512):
    return tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
from torch.utils.data import DataLoader

train_dataset = ArxivDataset(train_df, tokenizer)
val_dataset = ArxivDataset(val_df, tokenizer)
test_dataset = ArxivDataset(test_df, tokenizer)

batch_size = 16

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)

In [12]:
from transformers import AutoModelForSequenceClassification
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(train_df['category_encoded'].unique())
).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

epochs = 1
total_steps = len(train_loader) * epochs

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [14]:
def compute_accuracy(preds, labels):
    return (preds.argmax(1) == labels).float().mean().item()

In [15]:
from tqdm import tqdm

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        inputs = {k: v.to(device) for k, v in batch[0].items()}
        labels = batch[1].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)

    model.eval()
    val_loss, val_acc = 0, 0

    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch[0].items()}
            labels = batch[1].to(device)

            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()

            logits = outputs.logits
            val_acc += compute_accuracy(logits, labels)

    avg_val_loss = val_loss / len(val_loader)
    avg_val_acc = val_acc / len(val_loader)

    print(f"Epoch {epoch + 1}:")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    print(f"Val Accuracy: {avg_val_acc * 100:.2f}%")

Epoch 1: 100%|██████████| 4100/4100 [48:46<00:00,  1.40it/s]


Epoch 1:
Train Loss: 0.4782 | Val Loss: 0.4208
Val Accuracy: 85.82%


In [17]:
import torch
import numpy as np

# Переводим модель в режим оценки
model.eval()

total_loss = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Перенос данных на устройство (GPU/CPU)
        inputs = {k: v.to(device) for k, v in batch[0].items()}
        labels = batch[1].to(device)

        # Прямой проход
        outputs = model(**inputs, labels=labels)

        # Считаем лосс
        total_loss += outputs.loss.item()

        # Получаем предсказания
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Вычисляем метрики
avg_loss = total_loss / len(test_loader)
accuracy = np.mean(np.array(all_preds) == np.array(all_labels))

print(f'Test Loss: {avg_loss:.4f}')
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Loss: 0.4154
Test Accuracy: 86.05%


In [18]:
from google.colab import files
import shutil

# Создаем временную папку
!mkdir /content/arxiv_classifier

# Сохраняем компоненты
model.save_pretrained('/content/arxiv_classifier')
tokenizer.save_pretrained('/content/arxiv_classifier')

# Создаем zip-архив
shutil.make_archive('arxiv_classifier', 'zip', '/content/arxiv_classifier')

# Скачиваем
files.download('arxiv_classifier.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pickle

# Определим устройство (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Загружаем модель
my_model = AutoModelForSequenceClassification.from_pretrained('/content/arxiv_classifier').to(device)

# Загружаем токенизатор
my_tokenizer = AutoTokenizer.from_pretrained('/content/arxiv_classifier')

# Загружаем LabelEncoder
with open('label_encoder.pkl', 'rb') as f:
    my_label_encoder = pickle.load(f)

print("Модель успешно загружена!")

Модель успешно загружена!


In [33]:
import torch
import numpy as np
from typing import List, Dict, Optional

def predict_with_confidence(
    model,
    tokenizer,
    label_encoder,
    title: str,
    abstract: Optional[str] = None,
    max_length: int = 512,
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
) -> List[Dict[str, float]]:
    """
    Предсказывает категории статьи с накоплением вероятностей до 95%

    Args:
        model: обученная модель
        tokenizer: токенизатор
        label_encoder: кодировщик меток
        title: заголовок статьи (обязательно)
        abstract: аннотация статьи (опционально)
        max_length: максимальная длина текста
        device: устройство для вычислений

    Returns:
        Список словарей {'category': 'название', 'probability': вероятность}
        отсортированный по убыванию вероятности
    """
    text = title
    if abstract is not None and abstract.strip():
        text += " [SEP] " + abstract

    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    ).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()[0]

    sorted_indices = np.argsort(probs)[::-1]
    sorted_probs = probs[sorted_indices]
    sorted_labels = label_encoder.inverse_transform(sorted_indices)

    cumulative_probs = np.cumsum(sorted_probs)
    top_n = np.argmax(cumulative_probs >= 0.95) + 1

    result = []
    for i in range(top_n):
        result.append({
            'category': CATEGORY_MAPPING[sorted_labels[i]],
            'probability': float(sorted_probs[i])
        })

    return '\n'.join([f"{x['category']}: {(x['probability'] * 100):.1f}%" for x in result])

In [47]:
prediction = predict_with_confidence(
    model=my_model,
    tokenizer=my_tokenizer,
    label_encoder=my_label_encoder,
    title="Bombardiro crocodilo",
)

print(prediction)

Computer Science: 89.1%
Statistics: 6.4%
