<a href="https://colab.research.google.com/github/Kvazzzzar/MPSI/blob/main/MPSI_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U transformers datasets evaluate accelerate nltk scikit-learn

import re
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import torch



In [2]:
pip install hf_xet

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [10]:
# Инициализация ресурсов NLTK
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

class TextDataProcessor:
    """Класс для обработки текстовых данных с уменьшенными выборками"""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.df = None

    def load_20newsgroups(self, subset='train', sample_size=1000):
        """Загрузка датасета с ограничением размера выборки"""
        newsgroups = fetch_20newsgroups(
            subset=subset,
            remove=('headers', 'footers', 'quotes')
        )

        # Создаем DataFrame и ограничиваем размер выборки
        self.df = pd.DataFrame({
            'text': newsgroups.data,
            'category': [newsgroups.target_names[t] for t in newsgroups.target],
            'label': newsgroups.target
        }).sample(min(sample_size, len(newsgroups.data)), random_state=42)

        print(f"Загружено {len(self.df)} записей (ограничение выборки)")
        print("\nПример данных:")
        print(self.df.head())
        return self.df

    def preprocess_text(self, text):
        """Предобработка текста"""
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        processed_tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]
        return ' '.join(processed_tokens)

    def apply_preprocessing(self):
        """Применение предобработки"""
        if self.df is None:
            raise ValueError("Сначала загрузите данные")

        print("\nПрименение предобработки текста...")
        self.df['processed_text'] = self.df['text'].apply(self.preprocess_text)
        self.df = self.df[self.df['processed_text'].str.len() > 0]
        print(f"Осталось {len(self.df)} записей после очистки")
        return self.df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [15]:
# 1. Загрузка и предобработка данных с ограниченной выборкой
processor = TextDataProcessor()
df = processor.load_20newsgroups(subset='train', sample_size=1000)  # Ограничиваем выборку
df = processor.apply_preprocessing()

# 2. Подготовка данных
categories = df['category'].unique()
id2label = {i: label for i, label in enumerate(categories)}
label2id = {label: i for i, label in id2label.items()}

# Разделение на train/val/test (80/10/10)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"\nРазмеры выборок:")
print(f"Train: {len(train_df)} (~800 samples)")
print(f"Val: {len(val_df)} (~100 samples)")
print(f"Test: {len(test_df)} (~100 samples)")

# 3. Токенизация
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["processed_text"], padding="max_length", truncation=True, max_length=64)  # Уменьшили длину

train_dataset = Dataset.from_pandas(train_df[['processed_text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['processed_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['processed_text', 'label']])

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# 4. Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id
)

# 5. Настройка обучения (оптимизировано для быстрого обучения)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Уменьшили размер батча
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Уменьшили количество эпох
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# 6. Обучение
print("\nНачинаем обучение (это займет несколько минут)...")
trainer.train()

# 7. Оценка
print("\nОценка на тестовом наборе:")
test_results = trainer.evaluate(tokenized_test)
print(f"Точность: {test_results['eval_accuracy']:.4f}")

# 8. Примеры предсказаний
sample_texts = [
    "New graphics card has 12GB memory and ray tracing",
    "Jesus Christ is the son of God in Christianity",
    "Mars rover found evidence of ancient water",
    "This processor has 8 cores and 16 threads",
    "Meditation is central to Buddhist practice"
]

print("\nПримеры предсказаний:")
for text in sample_texts:
    processed = processor.preprocess_text(text)
    inputs = tokenizer(processed, return_tensors="pt", padding=True, truncation=True, max_length=64)

    with torch.no_grad():
        outputs = model(**inputs)

    prediction = outputs.logits.argmax().item()
    print(f"\nТекст: {text}")
    print(f"Предсказанная категория: {id2label[prediction]}")

Загружено 1000 записей (ограничение выборки)

Пример данных:
                                                   text  \
7492  Could someone please post any info on these sy...   
3546  \n\n     Don't bother if you have CPBackup or ...   
5582  5.25" Internal Low density disk drive.\n\nMono...   
4793  Hi,\n\nIn Canada, any gun that enters a Nation...   
3813  \nDoesn't it also have the Statue of Liberty o...   

                     category  label  
7492    comp.sys.mac.hardware      4  
3546  comp.os.ms-windows.misc      2  
5582             misc.forsale      6  
4793       talk.politics.guns     16  
3813         rec.sport.hockey     10  

Применение предобработки текста...
Осталось 978 записей после очистки

Размеры выборок:
Train: 782 (~800 samples)
Val: 98 (~100 samples)
Test: 98 (~100 samples)


Map:   0%|          | 0/782 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Начинаем обучение (это займет несколько минут)...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwarrage[0m ([33mwarrage-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.722158,0.336735
2,No log,2.552697,0.336735



Оценка на тестовом наборе:


Точность: 0.3061

Примеры предсказаний:

Текст: New graphics card has 12GB memory and ray tracing
Предсказанная категория: rec.sport.hockey

Текст: Jesus Christ is the son of God in Christianity
Предсказанная категория: talk.politics.mideast

Текст: Mars rover found evidence of ancient water
Предсказанная категория: talk.religion.misc

Текст: This processor has 8 cores and 16 threads
Предсказанная категория: rec.sport.hockey

Текст: Meditation is central to Buddhist practice
Предсказанная категория: comp.graphics
