<a href="https://colab.research.google.com/github/Kvazzzzar/MPSI/blob/main/MPSI_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U transformers datasets evaluate accelerate nltk scikit-learn

import re
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import torch



In [2]:
pip install hf_xet

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [5]:
# Инициализация ресурсов NLTK
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

class TextDataProcessor:
    """Класс для обработки текстовых данных"""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.df = None

    def load_20newsgroups(self, subset='train'):
        """Загрузка датасета 20 Newsgroups из sklearn"""
        newsgroups = fetch_20newsgroups(
            subset=subset,
            remove=('headers', 'footers', 'quotes')
        )

        self.df = pd.DataFrame({
            'text': newsgroups.data,
            'category': [newsgroups.target_names[t] for t in newsgroups.target],
            'label': newsgroups.target
        })

        print(f"Загружено {len(self.df)} записей")
        print("\nПример данных:")
        print(self.df.head())
        return self.df

    def preprocess_text(self, text):
        """Предобработка текста"""
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        processed_tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]
        return ' '.join(processed_tokens)

    def apply_preprocessing(self):
        """Применение предобработки"""
        if self.df is None:
            raise ValueError("Сначала загрузите данные")

        print("\nПрименение предобработки текста...")
        self.df['processed_text'] = self.df['text'].apply(self.preprocess_text)
        self.df = self.df[self.df['processed_text'].str.len() > 0]
        print(f"Осталось {len(self.df)} записей после очистки")
        return self.df

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
# 1. Загрузка и предобработка данных
processor = TextDataProcessor()
df = processor.load_20newsgroups(subset='train')
df = processor.apply_preprocessing()

# 2. Подготовка данных
categories = df['category'].unique()
id2label = {i: label for i, label in enumerate(categories)}
label2id = {label: i for i, label in id2label.items()}

train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

print(f"\nРазмеры выборок:")
print(f"Train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Test: {len(test_df)}")

# 3. Токенизация
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["processed_text"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df[['processed_text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['processed_text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['processed_text', 'label']])

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# 4. Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id
)

# 5. Настройка обучения (обновленные параметры)
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Изменено с evaluation_strategy на eval_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# 6. Обучение
print("\nНачинаем обучение...")
trainer.train()

# 7. Оценка
print("\nОценка на тестовом наборе:")
test_results = trainer.evaluate(tokenized_test)
print(f"Точность: {test_results['eval_accuracy']:.4f}")

# 8. Примеры предсказаний
sample_texts = [
    "The new GPU provides amazing performance for gaming",
    "Christians celebrate Easter as resurrection day",
    "NASA discovered water on Mars surface",
    "This motherboard supports DDR5 memory technology",
    "Buddhists practice meditation daily"
]

print("\nПримеры предсказаний:")
for text in sample_texts:
    try:
        processed = processor.preprocess_text(text)
        inputs = tokenizer(processed, return_tensors="pt", padding=True, truncation=True, max_length=128)

        with torch.no_grad():
            outputs = model(**inputs)

        prediction = outputs.logits.argmax().item()
        print(f"\nТекст: {text}")
        print(f"Предсказанная категория: {id2label[prediction]}")
    except Exception as e:
        print(f"\nОшибка при обработке текста: {text}\n{str(e)}")

Загружено 11314 записей

Пример данных:
                                                text               category  \
0  I was wondering if anyone out there could enli...              rec.autos   
1  A fair number of brave souls who upgraded thei...  comp.sys.mac.hardware   
2  well folks, my mac plus finally gave up the gh...  comp.sys.mac.hardware   
3  \nDo you have Weitek's address/phone number?  ...          comp.graphics   
4  From article <C5owCB.n3p@world.std.com>, by to...              sci.space   

   label  
0      7  
1      4  
2      4  
3      1  
4     14  

Применение предобработки текста...
Осталось 10996 записей после очистки

Размеры выборок:
Train: 7697
Val: 1649
Test: 1650


Map:   0%|          | 0/7697 [00:00<?, ? examples/s]

Map:   0%|          | 0/1649 [00:00<?, ? examples/s]

Map:   0%|          | 0/1650 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]




Начинаем обучение...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:


Abort: 