### Решение вступительного задания на смену ML в Сириусе, сентябрь 2025

In [None]:
!pip install -q transformers torch pandas accelerate tqdm transformers_stream_generator

In [None]:
import pandas as pd
import numpy as np
import random
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm

In [None]:
def set_seed(seed = 42):
    """Seed Python's, NumPy's and PyTorch's random generators with `seed`.

    Also switches cuDNN to deterministic mode and turns its benchmark
    autotuner off.

    Args:
        seed: Integer seed handed to every generator (42 by default).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix the seed once for the whole notebook.
set_seed(42)

In [None]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Directory holding the competition files (a Kaggle input path).
DATA_PATH = '/kaggle/input/t-contest-nlp/'

# Training file; its rows are labelled further down in the notebook.
data = pd.read_csv(f'{DATA_PATH}train.csv')
data

In [None]:
import os

from huggingface_hub import login

# SECURITY: this cell used to contain a hard-coded Hugging Face access token.
# Any token that was committed or shared must be treated as leaked and revoked
# in the Hugging Face account settings.
# The token is now taken from the HF_TOKEN environment variable. When the
# variable is unset, `token` is None and login() asks for the token itself.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Target classes. The order is significant: later cells derive the integer
# label ids from each name's position in this list.
categories = [
    'бытовая техника', 'обувь', 'одежда', 'посуда', 'текстиль',
    'товары для детей', 'украшения и аксессуары', 'электроника',
    'нет товара',  # fallback class, also returned when no category is recognised
]

# Checkpoint used for the labelling step and, later, as the fine-tuning base.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM with float16 weights; placement is delegated to
# device_map="auto".
_lm_load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **_lm_load_options)

# Empty placeholder column that the labelling loop fills in row by row.
data['label'] = ''

In [None]:
def _first_mentioned_category(answer, known_categories):
    """Return the category that appears earliest in `answer`, or None.

    The answer is stripped, lower-cased and has hyphens replaced by spaces
    before matching. Picking the earliest occurrence (rather than the first
    hit in list order) keeps the model's actual answer from being overridden
    by a category name that only shows up in trailing, off-topic text.
    """
    normalized = answer.strip().lower().replace("-", " ")
    best_position = None
    best_category = None
    for category in known_categories:
        position = normalized.find(category.lower())
        if position != -1 and (best_position is None or position < best_position):
            best_position = position
            best_category = category
    return best_category


def classify_review(review_text):
    """Label one review with one of the module-level `categories`.

    Builds a Russian instruction prompt, generates up to 30 new tokens with
    greedy decoding using the module-level `tokenizer` and `model`, and maps
    the generated text onto a known category.

    Args:
        review_text: Raw text of the review; it is interpolated into the prompt.

    Returns:
        The matched category name, or "нет товара" when the generated text
        mentions none of the categories.
    """
    prompt = f"""
Классифицируй следующий отзыв по одной из категорий: {', '.join(categories)}.
Если отзыв нельзя отнести к одной из категорий с высокой вероятностью, выбери 'нет товара'.
Ответь только одним словом — названием категории, без лишних символов.

Отзыв: "{review_text}"
Категория:
    """.strip()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Greedy decoding: no sampling, so no temperature is passed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated, otherwise the category list and the review text from the
    # prompt would take part in the matching.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    matched = _first_mentioned_category(answer, categories)
    return matched if matched is not None else "нет товара"


# Label the reviews one at a time and report progress after each row.
total_reviews = len(data)
for index, row in data.iterrows():
    predicted_label = classify_review(row['text'])
    data.loc[index, 'label'] = predicted_label
    print(f"Обработан отзыв {index+1}/{total_reviews}. Метка: {predicted_label}")

# Persist the labelled frame so it can be reused without re-running the model.
data.to_csv('train_labeled.csv', index=False)

### Перезапускаю среду из-за Out of memory, загружаю полученные размеченные данные 

In [None]:
# Load the labelled reviews. Note the file is read from the Kaggle input
# directory, whereas the labelling cell wrote 'train_labeled.csv' to the
# working directory — presumably it was uploaded as a dataset in between
# (TODO confirm).
train_df = pd.read_csv('/kaggle/input/t-contest-nlp/train_labeled.csv')
train_df

In [None]:
!pip install -q peft datasets accelerate bitsandbytes

In [None]:
from datasets import Dataset

# Category name <-> integer id, taken from each name's position in `categories`.
label2id = {c: i for i, c in enumerate(categories)}
id2label = {i: c for c, i in label2id.items()}

# Encode the string labels as integer ids.
# The dtype guard makes the cell safe to re-run: mapping already-encoded ids a
# second time would turn every label into NaN.
if not pd.api.types.is_integer_dtype(train_df['label']):
    encoded_labels = train_df['label'].map(label2id)
    unknown_mask = encoded_labels.isna()
    if unknown_mask.any():
        # Fail loudly instead of letting NaN (float) labels reach training.
        unknown_values = sorted(train_df.loc[unknown_mask, 'label'].astype(str).unique())
        raise ValueError(f"Labels missing from `categories`: {unknown_values}")
    train_df['label'] = encoded_labels.astype('int64')

test_df = pd.read_csv('/kaggle/input/t-contest-nlp/test.csv')

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): `model` is whatever object holds that name when this cell
# runs; the classifier is only created in a later cell — confirm that it also
# receives a pad_token_id.
model.config.pad_token_id = tokenizer.pad_token_id


def tokenize(batch):
    """Tokenize the "text" field of a batch: truncate to 256 tokens and pad."""
    return tokenizer(
        batch["text"],
        max_length=256,
        truncation=True,
        padding=True,
    )


# Tokenize both splits batch-wise.
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Rename the target column to "labels" for the training split.
train_ds = train_ds.rename_column("label", "labels")

# Expose only the model inputs (plus labels for training) as torch tensors.
train_ds.set_format(columns=["input_ids", "attention_mask", "labels"], type="torch")
test_ds.set_format(columns=["input_ids", "attention_mask"], type="torch")


In [None]:
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Same checkpoint as the labelling step, now with a classification head sized
# to the number of categories.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# The classifier is a freshly loaded object, so it needs its own padding id.
# The sequence-classification head uses config.pad_token_id to find the last
# non-padding token of each row, and rejects batches larger than one when the
# id is undefined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapter settings for a sequence-classification task.
config = LoraConfig(
    r=16,
    lora_alpha=16, 
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# From here on `model` is the PEFT-wrapped classifier.
model = get_peft_model(base_model, config)
model.print_trainable_parameters()


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output location and reproducibility.
    output_dir="./results",
    seed=42,
    # One example per step, no gradient accumulation, a single epoch.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=1e-4,
    bf16=True,
    # Log every 100 steps; no checkpoints and no external reporting.
    logging_steps=100,
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
)

trainer.train()


In [None]:
# Display the test frame loaded earlier as this cell's output.
test_df

### Снова Out of memory, сохраняю веса модели 

In [None]:
# Write the trainer's model artefacts to a local folder so they survive a
# kernel restart.
output_dir = "./my_finetuned_model"
trainer.save_model(output_dir=output_dir)

In [None]:
import os
import zipfile

# Folder with the saved model and the archive to produce from it.
folder_to_zip = './my_finetuned_model' 
output_zip_file = './my_finetuned_model.zip'

if os.path.isdir(folder_to_zip):
    with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _subdirs, filenames in os.walk(folder_to_zip):
            for filename in filenames:
                source_path = os.path.join(dirpath, filename)
                # Store each entry relative to the folder so the archive
                # keeps the same internal directory layout.
                archive.write(source_path, os.path.relpath(source_path, folder_to_zip))
    print(f"Successfully created '{output_zip_file}'. You can now download it.")
else:
    print(f"Directory '{folder_to_zip}' does not exist.")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Fine-tuned artefacts, read back from the Kaggle input directory.
# NOTE(review): the folder was written by trainer.save_model() on a
# PEFT-wrapped model — confirm that loading it this way restores both the
# LoRA weights and the classification head.
model_path = "/kaggle/input/t-contest-nlp/my_finetuned_model"

loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
if loaded_tokenizer.pad_token_id is None:
    # Fall back to the end-of-sequence token, as the training setup does.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

loaded_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id
)
# Prediction runs on batches larger than one, which the classification head
# rejects unless a padding id is defined on the model config.
loaded_model.config.pad_token_id = loaded_tokenizer.pad_token_id

training_args = TrainingArguments(
    output_dir="./results",
    # Match the training configuration: no external experiment trackers.
    report_to="none",
)

# Inference-only trainer wrapped around the reloaded model.
new_trainer = Trainer(
    model=loaded_model,
    args=training_args,
    tokenizer=loaded_tokenizer
)

In [None]:
# Predict with the trainer built around the reloaded model. The training-time
# `trainer` no longer exists once the kernel has been restarted.
predictions = new_trainer.predict(test_dataset=test_ds)

logits = predictions.predictions

# Highest-scoring class id per test row.
predicted_ids = torch.argmax(torch.tensor(logits), axis=-1).cpu().numpy()

# Map ids back to category names. The loop variable is named `class_id` so
# that it does not shadow the builtin `id`.
predicted_labels = [id2label[int(class_id)] for class_id in predicted_ids]

submission_df = pd.DataFrame({'id': test_df['id'], 'label': predicted_labels})

submission_df.to_csv('submission.csv', index=False)

print("Файл submission.csv успешно создан.")