In [1]:
import wandb

wandb.login(key='b123af3ff1bc7e54569d0976c6405a5b3b6d2902')

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import re

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
def preprocess_text(text):
    """
    Preprocess text to retain only alphabetic characters and convert to lowercase.
    
    Parameters:
        text (str): Input string.
        
    Returns:
        str: Preprocessed string.
    """
    text = re.sub(r"\s+", " ", text).strip()
    text = text.lower()
    return text


def all_preprocessing(df):
    df['text'] = df['text'].apply(preprocess_text)
    return df

def extract_words_from_mask(df, text_col='text', mask_col='label_new'):
    """
    Extracts substrings from `text` based on the boolean mask in `label_new`.
    
    Parameters:
        df (pd.DataFrame): Input dataframe with columns `text` and `label_new`.
        text_col (str): Name of the column containing the text.
        mask_col (str): Name of the column containing the boolean mask.
        
    Returns:
        pd.Series: A Series where each entry is a list of words extracted from `text`.
    """
    def process_row(row):
        text = row[text_col]
        mask = row[mask_col]
        words = []
        current_word = []
        
        for char, include in zip(text, mask):
            if include:
                current_word.append(char)
            elif current_word:  # If a word is in progress and we hit a `0`
                words.append("".join(current_word))
                current_word = []  # Reset for the next word
        
        if current_word:  # Append the last word if still in progress
            words.append("".join(current_word))
        words = sorted(words)
        return ",".join(words)
    
    return df.apply(process_row, axis=1)


def to_out_labels(texts, labels):
    def process_row(i):
        text = texts[i]
        mask = labels[i]
        words = []
        current_word = []
        
        for char, include in zip(text, mask):
            if include:
                current_word.append(char)
            elif current_word:  # If a word is in progress and we hit a `0`
                words.append("".join(current_word))
                current_word = []  # Reset for the next word
        
        if current_word:  # Append the last word if still in progress
            words.append("".join(current_word))

        words = sorted(words)
        return ",".join(words)
    out_labels = [process_row(i) for i in range(len(texts))]
    return out_labels

In [4]:
import ast

df = pd.read_csv('/kaggle/input/wb-contest-2/wb_contest_2_new_dataset.csv', index_col='ID')
df['label'] = df['label'].apply(lambda x: ast.literal_eval(x))

df.tail()

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
248088,мне ее порвали суки,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
248089,"полное дерьмо, удалите этот товар и заблокируй...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, ..."
248090,херня. деньги на ветер.,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
248091,"это вообще что , за 💩 гов... ще?? темнотища уж...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
248092,"не берите!!!!! мелкие, порезанные, подпорченны...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Custom Tokenizer: Split into characters and map each character to an index
class CharTokenizer:
    def __init__(self, texts):
        # Create a vocabulary from the unique characters in the dataset + a padding token (0)
        self.vocab = {char: idx + 1 for idx, char in enumerate(set(''.join(texts)))}  # index 0 is for padding
        self.vocab_size = len(self.vocab) + 1  # Add 1 for padding
        self.vocab['<PAD>'] = 0  # Adding padding token
        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text):
        # Return the indices of each character in the text based on the vocabulary
        return [self.vocab.get(char, self.vocab['<PAD>']) for char in text]

    def decode(self, token_ids):
        # Decode token IDs back to characters
        return ''.join([self.reverse_vocab.get(idx, '<PAD>') for idx in token_ids])

# Custom Dataset
class TokenClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=1024):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text into character-level tokens
        input_ids = self.tokenizer.encode(text)
        
        # Pad the sequence to max_len if necessary
        input_ids = input_ids + [0] * (self.max_len - len(input_ids)) if len(input_ids) < self.max_len else input_ids[:self.max_len]
        
        # Pad the labels to max_len if necessary
        labels_padded = label + [0] * (self.max_len - len(label)) if len(label) < self.max_len else label[:self.max_len]
        #print(sum(labels_padded))
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor([1] * len(input_ids) + [0] * (self.max_len - len(input_ids)), dtype=torch.long),  # Attention mask
            'labels': torch.tensor(labels_padded, dtype=torch.long)
        }

class InferenceDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=1024):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        
        input_ids = self.tokenizer.encode(text)
        input_ids = input_ids + [0] * (self.max_len - len(input_ids)) if len(input_ids) < self.max_len else input_ids[:self.max_len]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor([1] * len(input_ids) + [0] * (self.max_len - len(input_ids)), dtype=torch.long)  # Attention mask
        }

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharTokenClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, transformer_dim=512, num_heads=8, num_layers=6, num_classes=2, dropout=0.2):
        super(CharTokenClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Character embeddings
        self.positional_encoding = nn.Parameter(torch.zeros(1, 1024, embedding_dim))  # Learnable positional encoding
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=transformer_dim, dropout=dropout)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(embedding_dim, num_classes)  # Output classification layer

    def forward(self, input_ids, attention_mask):
        # Get embeddings and add positional encoding
        x = self.embedding(input_ids)  # (batch_size, seq_len, embedding_dim)
        x = x + self.positional_encoding[:, :x.size(1), :]
        
        # Apply attention mask (if needed)
        src_key_padding_mask = ~attention_mask.bool() if attention_mask is not None else None
        
        # Pass through the Transformer
        x = self.transformer(x.transpose(0, 1), src_key_padding_mask=src_key_padding_mask)  # (seq_len, batch_size, embedding_dim)
        x = x.transpose(0, 1)  # (batch_size, seq_len, embedding_dim)
        x = self.dropout(x)
        
        # Apply classifier on each token
        logits = self.classifier(x)  # (batch_size, seq_len, num_classes)
        return logits


In [7]:
texts = df['text'].values  # or df['text'].to_numpy()
labels = df['label'].values  # or df['label'].to_numpy()

tokenizer = CharTokenizer(texts)
print(f"Vocabulary size: {tokenizer.vocab_size}")

# Create the dataset and dataloader
dataset = TokenClassificationDataset(texts, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

Vocabulary size: 1131


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, epochs=10, lr=5e-5, warmup_steps=1000, max_grad_norm=1.0, device='cuda'):
    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    
    # Learning rate scheduler with warmup
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    
    # Mixed precision scaler
    scaler = GradScaler()

    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch"):
            input_ids, labels, attention_mask = batch["input_ids"], batch["labels"], batch["attention_mask"]
            input_ids, labels, attention_mask = input_ids.to(device), labels.to(device), attention_mask.to(device)

            optimizer.zero_grad()

            # Mixed precision forward pass
            with autocast():
                logits = model(input_ids, attention_mask)
                loss = criterion(logits.view(-1, 2), labels.view(-1))

            # Backward pass with mixed precision
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            # Optimizer step with scaler
            scaler.step(optimizer)
            scaler.update()
            
            # Scheduler step
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer
model = CharTokenClassifier(vocab_size=tokenizer.vocab_size, embedding_dim=64)
model.to(device)

train_model(model, dataloader, device=device)

  scaler = GradScaler()
  with autocast():
Epoch 1/10: 100%|██████████| 3825/3825 [46:22<00:00,  1.37batch/s]


Epoch 1/10, Train Loss: 0.0176


Epoch 2/10: 100%|██████████| 3825/3825 [47:53<00:00,  1.33batch/s]


Epoch 2/10, Train Loss: 0.0038


Epoch 3/10: 100%|██████████| 3825/3825 [48:08<00:00,  1.32batch/s]


Epoch 3/10, Train Loss: 0.0034


Epoch 4/10: 100%|██████████| 3825/3825 [48:09<00:00,  1.32batch/s]


Epoch 4/10, Train Loss: 0.0033


Epoch 5/10: 100%|██████████| 3825/3825 [48:00<00:00,  1.33batch/s]


Epoch 5/10, Train Loss: 0.0032


Epoch 6/10: 100%|██████████| 3825/3825 [48:04<00:00,  1.33batch/s]


Epoch 6/10, Train Loss: 0.0032


Epoch 7/10: 100%|██████████| 3825/3825 [47:45<00:00,  1.33batch/s]


Epoch 7/10, Train Loss: 0.0031


Epoch 8/10: 100%|██████████| 3825/3825 [46:44<00:00,  1.36batch/s]


Epoch 8/10, Train Loss: 0.0031


Epoch 9/10: 100%|██████████| 3825/3825 [46:32<00:00,  1.37batch/s]


Epoch 9/10, Train Loss: 0.0031


Epoch 10/10: 100%|██████████| 3825/3825 [47:26<00:00,  1.34batch/s]

Epoch 10/10, Train Loss: 0.0031





In [10]:
torch.save(model.state_dict(), '/kaggle/working/model.pth')

import pickle
with open('/kaggle/working/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
"""

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer
model = CharTokenClassifier(vocab_size=tokenizer.vocab_size, embedding_dim=64)
model.load_state_dict(torch.load('/kaggle/working/model.pth', weights_only=True))
model.to(device)
model.eval()  # Set the model to evaluation mode

# Function to run inference on the entire dataset or a sample
def run_eval(dataloader):
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation for inference
        for batch in tqdm(dataloader, desc="Running Inference"):
            # Move batch data to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Run the model on the batch
            logits = model(input_ids, attention_mask)  # (batch_size, seq_len, num_classes)

            # Apply softmax to get class probabilities
            probs = torch.softmax(logits, dim=-1)  # Shape: (batch_size, seq_len, num_classes)

            # Get predicted class for each token
            predicted_class = torch.argmax(probs, dim=-1)  # Shape: (batch_size, seq_len)

            # Store predictions and labels
            for i in range(len(labels)):
                all_preds.append(predicted_class[i].cpu().numpy())  # Move to CPU for easy handling
                all_labels.append(labels[i].cpu().numpy())

    return all_preds, all_labels

# Run inference
#predictions, ground_truth = run_eval(dataloader)

"""

'\n\ndevice = torch.device(\'cuda\' if torch.cuda.is_available() else \'cpu\')\n\n# Initialize the model, loss function, and optimizer\nmodel = CharTokenClassifier(vocab_size=tokenizer.vocab_size, embedding_dim=64)\nmodel.load_state_dict(torch.load(\'/kaggle/working/model.pth\', weights_only=True))\nmodel.to(device)\nmodel.eval()  # Set the model to evaluation mode\n\n# Function to run inference on the entire dataset or a sample\ndef run_eval(dataloader):\n    all_preds = []\n    all_labels = []\n\n    with torch.no_grad():  # Disable gradient computation for inference\n        for batch in tqdm(dataloader, desc="Running Inference"):\n            # Move batch data to the correct device\n            input_ids = batch[\'input_ids\'].to(device)\n            attention_mask = batch[\'attention_mask\'].to(device)\n            labels = batch[\'labels\'].to(device)\n\n            # Run the model on the batch\n            logits = model(input_ids, attention_mask)  # (batch_size, seq_len, num_cl

In [12]:
"""
i = 244736
text = dataset.texts[i]
# Example: Print the first prediction and corresponding ground truth
print(f"Text: {text}")
print(f"Prediction: {predictions[i]}")
print(f"Ground truth: {ground_truth[i]}")

predicted = pd.DataFrame({"text": dataset.texts, "prediction": predictions, "label": ground_truth})
predicted['label_out'] = extract_words_from_mask(predicted, mask_col='prediction')

predicted
"""

'\ni = 244736\ntext = dataset.texts[i]\n# Example: Print the first prediction and corresponding ground truth\nprint(f"Text: {text}")\nprint(f"Prediction: {predictions[i]}")\nprint(f"Ground truth: {ground_truth[i]}")\n\npredicted = pd.DataFrame({"text": dataset.texts, "prediction": predictions, "label": ground_truth})\npredicted[\'label_out\'] = extract_words_from_mask(predicted, mask_col=\'prediction\')\n\npredicted\n'

In [13]:
# predicted.to_csv("/kaggle/working/predicted.csv")

In [14]:
"""
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer
model = CharTokenClassifier(vocab_size=tokenizer.vocab_size, embedding_dim=64)
model.load_state_dict(torch.load('/kaggle/working/model.pth', weights_only=True))
model.to(device)
model.eval()

import pickle
with open('/kaggle/input/working/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
"""

"\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n\n# Initialize the model, loss function, and optimizer\nmodel = CharTokenClassifier(vocab_size=tokenizer.vocab_size, embedding_dim=64)\nmodel.load_state_dict(torch.load('/kaggle/working/model.pth', weights_only=True))\nmodel.to(device)\nmodel.eval()\n\nimport pickle\nwith open('/kaggle/input/working/tokenizer.pickle', 'rb') as handle:\n    tokenizer = pickle.load(handle)\n"

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

df_test = pd.read_csv('/kaggle/input/wildberries-winter-school-24-contest-detected/test.csv')
df_test = all_preprocessing(df_test)

texts = df_test['text'].values  # or df['text'].to_numpy()

In [16]:
dataset_test = InferenceDataset(texts, tokenizer)
dataloader_test = DataLoader(dataset_test, batch_size=64, shuffle=False)

def run_inference(model, dataloader, device):
    all_preds = []

    with torch.no_grad():  # Disable gradient computation for inference
        for batch in tqdm(dataloader, desc="Running Inference"):
            # Move batch data to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Run the model on the batch
            logits = model(input_ids, attention_mask)  # (batch_size, seq_len, num_classes)

            # Apply softmax to get class probabilities
            probs = torch.softmax(logits, dim=-1)  # Shape: (batch_size, seq_len, num_classes)

            # Get predicted class for each token
            predicted_class = torch.argmax(probs, dim=-1)  # Shape: (batch_size, seq_len)

            # Store predictions and labels
            for i in range(len(predicted_class)):
                all_preds.append(predicted_class[i].cpu().numpy())  # Move to CPU for easy handling

    return all_preds

# Run inference
predictions = run_inference(model, dataloader_test, device)

df_test["label_new"] = predictions
df_test.to_csv('/kaggle/working/out.csv')

Running Inference: 100%|██████████| 1047/1047 [03:51<00:00,  4.53it/s]


In [17]:
df_test['label'] = extract_words_from_mask(df_test)

df_test.to_csv('/kaggle/working/out_with_words.csv')
df_test

Unnamed: 0,ID,text,label_new,label
0,0,"хороший, подошкл","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
1,1,"совсем тонюсенький саженец, не досмотрела в оп...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
2,2,"когтеточка хорошая, но вот ткань на основании ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
3,3,"много затяжек, не порадовала покупка","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
4,4,рекомендую 💣,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
...,...,...,...,...
66944,71995,пачка как пачка а внутри совсем другое некторв...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
66945,71996,"отвратительное качество!!! через год тряпка, у...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
66946,71997,"вес 100гр, не понимаю откуда хорошие отзывы , ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
66947,71998,"приобрел и установил радиатор год назад , авто...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [18]:
df_test.drop(columns=['label_new', 'text']).set_index("ID").to_csv("/kaggle/working/final.csv")