In [103]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

ROOT_DIR = pathlib.Path().absolute()
DATA_DIR = ROOT_DIR / "data"
RANDOM_SEED = 42

## Загрузка и обзор данных

In [160]:
df_trends = pd.read_csv(DATA_DIR / "trends_description.csv")
df = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")

In [161]:
for i in range(50):
    df.rename(columns={f"trend_id_res{i}": i}, inplace=True)
df["labels"] = df.drop('index',axis=1).drop('assessment',axis=1).drop('tags',axis=1).drop('text',axis=1).drop('Unnamed: 0',axis=1).apply(lambda r: r.index[r.ne(0)].to_list(), axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,0,1,2,3,4,...,41,42,43,44,45,46,47,48,49,labels
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[28]
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[18]
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,[3]
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[12, 28]"
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[8]


## Обучение моделей

### Предобработка данных

In [162]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,0,1,2,3,4,...,41,42,43,44,45,46,47,48,49,labels
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[28]
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[18]
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,[3]
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[12, 28]"
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[8]


In [163]:
X, y = df[["text"]], df['labels']
X = X.astype("str").copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size= 0.8, random_state = 42)
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_val.shape is {X_val.shape}")
print(f"y_val.shape is {y_val.shape}")
X_test = df_test[["text"]]
print(f"X_test.shape is {X_test.shape}")
trend_info = pd.read_csv(DATA_DIR / "trends_description.csv")


X_train.shape is (3698, 1)
y_train.shape is (3698,)
X_val.shape is (925, 1)
y_val.shape is (925,)
X_test.shape is (9015, 1)


In [164]:
X.shape


(4623, 1)

In [165]:
X_train

Unnamed: 0,text
1538,"Ну, за [NUM]ч. и [NUM] мин. мне ещё никогда не..."
2991,Доставка всегда осуществляется значительно дол...
2812,Задержка доставки
4515,"Отличный сервис, только бы ассортимент расшири..."
4531,"Поддержка говно, курьеры опаздывают минут на [..."
...,...
4426,"+ быстро. - иногда сумма заказа очень велика, ..."
466,🦉
3092,До самоката я тратил меньше денег в день
3772,О вас редко думаю. Напрягает СТМ. Когда непоня...


In [166]:

categories = []
cat2idx={}
count=0
for i in trend_info['trend']:
    categories.append(i)
    cat2idx[i]=count
    count+=1
categories
cat2idx

{'Долгая доставка': 0,
 'Доставка стала долгой': 1,
 'Время доставки не соответствует заявленому': 2,
 'Регулярные опоздания': 3,
 'Не отследить реальное время доставки': 4,
 'Курьер на карте': 5,
 'Нет доставки по адресу': 6,
 'Не предупреждаем об удалении товара': 7,
 'Высокая минимальная сумма заказа': 8,
 'Сумма заказа меняется во время набора корзины': 9,
 'Минимальная сумма заказа': 10,
 'Товары с подходящим сроком годности': 11,
 'Высокие цены': 12,
 'Не довезли товар': 13,
 'Товар испорчен во время доставки': 14,
 'Просроченные товары': 15,
 'Замечания по работе курьеров': 16,
 'Не читаем комментарии': 17,
 'Спасибо': 18,
 'Нет смысла': 19,
 'Всё нормально': 20,
 'Всё плохо': 21,
 'Скидки для постоянных клиентов': 22,
 'Больше акций/скидок': 23,
 'Скидка/промокод распространяется не на все товары': 24,
 'Непонятно как работает скидка': 25,
 'Не сработала скидка/акция/промокод': 26,
 'Качество товаров': 27,
 'Маленький ассортимент': 28,
 'Нет в наличии товара': 29,
 'Качество по

In [167]:
import re
import string

def lower_text(text: str):
    return text.lower()

def remove_numbers(text: str):
    """
    Substitute all punctuations with space in case of
    "there is5dogs".

    If subs with '' -> "there isdogs"
    With ' ' -> there is dogs
    """
    text_nonum = re.sub(r'\d+', ' ', text)
    return text_nonum

def remove_punctuation(text: str):
    """
    Substitute all punctuations with space.

    Example:
    "hello!nice to meet you" -> "hello nice to meet you"
    """
    # Create a translation table that maps each punctuation to a space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    # Use translate to substitute punctuation with spaces
    text_nopunct = text.translate(translator)

    return text_nopunct



def remove_multiple_spaces(text: str):
    """
    Replaces multiple spaces with a single space.

    Example:
    "This  is   a   test" -> "This is a test"
    """
    # Use regex to substitute multiple spaces with a single space
    text_no_doublespace = re.sub(r'\s+', ' ', text).strip()

    return text_no_doublespace

In [168]:
sample_text = X_train['text'][22]

_lowered = lower_text(sample_text)
_without_numbers = remove_numbers(_lowered)
_without_punct = remove_punctuation(_without_numbers)
_single_spaced = remove_multiple_spaces(_without_punct)

print(sample_text)
print('-'*10)
print(_lowered)
print('-'*10)
print(_without_numbers)
print('-'*10)
print(_without_punct)
print('-'*10)
print(_single_spaced)

Заказ на сумму от [NUM]₽ это дорого, было бы лучше как было раньше [NUM]₽
----------
заказ на сумму от [num]₽ это дорого, было бы лучше как было раньше [num]₽
----------
заказ на сумму от [num]₽ это дорого, было бы лучше как было раньше [num]₽
----------
заказ на сумму от  num ₽ это дорого  было бы лучше как было раньше  num ₽
----------
заказ на сумму от num ₽ это дорого было бы лучше как было раньше num ₽


In [169]:
# Importing necessary modules from NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
# Tokenization function
def tokenize_text(text: str) -> list[str]:
    """
    Tokenizes input text into a list of words.
    """
    return nltk.tokenize.word_tokenize(text)

# Remove stopwords function
def remove_stop_words(tokenized_text: list[str]) -> list[str]:
    """
    Removes stopwords from a list of tokenized words.
    """
    # Ensure stopwords are downloaded
    nltk.download('stopwords')
    stop_words = set(stopwords.words('russian'))

    return [word for word in tokenized_text if word.lower() not in stop_words]

# Stemming function
def stem_words(tokenized_text: list[str]) -> list[str]:
    """
    Stems words in the tokenized text using PorterStemmer.
    """
    stemmer = SnowballStemmer("russian") 
    return [stemmer.stem(word) for word in tokenized_text]




[nltk_data] Downloading package punkt to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [170]:
_tokenized = tokenize_text(_single_spaced)
_without_sw = remove_stop_words(_tokenized)
_stemmed = stem_words(_without_sw)

print(_single_spaced)
print('-'*10)
print(_tokenized)
print('-'*10)
print(_without_sw)
print('-'*10)
print(_stemmed)

заказ на сумму от num ₽ это дорого было бы лучше как было раньше num ₽
----------
['заказ', 'на', 'сумму', 'от', 'num', '₽', 'это', 'дорого', 'было', 'бы', 'лучше', 'как', 'было', 'раньше', 'num', '₽']
----------
['заказ', 'сумму', 'num', '₽', 'это', 'дорого', 'раньше', 'num', '₽']
----------
['заказ', 'сумм', 'num', '₽', 'эт', 'дор', 'раньш', 'num', '₽']


[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [171]:
def preprocessing_stage(text):
    _lowered = lower_text(text)
    _without_numbers = remove_numbers(_lowered)
    _without_punct = remove_punctuation(_without_numbers)
    _single_spaced = remove_multiple_spaces(_without_punct)
    _tokenized = tokenize_text(_single_spaced)
    _without_sw = remove_stop_words(_tokenized)
    _stemmed = stem_words(_without_sw)

    return _stemmed

def clean_text_inplace(df):
    df['text'] = df['text'].apply(preprocessing_stage)
    return df
def preprocess(df):

    df.fillna(" ", inplace=True)
    _cleaned = clean_text_inplace(df)
    
    #for i in range(len(df)):
    #    if i==8:
    #        continue
        #print(df['text'][i])
    #    df['text'][i]  = preprocessing_stage(df['text'][i])
    #    print(df['text'][i])
        
    return _cleaned




In [172]:
X_train['text']

1538    Ну, за [NUM]ч. и [NUM] мин. мне ещё никогда не...
2991    Доставка всегда осуществляется значительно дол...
2812                                    Задержка доставки
4515    Отличный сервис, только бы ассортимент расшири...
4531    Поддержка говно, курьеры опаздывают минут на [...
                              ...                        
4426    + быстро. - иногда сумма заказа очень велика, ...
466                                                     🦉
3092             До самоката я тратил меньше денег в день
3772    О вас редко думаю. Напрягает СТМ. Когда непоня...
860     Доставка стала очень долгая, цены завышены чем...
Name: text, Length: 3698, dtype: object

In [173]:
train_preprocessed = preprocess(X)
test_preprocessed = preprocess(X_test)

train_preprocessed.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\Katie\miniconda3\envs\mltest\nltk_data...
[nltk_data]   Package stopwords is already up-to-date

Unnamed: 0,text
0,"[маленьк, выбор, товар, хотел, ассортимент, врод]"
1,[быстр]
2,"[доставк, постоя, задержива]"
3,"[наценк, ассортимент, расстраива]"
4,"[немн, скинут, минимальн, сумм, заказ, оплачив..."


In [174]:
df['assessment']

0       6.0
1       4.0
2       6.0
3       6.0
4       6.0
       ... 
4618    2.0
4619    6.0
4620    6.0
4621    2.0
4622    3.0
Name: assessment, Length: 4623, dtype: float64

In [176]:
X = pd.concat([train_preprocessed, df['assessment'], y], axis=1)
X.head()


Unnamed: 0,text,assessment,labels
0,"[маленьк, выбор, товар, хотел, ассортимент, врод]",6.0,[28]
1,[быстр],4.0,[18]
2,"[доставк, постоя, задержива]",6.0,[3]
3,"[наценк, ассортимент, расстраива]",6.0,"[12, 28]"
4,"[немн, скинут, минимальн, сумм, заказ, оплачив...",6.0,[8]


In [177]:
X_text = pd.concat([test_preprocessed,df_test['assessment']], axis=1)
X_text.head()

Unnamed: 0,text,assessment
0,"[последн, врем, дума, плох, срок, доставк, дав...",3.0
1,"[цен, намн, выш, магазин, рад, акц]",2.0
2,"[доставк, num, минут, заказ, нача, собира, спу...",2.0
3,"[ужасн, долг, доставк]",0.0
4,"[добр, вечер, больш, молодц, все, устраива, ис...",6.0


In [189]:
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(X, train_size= 0.8, random_state = 42)

In [190]:
X_val.shape

(925, 3)

In [191]:
y_val.shape

(925,)

In [123]:
!pip install torchtext



In [180]:
for _, sample in X_train.iterrows():
    print(sample.to_list()[0])
    break

['num', 'ч', 'num', 'мин', 'ещ', 'доставля', 'заказ', 'отвратительн', 'одн', 'слов']


In [181]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    for _, sample in X_train.iterrows():
        yield sample.to_list()[0]


# Define special symbols and indices
#UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
#special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab = ''
vocab = build_vocab_from_iterator(yield_tokens(X))


In [182]:
len(vocab)

3831

In [198]:
import torch
from torch.utils.data import DataLoader
txt = ''
torch.manual_seed(420)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
def collate_batch(batch):
    label_list, text_list, score_list, offsets = [], [], [], [0]
    cnt = 0
    for i in range(len(batch)):
        _score, _text, _label = batch[i][1], batch[i][0], batch[i][2]
        global txt
        label_list.append(_label)
        txt = _text
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(processed_text)
        score_list.append(_score)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    score_list = torch.tensor(score_list, dtype=torch.float64)
    
    return label_list.to(device), text_list.to(device), offsets.to(device), score_list.to(device)

train_dataloader = DataLoader(
    X_train.to_numpy(), batch_size=32, shuffle=True, collate_fn=collate_batch
)

val_dataloader = DataLoader(
    X_val.to_numpy(), batch_size=32, shuffle=True, collate_fn=collate_batch
)

cpu


In [199]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.nn as nn

class Neuraln(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Direct LSTM initialization without embedding layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, text, h=None, c=None):
        out = 0
        if h is not None and c is not None:
            out, (h, c) = self.lstm(text, (h, c))
        else:
            out, (h, c) = self.lstm(text)

        x = self.fc(out)
        return x[1]  # Assuming the use of the second time step output as before


class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, padding_idx=1):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=padding_idx        )
        self.fc = nn.Linear(embed_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """
        Args:
            text: Tensor of token indices concatenated for the batch
            offsets: Tensor indicating the start index of each sequence in `text`
        Returns:
            Logits for each class
        """
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)


In [200]:
# Parameters
vocab_size = len(vocab)
embed_dim = 128
num_classes = 50

# Instantiate the model
#model = LSTMTextClassificationModel(vocab_size, embed_dim, hidden_dim = 128, num_classes = num_classes)
model = TextClassificationModel(vocab_size, embed_dim, num_classes)
#model = Neuraln(vocab_size, embed_dim=128, hidden_dim = 8, num_classes = num_classes)
# Example input
text = torch.tensor([vocab[token] for token in ["a", "simple"]], dtype=torch.long)
offsets = torch.tensor([0], dtype=torch.long)  # Single sequence starting at index 0

# Forward pass
logits = model(text, offsets)
print(logits)


RuntimeError: Token a not found and default index is not set

In [201]:
from tqdm.autonotebook import tqdm
import torch

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1,
    device='cuda'
):
    """
    Trains the model for one epoch.

    Args:
        model (nn.Module): The neural network model.
        loader (DataLoader): DataLoader for training data.
        optimizer (torch.optim.Optimizer): Optimizer for training.
        loss_fn (nn.Module): Loss function.
        epoch_num (int, optional): Current epoch number for display. Defaults to -1.
        device (str, optional): Device to run training on. Defaults to 'cpu'.

    Returns:
        float: Average training loss for the epoch.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()  # Set model to training mode
    train_loss = 0.0
    correct = 0
    total = 0

    for i, batch in loop:
        # Unpack the batch
        labels, texts, offsets, scores = batch

        # Move data to the specified device
        labels = labels.to(device)
        texts = texts.to(device)
        offsets = offsets.to(device)
        # If 'scores' and 'helpfulness' are used, move them as well
        scores = scores.to(device) if isinstance(scores, torch.Tensor) else scores
        
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        # Assuming the model returns logits for classification

        outputs = model(texts, offsets)
        #outputs = outputs.unsqueeze(0)

        loss = loss_fn(outputs, labels)

        # Backward pass
        loss.backward()


        # Optimizer step
        optimizer.step()

        # Accumulate loss
        train_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Update progress bar
        loop.set_postfix({"loss": train_loss / total, "acc": correct / total})

    avg_loss = train_loss / total
    avg_acc = correct / total
    print(f"Epoch {epoch_num} Training Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")
    return avg_loss, avg_acc

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt',
    device='cuda'
):
    """
    Validates the model for one epoch.

    Args:
        model (nn.Module): The neural network model.
        loader (DataLoader): DataLoader for validation data.
        loss_fn (nn.Module): Loss function.
        epoch_num (int, optional): Current epoch number for display. Defaults to -1.
        best_so_far (float, optional): Best validation accuracy so far. Defaults to 0.0.
        ckpt_path (str, optional): Path to save the best model checkpoint. Defaults to 'best.pt'.
        device (str, optional): Device to run validation on. Defaults to 'cpu'.

    Returns:
        float: Updated best validation accuracy.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )

    val_loss = 0.0
    correct = 0
    total = 0
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for i, batch in loop:
            # Unpack the batch
            labels, texts, offsets, scores= batch

            # Move data to the specified device
            labels = labels.to(device)
            texts = texts.to(device)
            offsets = offsets.to(device)
            # If 'scores' and 'helpfulness' are used, move them as well
            scores = scores.to(device) if isinstance(scores, torch.Tensor) else scores
        

            # Forward pass
            outputs = model(texts, offsets)
            #outputs = outputs.unsqueeze(0)
            # Calculate loss
            loss = loss_fn(outputs, labels)

            # Accumulate loss
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Update progress bar
            loop.set_postfix({"loss": val_loss / total, "acc": correct / total})

    avg_loss = val_loss / total
    avg_acc = correct / total
    print(f"Epoch {epoch_num} Validation Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")

    # Save the model if validation accuracy improved
    if avg_acc > best_so_far:
        best_so_far = avg_acc
        torch.save(model.state_dict(), ckpt_path)
        print(f"New best model saved with accuracy: {best_so_far:.4f}")

    return best_so_far


In [202]:
import torch.optim as optim
epochs = 15  # Set the number of training epochs
model = TextClassificationModel(vocab_size=len(vocab), embed_dim=128, num_classes=50).to(device)
#model = Neuraln(vocab_size = len(vocab), embed_dim=128, hidden_dim = 8, num_classes = len(train_preprocessed['Category'].unique())).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

loss_fn = nn.CrossEntropyLoss()  # Loss function for multi-class classification

In [203]:
best = -float('inf')
for epoch in range(epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    best = val_one_epoch(model, val_dataloader, loss_fn, epoch, best_so_far=best)

Epoch 0: train:   0%|          | 0/116 [00:00<?, ?it/s]

ValueError: expected sequence of length 2 at dim 1 (got 4)

In [None]:
#prediction
def collate_batch(batch):
    label_list, text_list, score_list, offsets = [], [], [], [0]
    for i in range(len(batch)):
        _score, _text = batch[i][1], batch[i][0], batch[i][2]
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(processed_text)
        score_list.append(_score)
        
        offsets.append(processed_text.size(0))
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    score_list = torch.tensor(score_list, dtype=torch.float64)
    


    return text_list.to(device), offsets.to(device), score_list.to(device)

test_dataloader = DataLoader(
    test_preprocessed.to_numpy(), batch_size=1, shuffle=False, collate_fn=collate_batch
)


In [None]:
def predict(
    model,
    loader,
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions:",
        leave=True,
    )
    predictions = []
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, offsets, scores, helpfulness = batch

            # forward pass and loss calculation
            outputs = model(texts, offsets)
            #outputs = outputs.unsqueeze(0)
            _, predicted = torch.max(outputs.data, 1)
            predictions += predicted.detach().cpu().tolist()

    return predictions

In [None]:
predictions = predict(model, test_dataloader)
len(predictions)

In [13]:
cross_valid = cross_validate(pipeline_multiout, 
                             X_train, y_train, 
                             cv = 5, scoring = ["accuracy"], n_jobs = -1)
print("test_accuracy:", cross_valid["test_accuracy"].mean())

test_accuracy: 0.23363639688402885


In [14]:
y_pred = cross_val_predict(pipeline_multiout, X_train, y_train, cv = 2)

In [15]:
# Посмотрим на различные метрики
print(classification_report(y_train, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.88      0.41      0.56       661
           1       0.50      0.01      0.01       270
           2       0.76      0.31      0.45       486
           3       0.90      0.19      0.31       289
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00        44
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        27
           8       0.00      0.00      0.00       109
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00        76
          11       0.00      0.00      0.00        87
          12       0.95      0.51      0.67       491
          13       0.00      0.00      0.00        29
          14       0.00      0.00      0.00        62
          15       0.00      0.00      0.00        66
          16       0.00      0.00      0.00       166
          17       0.00    

In [16]:
# Посмотрим на целевую метрику
accuracy_score(y_train, y_pred)

0.19226608977825851

###  Тренировка окончательной модели

In [17]:
pipeline_multiout.fit(X_train, y_train)

##  Предсказание и загрузка решения

In [18]:
pred_test = pipeline_multiout.predict(df_test[["text"]].astype("str"))

In [19]:
res = pd.DataFrame(np.hstack([df_test["index"].values.reshape(df_test.shape[0], 1), pred_test]),
                  columns = ["index"]+[f"{i}" for i in range(50)])

In [26]:
type(res['index'][1])

numpy.int64

In [20]:
res.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:

res["target"] = res.drop('index',axis=1).apply(lambda r: r.index[r.ne(0)].to_list(), axis=1)


res

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,41,42,43,44,45,46,47,48,49,target
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[12]
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[2]
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[0]
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9010,3523,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9011,24925,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9012,6327,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9013,530,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]


In [22]:
f=open('submission.csv', 'w')
f.write("index,target")
f.write("\n")
for i in range(len(res)):
    
    f.write(str(res['index'][i]))
    f.write(',')
    for x in res['target'][i]:
        f.write(str(x))
        f.write(' ')
    f.write("\n")

f.close()

In [23]:
res.iloc[:, 1:].sum()

0                                                       706
1                                                        87
2                                                       718
3                                                       243
4                                                         0
5                                                         0
6                                                         0
7                                                         0
8                                                         0
9                                                         0
10                                                        0
11                                                       29
12                                                      831
13                                                        0
14                                                        0
15                                                        0
16                                      

In [24]:
res["0"].value_counts()

0
0    8309
1     706
Name: count, dtype: int64

In [25]:
type(res['index'][1])

numpy.int64