In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

In [2]:
# Raw training table; in the cells shown here only its `assessment` column is used.
train_data = pd.read_csv("../../data/original/train.csv.csv", index_col=0)
assessment = train_data['assessment']

# Pre-processed artefacts (texts, tag features, targets) read from data/processed.
# NOTE(review): these frames are later stacked with `assessment` purely by row
# position (via `.values`), so they presumably share the row order of the raw
# training table - confirm against the notebook that produced them.
text = pd.read_csv("../../data/processed/text_df.csv", index_col=0)
tags = pd.read_csv("../../data/processed/tags_df.csv", index_col=0)
targets = pd.read_csv("../../data/processed/target_df.csv", index_col=0)

In [3]:
# Missing values become empty strings before the texts are tokenised.
text = text.fillna("")

In [7]:
import torch
from transformers import AutoModel, AutoTokenizer

In [8]:
# Load the pretrained "cointegrated/rubert-tiny" checkpoint and its matching tokenizer.
# Both objects are passed to `embed_bert_cls` to turn texts into fixed-size vectors.
rubert = AutoModel.from_pretrained("cointegrated/rubert-tiny")
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

In [9]:
def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` as the normalised first-token vector of ``model``.

    Args:
        text: Input handed to ``tokenizer`` (called with padding and
            truncation enabled, returning PyTorch tensors).
        model: Callable model exposing ``.device`` whose output has a
            ``last_hidden_state`` tensor indexed as (batch, token, hidden).
        tokenizer: Callable returning a mapping of tensors accepted by ``model``.

    Returns:
        1-D NumPy array: the normalised hidden state at token position 0
        (the [CLS] token for BERT-style tokenizers) of the FIRST sequence
        in the batch only.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        # Move each tokenizer tensor to the device the model lives on.
        model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        outputs = model(**model_inputs)
    # Token position 0 of every sequence, then normalise each row vector.
    first_token_states = outputs.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_states)
    # Only the first row of the batch is returned to the caller.
    return unit_vectors[0].cpu().numpy()

In [13]:
# Embed every training text: one 312-dimensional vector per row, index 0..n-1.
#
# The comprehension variable is named `review_text`, not `text`: a `for` loop
# variable called `text` would rebind the notebook-level `text` DataFrame to the
# last string it iterated over, so this cell could not be re-run and any later
# use of `text` as a DataFrame would break.
#
# All vectors are collected first and the frame is built once, instead of
# enlarging a DataFrame row by row with `.loc[i] = ...`.
train_embeddings = pd.DataFrame(
    [embed_bert_cls(review_text, rubert, tokenizer) for review_text in text['text']],
    columns=range(312),
)
train_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.090183,0.061351,-0.065949,-0.099538,0.01596,0.069783,-0.066221,0.006863,-0.121204,-0.015498,...,0.00703,-0.034012,0.00774,0.014849,0.04782,0.027915,-0.000539,-0.001336,0.016548,-0.071149
1,-0.021558,0.00223,0.016219,-0.051915,-0.026662,0.040939,0.061662,-0.01569,-0.009321,-0.069248,...,0.01026,0.009149,-0.021632,-0.024911,-0.001464,-0.03434,-0.053662,0.107122,0.102947,-0.071786
2,0.016584,-0.033119,-0.023893,-0.068441,0.04383,0.015474,0.01301,-0.007507,0.006783,-0.09778,...,-0.036983,0.089537,0.04848,0.039076,0.023839,-0.048955,-0.057562,0.104591,0.140778,0.015352
3,0.013997,0.012985,-0.005958,-0.062241,0.006339,0.007202,-0.035187,0.022503,0.002928,-0.024576,...,0.010641,0.13041,-0.03453,0.018799,0.082025,-0.061748,-0.042123,0.059411,0.053553,-0.033946
4,0.017832,0.000991,-0.070902,-0.108358,0.006684,0.037815,-0.020891,-0.015415,-0.015879,0.004351,...,0.021067,-0.118142,-0.056713,-0.011428,-0.001978,0.040969,0.091797,0.002272,0.000739,-0.036165


In [14]:
# Feature matrix 1: assessment column followed by the text embeddings.
# Each part is re-wrapped from its raw values so that pd.concat stacks the
# columns by row position (fresh 0..n-1 index) rather than by source index.
final_train_data_1 = pd.concat(
    [pd.DataFrame(part.values) for part in (assessment, train_embeddings)],
    axis=1,
).values

# Feature matrix 2: assessment, then the tag columns, then the text embeddings.
final_train_data_2 = pd.concat(
    [pd.DataFrame(part.values) for part in (assessment, tags, train_embeddings)],
    axis=1,
).values

In [17]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC

In [19]:
# Model 1: an SVC with default hyper-parameters wrapped in MultiOutputClassifier,
# trained on assessment + text embeddings against every column of `targets`.
model_1 = MultiOutputClassifier(SVC())
model_1.fit(final_train_data_1, targets.values)

In [20]:
# Model 2: same estimator set-up as model 1, but the feature matrix additionally
# contains the tag columns (assessment + tags + text embeddings).
model_2 = MultiOutputClassifier(SVC())
model_2.fit(final_train_data_2, targets.values)

In [25]:
# Raw test table; its `assessment` column is the first feature of the test matrix.
test_data = pd.read_csv("../../data/original/test.csv.csv", index_col=0)
test_assessment = test_data['assessment']

# Preview the first rows (columns shown in the output: index, assessment, tags, text).
test_data.head()

Unnamed: 0,index,assessment,tags,text
1,3135,3.0,{DELIVERY},"Последнее время думаю плохо, сроки доставки да..."
3,4655,2.0,"{PRICE,DELIVERY,ASSORTMENT}",Цены намного выше магазинных но радуют акции
5,22118,2.0,"{CATALOG_NAVIGATION,ASSORTMENT,DELIVERY}","Доставка за [NUM] минут, заказ даже не начали ..."
7,23511,0.0,{DELIVERY},Ужасно долгая доставка
8,45,6.0,"{ASSORTMENT,PROMOTIONS}",Добрый вечер! Вы большие молодцы. Меня всё уст...


In [27]:
import re
from bs4 import BeautifulSoup


def preprocessor(text):
    """Clean a raw review string and return it lower-cased.

    Steps, applied in this exact order:
      1. strip HTML markup, keeping only the visible text;
      2. remove links (``http`` followed by non-whitespace);
      3. remove runs of digits;
      4. collapse every whitespace run into a single space;
      5. remove every character that is not a Latin/Cyrillic letter, a digit,
         one of ``. , ? !`` or whitespace;
      6. lower-case the result.

    NOTE(review): the Cyrillic ranges ``а-я`` / ``А-Я`` do not contain
    ``ё`` / ``Ё``, so those letters are deleted in step 5 (e.g. "всё" -> "вс").
    Left unchanged here; confirm how the training texts were cleaned before
    changing it, so train and test stay consistent.
    """
    # Drop HTML markup.
    cleaned = BeautifulSoup(text, "html.parser").get_text()

    # Order-sensitive (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+', ''),                     # links
        (r'\d+', ''),                         # numbers
        (r'\s+', ' '),                        # redundant whitespace
        (r'[^a-zA-Zа-яА-Я0-9\.,?!\s]', ''),   # everything except letters/punctuation
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower()

In [29]:
# Build the cleaned test texts in one chain: take the raw `text` values (fresh
# 0..n-1 index), turn missing entries into empty strings, then run the same
# `preprocessor` over every entry.
test_text = (
    pd.DataFrame({'text': test_data['text'].values})
    .fillna("")
    .assign(text=lambda frame: frame['text'].apply(preprocessor))
)
test_text.head()

  text = BeautifulSoup(text, "html.parser").get_text() # Удаляем HTML


Unnamed: 0,text
0,"последнее время думаю плохо, сроки доставки да..."
1,цены намного выше магазинных но радуют акции
2,"доставка за num минут, заказ даже не начали со..."
3,ужасно долгая доставка
4,добрый вечер! вы большие молодцы. меня вс устр...


In [31]:
# Embed every cleaned test text: one 312-dimensional vector per row, index 0..n-1.
#
# The comprehension variable is named `review_text` so that the notebook-level
# name `text` is not rebound by this cell, and the frame is built once from the
# collected vectors instead of being enlarged row by row with `.loc[i] = ...`.
test_embeddings = pd.DataFrame(
    [embed_bert_cls(review_text, rubert, tokenizer) for review_text in test_text['text']],
    columns=range(312),
)
test_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,302,303,304,305,306,307,308,309,310,311
0,0.057307,-0.004153,-0.07963,-0.080684,0.006273,-0.01297,-0.014975,-0.056541,0.030897,-0.0222,...,0.001296,-0.033419,0.007644,-0.01676,0.072395,-0.021912,0.007119,-0.058711,0.040889,-0.071289
1,0.023263,-0.01383,-0.075212,-0.063986,-0.013203,-0.011396,-0.053554,-0.031162,-0.063725,-0.025835,...,0.009899,0.064229,-0.07807,-0.027313,0.048697,-0.002139,0.032543,0.013602,0.024171,-0.090221
2,-0.020465,0.021005,-0.070391,-0.012247,-0.004633,0.027047,-0.0276,-0.135798,0.063297,-0.011361,...,0.001348,-0.010913,-0.061895,-0.031841,-0.004156,-0.013444,0.052412,0.053872,0.054484,-0.005813
3,-0.014342,-0.057608,0.011022,-0.058035,0.032176,0.057408,0.078957,0.005939,-0.00053,-0.117301,...,0.021731,0.106528,-0.005384,-0.030631,0.043506,-0.014857,-0.096142,0.095758,0.109256,-0.07021
4,0.063125,0.038107,-0.051438,-0.05392,0.014377,0.018012,-0.021769,-0.043809,-0.010061,-0.026171,...,-0.002401,-0.001011,-0.040269,-0.004657,0.011101,-0.007533,0.024891,0.045647,0.063856,-0.029596


In [32]:
# Test feature matrix for model 1: assessment column followed by the text embeddings.
# `test_assessment` is re-wrapped from its raw values to get a fresh 0..n-1 index;
# `test_embeddings` is passed as a frame and is expected to carry a 0..n-1 index
# already, so pd.concat lines the rows up by position.
final_test_data_1 = pd.concat([pd.DataFrame(test_assessment.values), pd.DataFrame(test_embeddings)], axis=1).values

In [35]:
# Predict every target column for the test rows with model 1 (assessment + embeddings).
prediction_1 = model_1.predict(final_test_data_1)

In [36]:
def get_class_labels(array):
  """Turn a 0/1 indicator matrix into space-separated class-index strings.

  Args:
    array: 2-D array-like with a ``shape`` attribute, indexable by row number;
      an entry equal to 1 marks the class at that column position as present.

  Returns:
    DataFrame with a single column ``class_labels``; row ``i`` holds the
    column positions where ``array[i] == 1`` joined by single spaces, or an
    empty string when the row contains no 1.
  """
  # Joining an empty index list yields "", which covers the "no class" case.
  labels_per_row = [
    " ".join(str(position) for position in np.where(array[row] == 1)[0])
    for row in range(array.shape[0])
  ]
  return pd.DataFrame({"class_labels": labels_per_row})

# Convert model 1's indicator predictions into one label string per test row.
answer_1 = get_class_labels(prediction_1)

In [39]:
# Re-key the label strings by the `index` column of the test table, name the single
# column `target`, and write the result as submission file 2-1.
answer_1 = pd.DataFrame(answer_1.values, index=test_data['index'].values, columns=['target'])
answer_1.to_csv('../../results/submissions/2-1.csv')

In [43]:
# Show only the test rows for which at least one class was predicted.
answer_1.loc[answer_1['target'].ne('')]

Unnamed: 0,target
20516,2
4275,0
21056,19
15438,19
22224,19
...,...
23453,0
3783,19
7869,12
8150,12
