In [161]:
import pathlib
import random
import sys

import nltk
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Paths are anchored at the current working directory of the kernel.
ROOT_DIR = pathlib.Path.cwd()
DATA_DIR = ROOT_DIR.joinpath("data")

# Seed value intended for the stochastic steps of the notebook.
RANDOM_SEED = 42

## Загрузка и обзор данных

In [162]:
# Read the three input tables from DATA_DIR: the trend descriptions,
# the labelled training set and the unlabelled test set.
df_trends, df, df_test = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("trends_description.csv", "train.csv", "test.csv")
)

In [163]:
# Preview the first five rows of the training table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","–ú–∞–ª–µ–Ω—å–∫–∏–π –≤—ã–±–æ—Ä —Ç–æ–≤–∞—Ä–æ–≤, —Ö–æ—Ç–µ–ª–æ—Å—å –±—ã –∞—Å—Å–æ—Ä—Ç–∏–º–µ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",–ë—ã—Å—Ç—Ä–æ,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",–î–æ—Å—Ç–∞–≤–∫–∞ –ø–æ—Å—Ç–æ—è–Ω–Ω–æ –∑–∞–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",–ù–∞—Ü–µ–Ω–∫–∞ –∏ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç —Ä–∞—Å—Å—Ç—Ä–∞–∏–≤–∞—é—Ç,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",–ú–æ–∂–Ω–æ –Ω–µ–º–Ω–æ–≥–æ —Å–∫–∏–Ω—É—Ç—å –º–∏–Ω–∏–º–∞–ª—å–Ω—É—é —Å—É–º–º—É –∑–∞–∫–∞–∑–∞...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

### Предобработка данных

In [164]:
# Same five-row preview of the training table as in the data-overview section,
# repeated here before the features and targets are selected.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","–ú–∞–ª–µ–Ω—å–∫–∏–π –≤—ã–±–æ—Ä —Ç–æ–≤–∞—Ä–æ–≤, —Ö–æ—Ç–µ–ª–æ—Å—å –±—ã –∞—Å—Å–æ—Ä—Ç–∏–º–µ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",–ë—ã—Å—Ç—Ä–æ,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",–î–æ—Å—Ç–∞–≤–∫–∞ –ø–æ—Å—Ç–æ—è–Ω–Ω–æ –∑–∞–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",–ù–∞—Ü–µ–Ω–∫–∞ –∏ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç —Ä–∞—Å—Å—Ç—Ä–∞–∏–≤–∞—é—Ç,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",–ú–æ–∂–Ω–æ –Ω–µ–º–Ω–æ–≥–æ —Å–∫–∏–Ω—É—Ç—å –º–∏–Ω–∏–º–∞–ª—å–Ω—É—é —Å—É–º–º—É –∑–∞–∫–∞–∑–∞...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
# Features are the raw review text; targets are the 50 `trend_id_res*` columns.
N_TRENDS = 50
target_cols = [f"trend_id_res{i}" for i in range(N_TRENDS)]

# Cast to str so that missing texts (NaN) do not break the text vectorizer.
X = df[["text"]].astype("str")
y = df[target_cols]

# 80/20 hold-out split, seeded from the shared RANDOM_SEED constant instead of
# a second hard-coded literal so the seed is controlled in one place.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=RANDOM_SEED
)
print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_val.shape is {X_val.shape}")
print(f"y_val.shape is {y_val.shape}")

# Apply the same str cast to the test texts so X_test is consistent with X.
X_test = df_test[["text"]].astype("str")
print(f"X_test.shape is {X_test.shape}")

# trends_description.csv is already loaded as df_trends; reuse it (as an
# independent copy) instead of reading the same file from disk again.
trend_info = df_trends.copy()


X_train.shape is (3698, 1)
y_train.shape is (3698, 50)
X_val.shape is (925, 1)
y_val.shape is (925, 50)
X_test.shape is (9015, 1)


In [93]:

# Trend names from the description table, in file order.
categories = list(trend_info['trend'])
categories

['–î–æ–ª–≥–∞—è –¥–æ—Å—Ç–∞–≤–∫–∞',
 '–î–æ—Å—Ç–∞–≤–∫–∞ —Å—Ç–∞–ª–∞ –¥–æ–ª–≥–æ–π',
 '–í—Ä–µ–º—è –¥–æ—Å—Ç–∞–≤–∫–∏ –Ω–µ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç –∑–∞—è–≤–ª–µ–Ω–æ–º—É',
 '–†–µ–≥—É–ª—è—Ä–Ω—ã–µ –æ–ø–æ–∑–¥–∞–Ω–∏—è',
 '–ù–µ –æ—Ç—Å–ª–µ–¥–∏—Ç—å —Ä–µ–∞–ª—å–Ω–æ–µ –≤—Ä–µ–º—è –¥–æ—Å—Ç–∞–≤–∫–∏',
 '–ö—É—Ä—å–µ—Ä –Ω–∞ –∫–∞—Ä—Ç–µ',
 '–ù–µ—Ç –¥–æ—Å—Ç–∞–≤–∫–∏ –ø–æ –∞–¥—Ä–µ—Å—É',
 '–ù–µ –ø—Ä–µ–¥—É–ø—Ä–µ–∂–¥–∞–µ–º –æ–± —É–¥–∞–ª–µ–Ω–∏–∏ —Ç–æ–≤–∞—Ä–∞',
 '–í—ã—Å–æ–∫–∞—è –º–∏–Ω–∏–º–∞–ª—å–Ω–∞—è —Å—É–º–º–∞ –∑–∞–∫–∞–∑–∞',
 '–°—É–º–º–∞ –∑–∞–∫–∞–∑–∞ –º–µ–Ω—è–µ—Ç—Å—è –≤–æ –≤—Ä–µ–º—è –Ω–∞–±–æ—Ä–∞ –∫–æ—Ä–∑–∏–Ω—ã',
 '–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è —Å—É–º–º–∞ –∑–∞–∫–∞–∑–∞',
 '–¢–æ–≤–∞—Ä—ã —Å –ø–æ–¥—Ö–æ–¥—è—â–∏–º —Å—Ä–æ–∫–æ–º –≥–æ–¥–Ω–æ—Å—Ç–∏',
 '–í—ã—Å–æ–∫–∏–µ —Ü–µ–Ω—ã',
 '–ù–µ –¥–æ–≤–µ–∑–ª–∏ —Ç–æ–≤–∞—Ä',
 '–¢–æ–≤–∞—Ä –∏—Å–ø–æ—Ä—á–µ–Ω –≤–æ –≤—Ä–µ–º—è –¥–æ—Å—Ç–∞–≤–∫–∏',
 '–ü—Ä–æ—Å—Ä–æ—á–µ–Ω–Ω—ã–µ —Ç–æ–≤–∞—Ä—ã',
 '–ó–∞–º–µ—á–∞–Ω–∏—è –ø–æ —Ä–∞–±–æ—Ç–µ –∫—É—Ä—å–µ—Ä–æ–≤',
 '–ù–µ —á–∏—Ç–∞–µ–º –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏',
 '–°–ø–∞—Å–∏–±–æ',


In [94]:
# Display the training features (the rendered table is truncated by pandas).
X_train

Unnamed: 0,text
1538,"–ù—É, –∑–∞ [NUM]—á. –∏ [NUM] –º–∏–Ω. –º–Ω–µ –µ—â—ë –Ω–∏–∫–æ–≥–¥–∞ –Ω–µ..."
2991,–î–æ—Å—Ç–∞–≤–∫–∞ –≤—Å–µ–≥–¥–∞ –æ—Å—É—â–µ—Å—Ç–≤–ª—è–µ—Ç—Å—è –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ –¥–æ–ª...
2812,–ó–∞–¥–µ—Ä–∂–∫–∞ –¥–æ—Å—Ç–∞–≤–∫–∏
4515,"–û—Ç–ª–∏—á–Ω—ã–π —Å–µ—Ä–≤–∏—Å, —Ç–æ–ª—å–∫–æ –±—ã –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç —Ä–∞—Å—à–∏—Ä–∏..."
4531,"–ü–æ–¥–¥–µ—Ä–∂–∫–∞ –≥–æ–≤–Ω–æ, –∫—É—Ä—å–µ—Ä—ã –æ–ø–∞–∑–¥—ã–≤–∞—é—Ç –º–∏–Ω—É—Ç –Ω–∞ [..."
...,...
4426,"+ –±—ã—Å—Ç—Ä–æ. - –∏–Ω–æ–≥–¥–∞ —Å—É–º–º–∞ –∑–∞–∫–∞–∑–∞ –æ—á–µ–Ω—å –≤–µ–ª–∏–∫–∞, ..."
466,ü¶â
3092,–î–æ —Å–∞–º–æ–∫–∞—Ç–∞ —è —Ç—Ä–∞—Ç–∏–ª –º–µ–Ω—å—à–µ –¥–µ–Ω–µ–≥ –≤ –¥–µ–Ω—å
3772,–û –≤–∞—Å —Ä–µ–¥–∫–æ –¥—É–º–∞—é. –ù–∞–ø—Ä—è–≥–∞–µ—Ç –°–¢–ú. –ö–æ–≥–¥–∞ –Ω–µ–ø–æ–Ω—è...


In [98]:
# Exploratory word-level TF-IDF: uni- to tri-grams, case preserved, keeping only
# terms that occur in at least 2% of the training texts (capped at 1000 terms).
tfidf_params = dict(
    ngram_range=(1, 3),
    lowercase=False,
    min_df=0.02,
    max_features=1000,
)
vectorizer = TfidfVectorizer(**tfidf_params)
vectors = vectorizer.fit_transform(X_train['text'])
vectors.shape

(3698, 99)

In [100]:
# Show the terms (n-grams) retained by the fitted TF-IDF vectorizer.
vectorizer.get_feature_names_out()


array(['NUM', 'NUM NUM', 'NUM NUM –º–∏–Ω—É—Ç', 'NUM –º–∏–Ω', 'NUM –º–∏–Ω—É—Ç', '–í—Å–µ',
       '–î–æ–ª–≥–æ', '–î–æ—Å—Ç–∞–≤–∫–∞', '–ö—É—Ä—å–µ—Ä—ã', '–ù–µ', '–û—á–µ–Ω—å', '–û—á–µ–Ω—å –¥–æ–ª–≥–∞—è',
       '–û—á–µ–Ω—å –¥–æ–ª–≥–∞—è –¥–æ—Å—Ç–∞–≤–∫–∞', '–û—á–µ–Ω—å –¥–æ–ª–≥–æ', '–†–∞–Ω—å—à–µ', '–¶–µ–Ω—ã', '–ß–∞—Å—Ç–æ',
       '–∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç', '–±–æ–ª—å—à–µ', '–±—É–¥–µ—Ç', '–±—ã', '–±—ã–ª–æ', '–≤–∞—Å', '–≤—Ä–µ–º–µ–Ω–∏',
       '–≤—Ä–µ–º—è', '–≤—Ä–µ–º—è –¥–æ—Å—Ç–∞–≤–∫–∏', '–≤—Å–µ', '–≤—Å–µ–≥–¥–∞', '–≤—ã', '–≤—ã—à–µ',
       '–≥–æ–¥–Ω–æ—Å—Ç–∏', '–¥–æ', '–¥–æ–ª–≥–∞—è', '–¥–æ–ª–≥–∞—è –¥–æ—Å—Ç–∞–≤–∫–∞', '–¥–æ–ª–≥–æ', '–¥–æ–ª—å—à–µ',
       '–¥–æ—Å—Ç–∞–≤–∫–∞', '–¥–æ—Å—Ç–∞–≤–∫–∏', '–¥–æ—Å—Ç–∞–≤–∫—É', '–¥–æ—Å—Ç–∞–≤–ª—è—é—Ç', '–µ—Å–ª–∏', '–∂–¥–∞—Ç—å',
       '–∂–µ', '–∑–∞', '–∑–∞ NUM', '–∑–∞ NUM –º–∏–Ω—É—Ç', '–∑–∞–∫–∞–∑', '–∑–∞–∫–∞–∑–∞', '–∑–∞–∫–∞–∑—ã',
       '–∏–∑', '–∏–ª–∏', '–∏–Ω–æ–≥–¥–∞', '–∫–∞–∫', '–∫–∞—á–µ—Å—Ç–≤–æ', '–∫–æ–≥–¥–∞', '–∫—É—Ä—å–µ—Ä',
       '–∫—É—Ä—å–µ—Ä—ã', '–ª—É—á—à–µ', '

In [78]:
# Densify the sparse TF-IDF matrix for inspection.
# toarray() yields a regular numpy ndarray, whereas todense() yields the legacy
# np.matrix type, which numpy discourages and recent scikit-learn rejects.
dense_vectors = vectors.toarray()
dense_vectors.shape

(3698, 99)

In [195]:
# Multi-label baseline: character n-gram TF-IDF on the "text" column, followed by
# one RBF-kernel SVC per target column (MultiOutputClassifier fits them separately).
# SVC is imported together with the other imports in the first cell.
preprocessor = ColumnTransformer(
    [
        (
            # NOTE(review): "vetorizer" looks like a typo of "vectorizer". The name is
            # kept because it is part of the pipeline's parameter keys
            # (e.g. preprocessor__vetorizer__max_features).
            "vetorizer",
            TfidfVectorizer(
                analyzer="char_wb",
                ngram_range=(1, 3),
                lowercase=False,
                max_features=1000,
            ),
            "text",
        )
    ],
    # Columns other than "text" (if any) are forwarded unchanged.
    remainder="passthrough",
)

pipeline_multiout = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clf", MultiOutputClassifier(SVC(kernel='rbf', gamma=0.1, C=2))),
    ]
)
display(pipeline_multiout)

In [196]:
# 5-fold cross-validation of the pipeline on the training split, run on all cores.
# With multi-label targets, scikit-learn's "accuracy" is exact-match (subset) accuracy.
cross_valid = cross_validate(
    estimator=pipeline_multiout,
    X=X_train,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    n_jobs=-1,
)
mean_cv_accuracy = cross_valid["test_accuracy"].mean()
print("test_accuracy:", mean_cv_accuracy)

test_accuracy: 0.23363639688402885


In [197]:
# Out-of-fold predictions for every training row.
# NOTE(review): this uses 2 folds while the cross_validate cell uses 5, so the
# scores derived from y_pred are not directly comparable to that mean accuracy.
y_pred = cross_val_predict(
    pipeline_multiout,
    X_train,
    y_train,
    cv=2,
)

In [198]:
# Look at the various metrics: per-label precision, recall and F1 of the
# out-of-fold predictions (undefined ratios are reported as 0).
report = classification_report(y_train, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.41      0.56       661
           1       0.50      0.01      0.01       270
           2       0.76      0.31      0.45       486
           3       0.90      0.19      0.31       289
           4       0.00      0.00      0.00       108
           5       0.00      0.00      0.00        44
           6       0.00      0.00      0.00        16
           7       0.00      0.00      0.00        27
           8       0.00      0.00      0.00       109
           9       0.00      0.00      0.00         9
          10       0.00      0.00      0.00        76
          11       0.00      0.00      0.00        87
          12       0.95      0.51      0.67       491
          13       0.00      0.00      0.00        29
          14       0.00      0.00      0.00        62
          15       0.00      0.00      0.00        66
          16       0.00      0.00      0.00       166
          17       0.00    

In [199]:
# Look at the target metric: accuracy of the out-of-fold predictions.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.19226608977825851

### Тренировка окончательной модели

In [200]:
# Fit the pipeline on the training split.
# NOTE(review): the held-out X_val / y_val split is not used in the cells shown here;
# consider scoring on it, or refitting on all labelled data before predicting.
pipeline_multiout.fit(X=X_train, y=y_train)

## Предсказание и загрузка решения

In [201]:
# Predict the label matrix for the test texts (cast to str, as for training).
test_text = df_test[["text"]].astype("str")
pred_test = pipeline_multiout.predict(test_text)

In [202]:
# Assemble the result table: the test "index" column followed by one column
# per predicted label, named "0" .. "49".
index_column = df_test["index"].values.reshape(df_test.shape[0], 1)
label_names = [f"{i}" for i in range(50)]
res = pd.DataFrame(
    np.hstack([index_column, pred_test]),
    columns=["index"] + label_names,
)

In [203]:
# Preview the first five rows of the prediction table.
res.head(n=5)

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [204]:

# Collapse the per-label prediction columns into one list of predicted label
# names per row (an empty list when nothing was predicted).
# The label columns are selected explicitly instead of "everything except index":
# otherwise a re-run of this cell would treat the existing "target" column as a
# label too and append "target" to every list.
label_cols = [str(i) for i in range(50)]
res["target"] = res[label_cols].apply(lambda r: r.index[r.ne(0)].to_list(), axis=1)

res

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,41,42,43,44,45,46,47,48,49,target
0,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
1,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[12]
2,22118,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[2]
3,23511,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[0]
4,45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9010,3523,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9011,24925,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9012,6327,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]
9013,530,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,[]


In [205]:
# Write the submission file: a header line, then one "index,labels" line per row.
# The context manager closes the file even if a write fails part-way through.
with open('submission.csv', 'w') as f:
    f.write("index,target\n")
    # Iterate the two columns directly rather than looking up each row by label.
    for row_index, targets in zip(res['index'], res['target']):
        # Each label is followed by a single space (including the last one), so the
        # file stays byte-identical to what the previous loop produced.
        labels = "".join(f"{label} " for label in targets)
        f.write(f"{row_index},{labels}\n")

In [18]:
# Number of test rows predicted positive for each label.
# Only the label columns are summed: once the list-valued "target" column exists,
# res.iloc[:, 1:].sum() would also "sum" it by concatenating every row's list.
res[[str(i) for i in range(50)]].sum()

0                                                       750
1                                                       174
2                                                       706
3                                                       222
4                                                         0
5                                                         0
6                                                         0
7                                                         0
8                                                        23
9                                                         0
10                                                        0
11                                                       30
12                                                      743
13                                                        0
14                                                        0
15                                                        5
16                                      

In [19]:
# Distribution of predicted values for label "0" over the test rows.
res.loc[:, "0"].value_counts()

0
0    8265
1     750
Name: count, dtype: int64