In [1]:
from gensim.models.fasttext import FastText
from gensim.models import Word2Vec
from catboost import Pool, CatBoostClassifier
import pandas as pd
import re
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report,accuracy_score
import numpy as np



In [2]:
train_df = pd.read_csv("data/train_supervised_dataset.csv").fillna("")
test_df = pd.read_csv("data/test_dataset.csv")

In [3]:
test_df.head()

Unnamed: 0,id,name
0,0,"469-210 ЕРМАК Клей универсальный, 15мл, блистер"
1,1,Торт СЛАДУШКА Зимняя вишня 700г
2,2,"Смеситель ""CALORIE"" 1023 А06 д/кухни"
3,3,Лимон 50гр БАР
4,4,"Коньяк САРАДЖИШВИЛИ 5 лет 0,5л Грузия"


In [4]:
train_df.head()

Unnamed: 0,id,name,good,brand
0,0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax
1,1,87191 Бусы для елки шарики_87191,бусы,
2,2,Футболка Piazza Italia WR011446881,футболка,piazza italia
3,3,7) YI572-03X-ONE ЗАКОЛКА ДЛЯ ВОЛОС ДЛЯ ДЕВОЧКИ,заколка,
4,4,Одежда (вес) 1500,одежда,


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      25000 non-null  int64 
 1   name    25000 non-null  object
 2   good    25000 non-null  object
 3   brand   25000 non-null  object
dtypes: int64(1), object(3)
memory usage: 781.4+ KB


In [6]:
nu_brand = train_df["brand"].nunique()
nu_good = train_df["good"].nunique()

print(f"Количество неуникальных брендов: {nu_brand}")
print(f"Количество неуникальных товаров: {nu_good}")

Количество неуникальных брендов: 6976
Количество неуникальных товаров: 2820


In [7]:
# Создадим функцию очистки текста
def clean_text(text):
    text = text.lower()
    #text = re.sub(r'[^\sa-zA-Z0-9@\[\]]',' ',text) # Удаляет пунктцацию
    text = re.sub(r'\w*\d+\w*', '', text) # Удаляет цифры
    text = re.sub(r'[^\w\s]', ' ', text) # Удаляет знаки
    text = re.sub(r'\b\S{1}\b', '', text) # Удаляет слова из 1-й буквы
    text = re.sub(r'\b\S{2}\b', '', text) # Удаляет слова из 2-х букв
    text = re.sub('\s{2,}', " ", text) # Удаляет ненужные пробелы
    return text

# Применяем ее к тексту
train_df['name'] = train_df['name'].apply(clean_text)

In [8]:
train_df

Unnamed: 0,id,name,good,brand
0,0,petmax бантик леопард красн розой,бантик,petmax
1,1,бусы для елки,бусы,
2,2,футболка piazza italia,футболка,piazza italia
3,3,one заколка для волос для девочки,заколка,
4,4,одежда вес,одежда,
...,...,...,...,...
24995,24995,вода саирме,вода,sairme
24996,24996,моя семя ассортим,,моя семья
24997,24997,рулет бисквитн яшкино клубничный слив,рулет,яшкино
24998,24998,почвогрунт цветочное счастье фаско декоративн...,почвогрунт,фаско


In [14]:
train_df['br'] = train_df.apply(lambda x: x["brand"] + "ы" if(x["brand"] == 'petmax') else 1, axis=1)

In [15]:
train_df

Unnamed: 0,id,name,good,brand,br
0,0,petmax бантик леопард красн розой,бантик,petmax,petmaxы
1,1,бусы для елки,бусы,,1
2,2,футболка piazza italia,футболка,piazza italia,1
3,3,one заколка для волос для девочки,заколка,,1
4,4,одежда вес,одежда,,1
...,...,...,...,...,...
24995,24995,вода саирме,вода,sairme,1
24996,24996,моя семя ассортим,,моя семья,1
24997,24997,рулет бисквитн яшкино клубничный слив,рулет,яшкино,1
24998,24998,почвогрунт цветочное счастье фаско декоративн...,почвогрунт,фаско,1


In [9]:
#names = pd.concat((train_df[["name"]], test_df)).reset_index(drop=True)


In [10]:
train_df["tokens"] = train_df["name"].str.lower().str.split()

In [11]:
def apply_bio_tagging(row):
    """
    По токенам чека и разметке (то есть выделенным товарам и брендам) строим BIO-теги
    """
    tokens = row["tokens"]
    good = row["good"].split(',')[0].split()
    brand = row["brand"].split(',')[0].split()
    tags = ['O'] * len(tokens)
    for i, token in enumerate(tokens):
        if len(good) > 0 and tokens[i:i + len(good)] == good:
            tags[i] = "B-GOOD"
            for j in range(i + 1, i + len(good)):
                tags[j] = "I-GOOD"
        if len(brand) > 0 and tokens[i:i + len(brand)] == brand:
            tags[i] = "B-BRAND"
            for j in range(i + 1, i + len(brand)):
                tags[j] = "I-BRAND"
    return tags

In [12]:
train_df["tags"] = train_df.apply(apply_bio_tagging, axis=1)

In [13]:
train_df.head()

Unnamed: 0,id,name,good,brand,tokens,tags
0,0,petmax бантик леопард красн розой,бантик,petmax,"[petmax, бантик, леопард, красн, розой]","[B-BRAND, B-GOOD, O, O, O]"
1,1,бусы для елки,бусы,,"[бусы, для, елки]","[B-GOOD, O, O]"
2,2,футболка piazza italia,футболка,piazza italia,"[футболка, piazza, italia]","[B-GOOD, B-BRAND, I-BRAND]"
3,3,one заколка для волос для девочки,заколка,,"[one, заколка, для, волос, для, девочки]","[O, B-GOOD, O, O, O, O]"
4,4,одежда вес,одежда,,"[одежда, вес]","[B-GOOD, O]"


In [14]:
tags = []
for tag in train_df['tags'].to_list():
    tags.extend(tag)
print('Entities in our data set')
set(tags)

Entities in our data set


{'B-BRAND', 'B-GOOD', 'I-BRAND', 'I-GOOD', 'O'}

In [15]:
w2v_model = FastText.load("fst.model")

In [16]:

words = list(w2v_model.wv.index_to_key)

In [17]:
words[0:10]

['для',
 'черный',
 'напиток',
 'белый',
 'пиво',
 'набор',
 'арт',
 'вода',
 'сыр',
 'бзмж']

In [18]:
def to_series(column):
    token_list = column.to_list()
    flat_list = [item for sublist in token_list for item in sublist]
    tokens_series = pd.Series(flat_list)
    return tokens_series

In [19]:
def make_datasets(data):
    tags = to_series(train_df["tags"])
    tokenss = to_series(train_df["tokens"])
    dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))
    dtv_train = pd.concat([pd.DataFrame(dataset.to_list()), 
           pd.DataFrame(tags)], axis=1, ignore_index=True)
    dtv_train.rename(columns={300:"labels"}, inplace=True)

    good_train = dtv_train.loc[dtv_train['labels'].isin(["B-GOOD", "I-GOOD", "O"])]
    brand_train = dtv_train.loc[dtv_train['labels'].isin(["B-BRAND", "I-BRAND", "O"])]
    
    
    return good_train, brand_train

In [20]:
good_train, brand_train = make_datasets(train_df)

  dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))


In [21]:
good_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,labels
1,-0.315531,-0.481765,-0.65254,0.463447,-0.122203,-0.480863,-0.137293,-0.003325,-0.419353,-0.667099,...,0.288909,-0.260036,-0.304667,0.510278,-0.332718,-0.203781,0.333719,-0.382257,0.28685,B-GOOD
2,0.016791,-0.180934,-0.818971,-0.219705,0.090976,0.283494,-0.335978,-0.051116,0.031835,0.027362,...,-0.045046,-0.294219,-0.170586,-0.594982,0.060734,0.167994,0.199121,-0.111107,0.212782,O
3,-0.534205,0.190236,-0.244919,-0.081599,-0.028301,-0.480327,-0.667193,0.76296,0.306905,-0.337797,...,-0.062866,-0.007137,0.465703,-0.3719,-0.231565,0.074749,0.122957,0.204606,0.577125,O
4,-0.615371,0.064413,-0.611597,-0.213545,-0.000231,-0.084669,-0.125738,-0.710554,0.326323,0.214496,...,0.571753,-0.572989,0.27339,-0.057201,-0.458437,0.784217,0.139628,-0.054242,-0.579743,O
5,-0.039074,0.198955,0.123389,0.595596,-0.388866,-0.2813,-0.065837,0.157076,-0.674399,-0.68191,...,0.502278,-0.383531,0.193589,-0.285847,0.329078,0.393911,0.632338,-0.315968,-0.144979,B-GOOD


In [22]:
'''
bb_sum = dtv_train[dtv_train["labels"] == "B-GOOD"]
bb = bb_sum["labels"].value_counts()

bi_sum = dtv_train[dtv_train["labels"] == "I-GOOD"]
bi = bi_sum["labels"].value_counts()

o_sum = dtv_train[dtv_train["labels"] == "O"]
o = o_sum["labels"].value_counts()

print(f"B_BRAND: {bb}")
print(f"I_BRAND: {bi}")
print(f"O: {o}")
'''



'\nbb_sum = dtv_train[dtv_train["labels"] == "B-GOOD"]\nbb = bb_sum["labels"].value_counts()\n\nbi_sum = dtv_train[dtv_train["labels"] == "I-GOOD"]\nbi = bi_sum["labels"].value_counts()\n\no_sum = dtv_train[dtv_train["labels"] == "O"]\no = o_sum["labels"].value_counts()\n\nprint(f"B_BRAND: {bb}")\nprint(f"I_BRAND: {bi}")\nprint(f"O: {o}")\n'

In [23]:
#dataframe = dtv_train.reset_index(drop=True)

In [24]:
catboost_params = {
    "learning_rate": 0.035, 
    "n_estimators": 4000, 
    "subsample": 0.075, 
    "max_depth": 6,
    "l2_leaf_reg": 40,
    'task_type': 'GPU',
    'use_best_model': True,
    "verbose": 100,
    #class_weights = class_weights,
    "eval_metric":'TotalF1',
    "bootstrap_type": "Bernoulli",
    "loss_function": 'MultiClass'
    }

In [25]:
def modeling(good_ds, brand_ds, **catboost_params):
    X_good = good_ds.drop("labels", axis=1)
    y_good = good_ds.labels

    X_brand = brand_ds.drop("labels", axis=1)
    y_brand = brand_ds.labels

    X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(X_good, y_good, test_size=0.20)
    X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_brand, y_brand, test_size=0.20)

    train_dataset_g = Pool(data=X_train_g,
                    label=y_train_g)

    eval_dataset_g = Pool(data=X_test_g,
                    label=y_test_g)
    
    train_dataset_b = Pool(data=X_train_b,
                    label=y_train_b)

    eval_dataset_b = Pool(data=X_test_b,
                    label=y_test_b)
    
    model_good = CatBoostClassifier(**catboost_params)
    model_brand = CatBoostClassifier(**catboost_params)

    model_good.fit(train_dataset_g, eval_set=eval_dataset_g)
    model_brand.fit(train_dataset_b, eval_set=eval_dataset_b)

    pred_good = model_good.predict(X_test_g)
    pred_brand = model_brand.predict(X_test_b)

    return pred_good, y_test_g, pred_brand, y_test_b, model_good, model_brand


In [26]:
pred_good, y_test_g, pred_brand, y_test_b, model_good, model_brand = modeling(good_train, brand_train)

Learning rate set to 0.117502
0:	learn: 0.9877273	test: 0.9880601	best: 0.9880601 (0)	total: 241ms	remaining: 4m
1:	learn: 0.9019761	test: 0.9024980	best: 0.9024980 (1)	total: 405ms	remaining: 3m 22s
2:	learn: 0.8321732	test: 0.8333702	best: 0.8333702 (2)	total: 558ms	remaining: 3m 5s
3:	learn: 0.7746585	test: 0.7760567	best: 0.7760567 (3)	total: 724ms	remaining: 3m
4:	learn: 0.7280938	test: 0.7296577	best: 0.7296577 (4)	total: 881ms	remaining: 2m 55s
5:	learn: 0.6876252	test: 0.6897444	best: 0.6897444 (5)	total: 1.05s	remaining: 2m 53s
6:	learn: 0.6517312	test: 0.6537247	best: 0.6537247 (6)	total: 1.2s	remaining: 2m 50s
7:	learn: 0.6219734	test: 0.6241312	best: 0.6241312 (7)	total: 1.36s	remaining: 2m 48s
8:	learn: 0.5950112	test: 0.5976997	best: 0.5976997 (8)	total: 1.51s	remaining: 2m 46s
9:	learn: 0.5713242	test: 0.5743147	best: 0.5743147 (9)	total: 1.65s	remaining: 2m 43s
10:	learn: 0.5514158	test: 0.5545168	best: 0.5545168 (10)	total: 1.79s	remaining: 2m 40s
11:	learn: 0.5319833	

In [27]:
'''
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weights = dict(zip(classes, weights))
class_weights
'''


"\nclasses = np.unique(y)\nweights = compute_class_weight(class_weight='balanced', classes=classes, y=y)\nclass_weights = dict(zip(classes, weights))\nclass_weights\n"

In [28]:
print(classification_report(y_test_g,pred_good))

              precision    recall  f1-score   support

      B-GOOD       0.90      0.88      0.89      4553
      I-GOOD       0.32      0.09      0.14        66
           O       0.95      0.96      0.96     12516

    accuracy                           0.94     17135
   macro avg       0.72      0.64      0.66     17135
weighted avg       0.94      0.94      0.94     17135



In [29]:
print(classification_report(y_test_b, pred_brand))

              precision    recall  f1-score   support

     B-BRAND       0.83      0.70      0.76      2720
     I-BRAND       0.70      0.40      0.51       445
           O       0.92      0.97      0.94     12541

    accuracy                           0.91     15706
   macro avg       0.82      0.69      0.74     15706
weighted avg       0.90      0.91      0.90     15706



In [30]:
test_df = pd.read_csv("data/test_dataset.csv")
sub_df = pd.read_csv("data/sample_submission.csv")


In [31]:
test_df

Unnamed: 0,id,name
0,0,"469-210 ЕРМАК Клей универсальный, 15мл, блистер"
1,1,Торт СЛАДУШКА Зимняя вишня 700г
2,2,"Смеситель ""CALORIE"" 1023 А06 д/кухни"
3,3,Лимон 50гр БАР
4,4,"Коньяк САРАДЖИШВИЛИ 5 лет 0,5л Грузия"
...,...,...
4995,4995,"774352 Рамка 2П., сл. кость"
4996,4996,Энерг. напиток Red Bull 0.25л
4997,4997,36/025 Наконечники (т. никель) шт
4998,4998,Шоколад РиттерСпорт мол.с цел.миндалем 100г


In [32]:
sub_df

Unnamed: 0,id,good,brand
0,0,"товар1,товар2","бренд1,бренд2"
1,1,товар,бренд
2,2,,
3,3,,
4,4,,
...,...,...,...
4995,4995,,
4996,4996,,
4997,4997,,
4998,4998,,


In [33]:
test_df['name'] = test_df['name'].apply(clean_text)

In [34]:
test_df.head()

Unnamed: 0,id,name
0,0,ермак клей универсальный блистер
1,1,торт сладушка зимняя вишня
2,2,смеситель calorie кухни
3,3,лимон бар
4,4,коньяк сараджишвили лет грузия


In [35]:
test_df["tokens"] = test_df["name"].str.lower().str.split()

In [36]:
test_df.head()

Unnamed: 0,id,name,tokens
0,0,ермак клей универсальный блистер,"[ермак, клей, универсальный, блистер]"
1,1,торт сладушка зимняя вишня,"[торт, сладушка, зимняя, вишня]"
2,2,смеситель calorie кухни,"[смеситель, calorie, кухни]"
3,3,лимон бар,"[лимон, бар]"
4,4,коньяк сараджишвили лет грузия,"[коньяк, сараджишвили, лет, грузия]"


In [37]:
test_df.to_csv("test.csv")

In [59]:
def subm_good(x):
    B_G = []
    I_G = []
    
    count = 0
    for i in range(len(x["tokens"]) - 1):
        if count == 0 and (x["results"][i] == "B-GOOD"):
            B_G.append(x['tokens'][i])
            I_G.append("")
            count+=1
            
        elif str(x["results"][i]) == "I-GOOD":
            I_G.append(" " + x['tokens'][i])
            B_G.append("")

    print(B_G)
    df = pd.DataFrame({"b": B_G, "i": I_G})
    df["good"] = df["b"] + df["i"]
    s = df["good"].to_list()
    
    return "".join(s)

In [60]:
def subm_brand(x):
    B_B = []
    I_B = []

    count=0
    for i in range(len(x["tokens"]) - 1):
        if x["results"][i] == "B-BRAND" and count == 0:
            B_B.append(x['tokens'][i])
            I_B.append("")
            count+=1
        elif str(x["results"][i]) == "I-BRAND":
            I_B.append(" " + x['tokens'][i])
            B_B.append("")

    df = pd.DataFrame({"b": B_B, "i": I_B})
    df["brand"] = df["b"] + df["i"]
    s = df["brand"].to_list()
    
    return "".join(s)

In [61]:

def submission_good(test_df):
    results = []


    tokenss = to_series(test_df["tokens"])
    dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))
    dtv = pd.DataFrame(dataset.to_list())
    pred = model_good.predict(dtv)
    test_df['len'] = test_df["tokens"].apply(lambda x: len(x))
    
    pr = pred.copy()
    for i in range(len(test_df["len"])):
        results.append(pr[:test_df["len"][i]].flatten().tolist())
        idx = int(test_df["len"][i])
        pr =  pr[idx:]

    test_df["results"] = results 
    test_df["good"] = test_df.apply(lambda x: subm_good(x), axis=1)

    return test_df["good"]
    

In [62]:
def submission_brand(test_df):
    results = []


    tokenss = to_series(test_df["tokens"])
    dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))
    dtv = pd.DataFrame(dataset.to_list())
    pred = model_brand.predict(dtv)
    test_df['len'] = test_df["tokens"].apply(lambda x: len(x))
    
    pr = pred.copy()
    for i in range(len(test_df["len"])):
        results.append(pr[:test_df["len"][i]].flatten().tolist())
        idx = int(test_df["len"][i])
        pr =  pr[idx:]

    test_df["results"] = results 
    test_df["brand"] = test_df.apply(lambda x: subm_brand(x), axis=1)

    return test_df["brand"]

In [63]:
good = submission_good(test_df)
        

  dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))


['клей']
['торт']
['смеситель']
[]
['коньяк']
['пластина']
['рис']
['труба']
['консервы']
['одеяло']
[]
['пюре']
['сироп']
['крем']
[]
[]
[]
[]
['чипсы']
['трусы']
['леденцы']
['печенье']
['майонез']
['штукатурка']
['шайба']
['корм']
['саморез']
['томаты']
['саморез']
[]
['держатель']
['рюкзак']
['джинсы']
['консервы']
[]
['сок']
['куртка']
['серьги']
['пельмени']
['джеггинсы']
[]
['фильтр']
[]
[]
['платье']
['футболка']
[]
['антенна']
['болт']
['линолеум']
['болт']
['коврик']
['пиво']
['тархун']
['пенал']
['ремень']
['футболка']
['шоколад']
['молоко']
[]
[]
[]
['диск', '']
['рюкзак']
[]
['дезодорант']
['болт']
[]
[]
['огурцы']
['молоко']
['футболка']
['арахис']
['пленка']
['арахис']
[]
['лимонад']
['мармелад']
[]
['опора']
['молоко']
['конфеты']
['крем']
['напиток']
['соус']
[]
['одеяло']
[]
['носки']
['перец']
['сумка']
['адаптер']
['шприц']
['саморез']
['печенье']
[]
['мармелад']
['сок']
['вино']
['саморез']
['майонез']
['саморез']
['пряники']
['батончик']
[]
['гель']
[]
['цепь']
[]

In [64]:
brand = submission_brand(test_df)

  dataset = tokenss.apply(lambda x: w2v_model.wv.word_vec(x))


In [65]:
sub_df["good"] = good
sub_df["brand"] = brand

In [66]:
sub_df.to_csv("submission_all.csv")

In [46]:
def create_submit(row):
    good = []

    for i in row:
        try:
            word_array = w2v_model.wv.word_vec(i)
            dataset = pd.DataFrame(word_array.reshape((1, 300)))
            pred = model_good.predict(dataset)
            if pred[0][0] == "B-GOOD":
                good.append(i)
        except Exception as e:
            pass
    
    return ''.join(good)



In [47]:
def create_submit_brand(row):
    brand = []

    for i in row:
        try:
            word_array = w2v_model.wv.word_vec(i)
            dataset = pd.DataFrame(word_array.reshape((1, 300)))
            pred = model_brand.predict(dataset)
            if pred[0][0] == "B-BRAND":
                brand.append(i)
        except Exception as e:
            pass
    
    return ''.join(brand)

In [48]:
test_df["good"] = test_df["tokens"].apply(create_submit)

  word_array = w2v_model.wv.word_vec(i)


KeyboardInterrupt: 

In [None]:
test_df["brand"] = test_df["tokens"].apply(create_submit_brand)

In [None]:
test_df.info()

In [None]:
test_df

In [None]:
test_df.to_csv('br.csv')

In [None]:
sub_df['good'] = test_df["good"]

In [None]:
sub_df['brand'] = test_df["brand"]

In [None]:
sub_df.fillna('')

In [None]:
sub_df.to_csv("submission.csv")

In [None]:
s = w2v_model.wv.word_vec("штукатурка")