In [478]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords

from collections import Counter
from tqdm import tqdm

In [479]:
train = pd.read_parquet('data_fusion_train.parquet')
train = train.loc[train.category_id != -1, ].drop_duplicates(['item_name'])

# symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюя",
#            u"abvgdeejzijklmnoprstufhzcss_y_eua")

symbols = ("abcdefghijklmnopqrstuvwyz",
           "абкдефгхижклмнопкрстюввиз")
tr = {ord(a): ord(b) for a, b in zip(*symbols)}

train.item_name = train.item_name.str.lower().str.translate(tr).str.replace('x', 'кс').str.replace('[^\w\s]','')
train.shape

(48225, 9)

In [480]:
train.sample(10)

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
69544,154548,3,19:13,25 мм сверло по металлу р6м5 виз самара,3.0,6,6,107,
782432,1723628,1,17:54,винный игр напиток санто стефано сицилийская с...,1.0,12,1,0,
850858,1872742,3,18:29,вода триви 05л газ пэт,1.0,6,1,83,
4800148,496954,3,08:23,арахис беерка 30гр,1.0,5,-1,81,beerka
11603198,5394829,2,09:06,2 натрия хлорид5мл н10рр дин0,1.0,8,6,38,
810833,1786213,4,18:00,холодец домашний 420г,1.0,9,2,74,
1013532,2225624,4,15:12,хлеб28 пирожслоенсырнач80г,1.0,7,2,84,
1306545,2864667,2,18:14,смесь магги на второе дгреч покупеч том свин 41г,1.0,8,1,77,магги
33043744,2261392,5,23:48,салат мясной,2.0,14,-1,71,
60023,131981,4,15:39,чипсы лейс сметаназелень 80г,1.0,8,1,204,лейс


In [481]:
pd.concat([train.category_id.value_counts(normalize=True), train.category_id.value_counts(normalize=False)], axis=1).reset_index().tail(7)

Unnamed: 0,index,category_id,category_id.1
89,35,0.000518,25
90,26,0.000456,22
91,102,0.000394,19
92,101,0.000353,17
93,46,0.000311,15
94,100,0.00029,14
95,97,0.00027,13


In [482]:
# train.loc[train.item_name.str.find('мл') != -1]

In [483]:
# from collections import Counter
# from nltk.stem.snowball import SnowballStemmer 
# stemmer = SnowballStemmer("russian")

# Counter([stemmer.stem(word) for item in train.loc[train.category_id == 1, 'item_name'].tolist() 
#                             for word in tokenizer.tokenize(item.lower())])

In [484]:
tokenizer = nltk.WhitespaceTokenizer()

cat_words = dict()
for cat_id in np.sort(train.category_id.unique()):
    cat_words[cat_id] = set([word for item in train.loc[train.category_id == cat_id, 'item_name'].tolist() 
                                  for word in tokenizer.tokenize(item.lower())])

In [485]:
train.loc[train.category_id == 0, 'item_name'].shape

(2356,)

In [495]:
Counter([word[:4] for item in train.loc[train.category_id == 100, 'item_name'].tolist() 
                  for word in tokenizer.tokenize(item.lower()) if len(word) > 4]).most_common(10)

[('нако', 3),
 ('лпп6', 3),
 ('щитк', 2),
 ('футб', 2),
 ('супп', 2),
 ('бокс', 2),
 ('черн', 2),
 ('детс', 2),
 ('дс70', 1),
 ('гюар', 1)]

In [496]:
train.loc[train.category_id == 100, 'item_name']

39253                                     наколенники 2шт имп
343548      дс705м с щитки футбольные схин гюардс темносин...
1696844     сп2165430 с  щитки футбольные и нк кхрг грд го...
1956583     лпп639 м  суппорт колена кскнее сюппорт бежевы...
6613964                                 наколенники 19кс15кс1
7882901               дак20000 с наколенник кнее пад белый рс
9075348     г 12  перчатки боксерские боксинг гловес черны...
11129255                                     шлем защдетс рис
17684578                лпп604с лпп604 суппорт голеностопа пс
18775818    некж11б 0 защита шеи детская кидс некк протект...
21004466    17ики72 с шлем детский ики кидс хелмет зелёный рс
21386437                              карабин с фиксатором м6
25622100      4355мю нс  шингарты кют фингер миттс желтый рнс
41866156                       флаг из пэ с держателем  14кс2
Name: item_name, dtype: object

In [487]:
cat_subwords = dict()
for cat_id in np.sort(train.category_id.unique()):
    cat_subwords[cat_id] = []
    for length in range(3, 8):
        cat_subwords[cat_id].extend(list(dict(Counter([word[:length] for item in train.loc[train.category_id == cat_id, 'item_name'].tolist() 
                                                       for word in tokenizer.tokenize(item.lower()) if (len(word) > length)]
                                                     ).most_common(10)).keys()))
    cat_subwords[cat_id] = set(cat_subwords[cat_id])

In [488]:
sum([len(el) for el in cat_subwords.values()])

4791

In [405]:
Counter([word for item in train.loc[:, 'item_name'].tolist() 
              for word in tokenizer.tokenize(item.lower()) if len(word) == 2]).most_common(10)

[('шт', 1579),
 ('из', 1119),
 ('гр', 903),
 ('кг', 722),
 ('на', 559),
 ('мл', 551),
 ('1л', 539),
 ('11', 517),
 ('жб', 505),
 ('05', 457)]

In [406]:
list_of_substrings = []
for subwords in cat_subwords.values():
    list_of_substrings.extend(list(subwords))
len(list_of_substrings)

4791

In [416]:
len(set(list_of_substrings))

3389

In [423]:
list_of_substrings = [word for word, num in Counter(list_of_substrings).items() if num == 1]

In [424]:
import pickle
pickle.dump(list_of_substrings, open('list_of_substrings', 'wb'))

### Попробуем смешать нашу модель tfidf и bag-of-words с подстроковыми токенами

In [453]:
train = pd.read_parquet('data_fusion_train.parquet')
train = train.loc[train.category_id != -1, ].drop_duplicates(['item_name'])

symbols = ("abcdefghijklmnopqrstuvwyz",
           "абкдефгхижклмнопкрстюввиз")
tr = {ord(a): ord(b) for a, b in zip(*symbols)}

train['item_name_new'] = train.item_name.str.lower().str.translate(tr).str.replace('x', 'кс').str.replace('[^\w\s]','')
train.shape

(48225, 10)

In [454]:
list_of_substrings = pickle.load(open('list_of_substrings', 'rb'))

In [455]:
for substring in tqdm(list_of_substrings):
    train[substring] = (train.item_name_new.str.find(substring) != -1).astype(np.int8)

train.shape

100%|██████████| 2737/2737 [01:34<00:00, 29.03it/s]


(48225, 2747)

In [456]:
X_new = sparse.csr_matrix(train.iloc[:, -len(list_of_substrings):])

In [464]:
stop = stopwords.words('russian') + list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
union = FeatureUnion([("tdidf", TfidfVectorizer(stop_words=stop, min_df=5, ngram_range=(1, 1))),
                      ("count", CountVectorizer(stop_words=stop, min_df=3, ngram_range=(2, 5), binary=True))])

X = union.fit_transform(train.item_name)
y = train.category_id
print(X.shape, y.shape)

(48225, 16638) (48225,)


In [465]:
X = sparse.hstack([X, X_new])
X.shape

(48225, 19375)

In [466]:
# scores = cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=5, scoring='f1_weighted', n_jobs=-1)
# scores

In [473]:
%%time

clf = LogisticRegressionCV(Cs=[0.1, 1, 3], max_iter=500, cv=5, solver='sag', scoring='f1_weighted', n_jobs=-1)
clf.fit(X, y)

CPU times: user 16min 55s, sys: 1.22 s, total: 16min 57s
Wall time: 6min 17s


LogisticRegressionCV(Cs=[0.1, 1, 3], class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=500, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=None, refit=True, scoring='f1_weighted',
                     solver='sag', tol=0.0001, verbose=0)

In [474]:
clf.C_

array([3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
       3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])

In [475]:
import pickle
pickle.dump(union, open('tfidf', 'wb'))
pickle.dump(clf, open('clf_task1', 'wb'))

In [484]:
sub_number = 12

In [485]:
import zipfile 
compression = zipfile.ZIP_DEFLATED

submission_name = f'submission_{sub_number}.zip'
with zipfile.ZipFile(submission_name, 'w') as zipObj:
    for filename in [
        'clf_task1',
        'tfidf',
        'script.py',
        'list_of_substrings'
    ]:
        zipObj.write(
            f'{filename}', 
            arcname=filename, 
            compress_type=compression
        )
    print(zipObj.namelist())

['clf_task1', 'tfidf', 'script.py', 'list_of_substrings']


