In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion

import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords

### Подбираем модель

In [241]:
# Load the training receipts, keep only rows whose category_id is not -1
# (presumably the "no label" marker -- TODO confirm) and collapse repeated
# (item_name, category_id) pairs so each labelled name appears once.
receipts = pd.read_parquet('data_fusion_train.parquet')
labelled = receipts[receipts['category_id'] != -1]
train = labelled.drop_duplicates(subset=['item_name', 'category_id'])
train.shape

(48267, 9)

In [242]:
# Peek at the first rows (head() shows five by default) to check the columns.
train.head()

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
1,11,6,20:34,"Молоко 3,2%,шт",2.0,8,2,78,
3,39,4,11:28,"Компот из изюма, 114 ккал",1.0,4,1,71,
4,39,4,11:28,"Макаронные изделия отварные (масло сливочное),...",1.0,4,1,71,
17,56,5,11:42,Кофе Капучино Большой Эден 18,1.0,12,1,70,
40,105,3,01:53,Хлеб на СЫВОРОТКЕ 350г,1.0,7,-1,84,


In [244]:
# Model input: the item-name strings.
X = train['item_name']
X.shape

(48267,)

In [245]:
# Target: the category id attached to each item name.
y = train['category_id']
y.shape

(48267,)

In [246]:
stop = stopwords.words('russian')

# NOTE: despite its name, `tfidf` is a binary bag-of-n-grams CountVectorizer
# (1- to 5-grams that occur in at least 5 item names), not a TF-IDF model.
tfidf = CountVectorizer(
    ngram_range=(1, 5),
    min_df=5,
    max_df=1.0,
    stop_words=stop,
    binary=True,
    dtype=np.int8,
)
X_train = tfidf.fit_transform(X)

In [247]:
# Number of n-gram features that survived the vectorizer's min_df pruning.
len(tfidf.vocabulary_)

9991

In [248]:
# L1-penalised logistic regression, scored by 10-fold CV on weighted F1.
# NOTE(review): random_state is not set; the liblinear solver shuffles the
# data, so fold scores may differ slightly between runs.
clf = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    max_iter=500,
)
cross_val_score(clf, X_train, y, scoring='f1_weighted', cv=10, n_jobs=-1)



array([0.81923434, 0.8110812 , 0.80437166, 0.79649013, 0.79056385,
       0.77710862, 0.76694897, 0.76726681, 0.74243161, 0.73571583])

In [249]:
# Fit the classifier on the whole training matrix so its coefficients can be
# inspected below (cross_val_score works on clones of the estimator).
clf.fit(X_train, y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [250]:
# Coefficient matrix of the fitted model as a frame (columns are feature indices).
coefs_data = pd.DataFrame(clf.coef_)

# Invert the vectorizer vocabulary (token -> column index) into
# column index -> token, walking the entries in column order.
index_sorted_vocab = sorted(tfidf.vocabulary_.items(), key=lambda pair: pair[1])
vocab_dict = {column: token for token, column in index_sorted_vocab}

In [251]:
# Label every coefficient column with the token it belongs to.
coefs_data = coefs_data.rename(columns=vocab_dict)
coefs_data.head(3)

Unnamed: 0,00,00 000,00 арт,000,000 порц,0001,0001 услуги,0003,0004,001,...,ясхим,яч,ячеек,ячменное,ячневая,яш,яш порц,яшкино,яшкино 200г,ящик
0,-2.537328,0.0,0.0,-0.657835,0.0,-1.145706,0.0,0.0,0.0,0.0,...,-1.268129,0.0,0.0,1.383563,0.0,0.0,0.0,0.0,0.0,-0.735822
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.704557,0.0,0.0,-2.059612,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [252]:
# (tokens whose weight is zero in every row, tokens with a non-zero weight in some row)
nonzero_somewhere = (coefs_data != 0).any(axis=0)
(~nonzero_somewhere).sum(), nonzero_somewhere.sum()

(2527, 7464)

In [253]:
# Boolean mask over tokens: True where the L1 model kept a non-zero weight somewhere.
importance_bool = coefs_data.ne(0).any(axis=0).to_numpy()

In [254]:
import pickle

# Persist the tokens that kept a non-zero weight so that a later run can
# reuse them as a fixed vocabulary.  The context manager guarantees the file
# is flushed and closed (the bare open() used before leaked the handle).
important_tokens = coefs_data.columns.to_numpy()[importance_bool]
with open('important_tokens', 'wb') as tokens_file:
    pickle.dump(important_tokens, tokens_file)

### Формируем посылку

In [277]:
# Reload the data for the submission model; duplicates are dropped on
# item_name alone this time.
submission_rows = pd.read_parquet('data_fusion_train.parquet')
train = submission_rows[submission_rows['category_id'] != -1].drop_duplicates(subset=['item_name'])
# tokens = pickle.load(open('important_tokens', 'rb'))  # submissions showed the score was consistently worse
train.shape

((48225, 9), (7464,))

In [303]:
stop = stopwords.words('russian')

# Two views of the item name, concatenated column-wise:
#   * TF-IDF weights of single words (at most 12500 features)
#   * binary presence flags of 2- to 4-grams (at most 7500 features)
word_tfidf = TfidfVectorizer(stop_words=stop, max_features=12500, ngram_range=(1, 1))
ngram_flags = CountVectorizer(stop_words=stop, max_features=7500, ngram_range=(2, 4), binary=True)
union = FeatureUnion([("tdidf", word_tfidf), ("count", ngram_flags)])

# Alternative that restricts both vectorizers to the tokens selected earlier:
# union = FeatureUnion([("tdidf", TfidfVectorizer(vocabulary=tokens, max_features=7500)),
#                       ("count", CountVectorizer(vocabulary=tokens, max_features=2500, binary=True))])

X_train = union.fit_transform(train.item_name)
y = train.category_id

In [304]:
%%time

clf = LogisticRegressionCV(Cs=[8, 10, 12], max_iter=500, cv=5, solver='sag', scoring='f1_weighted', n_jobs=-1)
clf.fit(X_train, y)

CPU times: user 15min 49s, sys: 1.49 s, total: 15min 51s
Wall time: 6min 1s


LogisticRegressionCV(Cs=[8, 10, 12], class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=500, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=None, refit=True, scoring='f1_weighted',
                     solver='sag', tol=0.0001, verbose=0)

In [305]:
# Regularisation strength picked by cross-validation (the array has one entry per class).
clf.C_

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

In [306]:
import pickle

# Persist the fitted feature extractor and classifier for the submission.
# Context managers guarantee both files are flushed and closed before they
# are packed into the zip archive (the bare open() calls leaked the handles).
with open('tfidf', 'wb') as features_file:
    pickle.dump(union, features_file)
with open('clf_task1', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [307]:
# Sequence number of this submission; it becomes part of the archive file name.
sub_number = 10

In [308]:
import zipfile

compression = zipfile.ZIP_DEFLATED
submission_name = f'submission_{sub_number}.zip'

# Files that make up a submission: the pickled classifier, the pickled
# feature extractor and the inference script.
members = ('clf_task1', 'tfidf', 'script.py')

with zipfile.ZipFile(submission_name, 'w') as archive:
    for member in members:
        archive.write(member, arcname=member, compress_type=compression)
    print(archive.namelist())

['clf_task1', 'tfidf', 'script.py']




## Новый блок (с предобработкой текста)

In [2]:
train = pd.read_parquet('data_fusion_train.parquet')
train = train.loc[train.category_id != -1, ].drop_duplicates(['item_name'])

######### equivalent code is added to script (for the test data) #########
# Lower-case Latin letters (all except "x") and the Cyrillic letters they are
# replaced with, position by position.
symbols = ("abcdefghijklmnopqrstuvwyz",
           "абкдефгхижклмнопкрстюввиз")
tr = str.maketrans(*symbols)


def normalize_item_names(names):
    """Normalise a Series of item names before vectorisation.

    Steps: lower-case, map Latin letters to Cyrillic via ``tr``, expand the
    remaining Latin "x" to "кс", then strip everything that is neither a word
    character nor whitespace.

    ``regex`` is passed explicitly: pandas >= 2.0 defaults to ``regex=False``,
    which would turn the punctuation-stripping step into a silent no-op, and
    the pattern is a raw string so ``\w`` / ``\s`` are not invalid escapes.
    """
    return (
        names.str.lower()
        .str.translate(tr)
        .str.replace('x', 'кс', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
    )


train = train.assign(item_name=normalize_item_names(train.item_name))
####################################################################

train.shape

(48225, 9)

In [3]:
stop = stopwords.words('russian')

# Same feature layout as before: TF-IDF over single words plus binary
# presence flags for 2- to 4-grams, concatenated column-wise.
word_tfidf = TfidfVectorizer(stop_words=stop, max_features=12500, ngram_range=(1, 1))
ngram_flags = CountVectorizer(stop_words=stop, max_features=7500, ngram_range=(2, 4), binary=True)
union = FeatureUnion([("tdidf", word_tfidf), ("count", ngram_flags)])

X = union.fit_transform(train.item_name)
y = train.category_id
print(X.shape, y.shape)

(48225, 20000) (48225,)


In [4]:
%%time

clf = LogisticRegressionCV(Cs=[9, 10, 11], max_iter=500, cv=5, solver='sag', scoring='f1_weighted', n_jobs=-1)
clf.fit(X, y)

CPU times: user 11min 57s, sys: 2.04 s, total: 11min 59s
Wall time: 4min 29s


LogisticRegressionCV(Cs=[9, 10, 11], class_weight=None, cv=5, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=500, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=None, refit=True, scoring='f1_weighted',
                     solver='sag', tol=0.0001, verbose=0)

In [5]:
import pickle

# Persist the fitted feature extractor and classifier for the submission.
# Context managers guarantee both files are flushed and closed before they
# are packed into the zip archive (the bare open() calls leaked the handles).
with open('tfidf', 'wb') as features_file:
    pickle.dump(union, features_file)
with open('clf_task1', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [6]:
# Sequence number of this submission; it becomes part of the archive file name.
sub_number = 11

In [7]:
import zipfile

compression = zipfile.ZIP_DEFLATED
submission_name = f'submission_{sub_number}.zip'

# Files that make up a submission: the pickled classifier, the pickled
# feature extractor and the inference script.
members = ('clf_task1', 'tfidf', 'script.py')

with zipfile.ZipFile(submission_name, 'w') as archive:
    for member in members:
        archive.write(member, arcname=member, compress_type=compression)
    print(archive.namelist())

['clf_task1', 'tfidf', 'script.py']


