In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import OneHotEncoder

import string
import nltk
from nltk import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
# Load the training receipts, discard rows labelled -1 or 121, keep one row per
# distinct item name, and drop the receipt-level columns that are not used below.
# (Original note: class 121 is the odd "пакет" (bag/package) entry, removed up front
# together with the duplicates. -1 presumably marks unlabelled rows -- TODO confirm.)
train = (
    pd.read_parquet('data_fusion_train.parquet')
    .loc[lambda frame: ~frame.category_id.isin({-1, 121})]
    .drop_duplicates('item_name')
    .drop(columns=['receipt_id', 'brands', 'receipt_dayofweek', 'receipt_time',
                   'item_quantity', 'item_price', 'item_nds_rate'])
)
print(train.shape)

(48225, 2)


In [3]:
# train.rename(columns={'receipt_id': 'id'}).sample(10).to_parquet('task1_test_for_user.parquet')

In [4]:
def preprocess(text):
    """Lower-case ``text``, surround every ASCII punctuation mark with spaces,
    and collapse all whitespace runs into single spaces.

    Returns the normalised string (no leading or trailing whitespace).
    """
    pieces = []
    for ch in text.lower():
        # Padding punctuation turns each mark into its own whitespace-separated token.
        pieces.append(f" {ch} " if ch in string.punctuation else ch)
    return " ".join("".join(pieces).split())

# Latin -> Cyrillic letter table, paired position by position.
# The Latin row has no 'x', so this table leaves 'x' unchanged.
symbols = ("abcdefghijklmnopqrstuvwyz",
           "абкдефгхижклмнопкрстюввиз")
latin_letters, cyrillic_letters = symbols
# Code-point -> code-point mapping in the form expected by str.translate.
tr = str.maketrans(latin_letters, cyrillic_letters)

tokenizer = nltk.WhitespaceTokenizer()
# Earlier alternative, kept for reference: trim the last two characters of words longer than five.
# limit_word_cut = lambda sentence: ' '.join([word[:-2] if len(word) > 5 else word for word in tokenizer.tokenize(sentence)])

stemmer = SnowballStemmer("russian")


def limit_word_stem(sentence):
    """Stem every whitespace-separated token longer than two characters.

    Shorter tokens are dropped; the stems are re-joined with single spaces.
    """
    long_tokens = (token for token in tokenizer.tokenize(sentence) if len(token) > 2)
    return ' '.join(stemmer.stem(token) for token in long_tokens)

# train['receipt_time'] = train.receipt_time.apply(lambda sentence: sentence[:2]).astype(int)

# item_name_1: output of preprocess() with Latin letters mapped through `tr`;
# the remaining Latin 'x' is expanded to 'кс' as a plain (non-regex) replacement.
train['item_name_1'] = (
    train.item_name.apply(preprocess)
    .str.translate(tr)
    .str.replace('x', 'кс', regex=False)
)

# item_name_2: runs of non-word/non-space characters and runs of digits become a space.
# regex=True is spelled out because pandas 2.0 switched the default of
# Series.str.replace to literal matching, which would silently leave these
# patterns unmatched; raw strings avoid the invalid '\w' escape warning.
train['item_name_2'] = (
    train.item_name_1
    .str.replace(r'[^\w\s]+', ' ', regex=True)
    .str.replace(r'[0-9]+', ' ', regex=True)
)

# train['item_name_3'] = train.item_name_2.apply(limit_word_cut)
# item_name_3: stemmed tokens of item_name_2 (see limit_word_stem).
train['item_name_3'] = train.item_name_2.apply(limit_word_stem)

In [5]:
# Spot-check ten random rows to compare the raw item name with each derived text column.
train.sample(10)

Unnamed: 0,item_name,category_id,item_name_1,item_name_2,item_name_3
218893,Чай 400 мл,70,чай 400 мл,чай мл,ча
24018183,Перфект 85гр,145,перфект 85гр,перфект гр,перфект
460941,Маленькое чудо сливочное SRP (вес),84,маленькое чудо сливочное срп ( вес ),маленькое чудо сливочное срп вес,маленьк чуд сливочн срп вес
12893,СЕЛЬДЬ ПОД ШУБОЙ,71,сельдь под шубой,сельдь под шубой,сельд под шуб
1543589,BЫПEЧKA,84,быпечка,быпечка,быпечк
37213,Тетрадь 48л А5ф клетка на скобе,30,тетрадь 48л а5ф клетка на скобе,тетрадь л а ф клетка на скобе,тетрад клетк скоб
28823474,Петля ПН5- 40 Б/П,106,петля пн5 - 40 б / п,петля пн б п,петл
27601,Помидор мини красный фасованный КНР,80,помидор мини красный фасованный кнр,помидор мини красный фасованный кнр,помидор мин красн фасова кнр
1918305,"Пиво Калнапилис Пилснер светл фильтр 0,568 л. ...",0,"пиво калнапилис пилснер светл фильтр 0 , 568 л...",пиво калнапилис пилснер светл фильтр л ...,пив калнапилис пилснер светл фильтр
3036731,Кукурузная крупа 700г Агромастер,76,кукурузная крупа 700г агромастер,кукурузная крупа г агромастер,кукурузн круп агромастер


In [6]:
# stop_words = stopwords.words('russian') + list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя') + ['из', 'на']

# count1 = CountVectorizer(ngram_range=(3, 5), min_df=2, analyzer="char_wb", lowercase=False)
# count2 = CountVectorizer(ngram_range=(2, 7), min_df=2, analyzer="word", lowercase=False, binary=True, stop_words=stop_words)
# # ohe = OneHotEncoder(categories='auto', drop='first')

# X_1 = count1.fit_transform(train.item_name_1)
# X_2 = count2.fit_transform(train.item_name_2)
# # X_3 = ohe.fit_transform(train.loc[:, ['receipt_dayofweek', 'receipt_time', 'item_nds_rate']])

# X_1.shape, X_2.shape #, X_3.shape

In [18]:
# Character n-gram vocabularies: a_0 over the raw item names (lower-cased by the
# vectorizer, min_df=3), a_1 over the normalised item_name_1 column (min_df=2).
char_ngram_opts = dict(ngram_range=(3, 5), analyzer="char_wb")
a_0 = CountVectorizer(min_df=3, lowercase=True, **char_ngram_opts).fit(train.item_name)
a_1 = CountVectorizer(min_df=2, lowercase=False, **char_ngram_opts).fit(train.item_name_1)

In [19]:
# N-grams that occur in the raw-name vocabulary but not in the normalised-name one.
unseen_set = a_0.vocabulary_.keys() - a_1.vocabulary_.keys()
len(unseen_set)

40126

In [20]:
# NLTK Russian stop words, every single Cyrillic letter, and a few short prepositions.
stop_words = stopwords.words('russian') + list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя') + ['из', 'на', 'для']


def _count_tfidf_union(count_name, tfidf_name, **shared):
    """Pair a binary CountVectorizer with a TfidfVectorizer built from the same settings."""
    return FeatureUnion([
        (count_name, CountVectorizer(binary=True, **shared)),
        (tfidf_name, TfidfVectorizer(**shared)),
    ])


# count0: char n-grams restricted to the fixed `unseen_set` vocabulary.
count0 = _count_tfidf_union("countchar", "tfidfchar",
                            ngram_range=(3, 5), analyzer="char_wb",
                            lowercase=True, vocabulary=unseen_set)

# count1: char n-grams with a vocabulary learned at fit time (min_df=2).
count1 = _count_tfidf_union("countchar", "tfidfchar",
                            ngram_range=(3, 5), min_df=2, analyzer="char_wb",
                            lowercase=False)

# count2: word 2- to 5-grams with stop words removed (min_df=2).
count2 = _count_tfidf_union("count", "tfidf",
                            ngram_range=(2, 5), min_df=2, analyzer="word",
                            lowercase=False, stop_words=stop_words)
# ohe = OneHotEncoder(categories='auto', drop='first')

# Fit each union on its own text column and keep the resulting sparse matrices.
X_0, X_1, X_2 = (
    union.fit_transform(train[column])
    for union, column in (
        (count0, 'item_name'),
        (count1, 'item_name_1'),
        (count2, 'item_name_3'),
    )
)
# X_3 = ohe.fit_transform(train.loc[:, ['receipt_dayofweek', 'receipt_time', 'item_nds_rate']])

# Shapes of the three feature blocks.
tuple(block.shape for block in (X_0, X_1, X_2))

((48225, 80252), (48225, 170198), (48225, 41530))

In [12]:
# ((48225, 85099), (48225, 25014), (48225, 35)) -- benchmark
# ((48225, 85099), (48225, 25967)) -- limit_word_cut without constraint
# limit_word_cut with constraint: smaller than the benchmark by almost 5000
# ((48225, 170198), (48225, 51934)) -- limit_word_cut without constraint (big)
# ((48225, 134406), (48225, 170198), (48225, 54160))

# ((48225, 44722), (48225, 296778), (48225, 54160))
# ((48225, 44722), (48225, 296778), (48225, 53004))

# ((48225, 44722), (48225, 170198), (48225, 41530))
# ((48225, 80252), (48225, 170198), (48225, 41530))

In [21]:
# X = sparse.hstack([X_1, X_2, X_3])
# Design matrix: the three feature blocks placed side by side. Target: the category id.
feature_blocks = [X_0, X_1, X_2]
X = sparse.hstack(feature_blocks)
y = train['category_id']

In [None]:
#!M
%%time

# 5-fold cross-validation of a LogisticRegression (all defaults except n_jobs=-1)
# on the stacked features, scored with weighted F1; prints one score per fold.
# NOTE(review): X has ~292k columns and the estimator keeps its default iteration
# limit -- check the run for convergence warnings.
clf = LogisticRegression(n_jobs=-1)
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='f1_weighted')
print(cv_scores)
# Fold scores noted by the author (presumably from an earlier run of this cell):
# [0.84517349 0.84442787 0.82938652 0.81067868 0.79142013]

In [None]:
# [0.84844599 0.84399864 0.82757774 0.81416014 0.78319531] -- benchmark
# [0.846999   0.84056063 0.82701335 0.8122746  0.78744522] -- limit_word_cut with constraint    -- 0.8306330903492433 Public Score
# [0.84745532 0.84031099 0.82679243 0.81283318 0.78761241] -- limit_word_cut without constraint -- 0.8306767182016644 Public Score
# [0.84854438 0.84159432 0.8279734  0.8145432  0.78859838] -- limit_word_cut without constraint (big) -- 0.8318707659176601

In [None]:
#!M
%%time

# Collect 5-fold cross-validated class predictions for every training row and
# report a single weighted F1 over them. `y_pred` is reused by the
# classification-report and confusion-matrix cells further down.
clf = LogisticRegression(n_jobs=-1)
y_pred = cross_val_predict(clf, X, y, cv=5, method='predict')

score = f1_score(y, y_pred, average='weighted')
print(f'Score: {score:.3f}')
# Score noted by the author (presumably from an earlier run of this cell):
# 0.825

In [22]:
#!M
%%time

# Final model: a fresh LogisticRegression fitted on the full feature matrix.
# This `clf` instance is the one pickled as 'clf_task2' below.
clf = LogisticRegression(n_jobs=-1)
clf.fit(X, y)

CPU times: user 1.8 s, sys: 2.27 s, total: 4.06 s
Wall time: 14min 3s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
import pickle

# pickle.dump(ohe, open('ohe', 'wb'))
# Persist the fitted vectorizers and the classifier under the file names that the
# submission archive expects. Each file is written inside a `with` block so the
# handle is flushed and closed deterministically before the archive is built,
# instead of relying on garbage collection to close it.
for artifact_path, artifact in (
    ('count0', count0),
    ('count1', count1),
    ('count2', count2),
    ('clf_task2', clf),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [24]:
# Submission counter; embedded in the archive file name (submission_<n>.zip).
sub_number = 28

In [25]:
import zipfile

compression = zipfile.ZIP_DEFLATED
submission_name = f'submission_{sub_number}.zip'

# Files bundled into the submission: pickled vectorizers, pickled classifier,
# and the inference script. ('ohe' is currently not shipped.)
submission_files = ('count0', 'count1', 'count2', 'clf_task2', 'script.py')

with zipfile.ZipFile(submission_name, 'w') as archive:
    for member in submission_files:
        # Store each file at the archive root under its own name, deflate-compressed.
        archive.write(member, arcname=member, compress_type=compression)
    print(archive.namelist())

['count0', 'count1', 'count2', 'clf_task2', 'script.py']




### Classification Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-class precision / recall / F1 for the cross-validated predictions.
# Requires `y_pred` from the cross_val_predict cell above.
print(classification_report(y, y_pred))

In [None]:
# Sorted distinct category ids; used as both the row and column labels of the confusion matrix.
distinct_categories = y.unique()
indexes_and_cols = np.sort(distinct_categories)

In [None]:
# Confusion matrix normalised over the predicted-label axis (columns), rounded to
# three decimals and wrapped in a DataFrame labelled by category id.
# `labels` is passed explicitly so the matrix rows/columns are guaranteed to be in
# the same order as the `indexes_and_cols` axis labels.
conf = pd.DataFrame(
    data=np.round(
        confusion_matrix(y, y_pred, labels=indexes_and_cols, normalize='pred'),
        3,
    ),
    index=indexes_and_cols,
    columns=indexes_and_cols,
)

In [None]:
# Raise the display limits so the full confusion matrix can be shown without truncation.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

In [None]:
# Display the labelled confusion matrix built above.
conf