In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import hstack

import re
import datetime
import multiprocessing
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import accuracy_score

import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Stop-word lists come from the NLTK corpus. Each list is also wrapped in a
# Counter so it can be used as a lookup table: indexing a Counter with a word
# that is not a stop-word yields 0 instead of raising KeyError.
russian_stopwords = stopwords.words("russian")
stopwords_rus_dict = Counter(russian_stopwords)

english_stopwords = stopwords.words("english")
stopwords_eng_dict = Counter(english_stopwords)

In [3]:
# Single random seed reused for row sampling, the train/test split and XGBoost.
SEED = 42

In [31]:
def RemainLettersNumsInLowerCase(data, columns_list):
    '''Lower-case text columns of a dataframe and strip punctuation, in place.

    Each value is lower-cased, then every run of characters that are not
    letters, digits or underscores is replaced by a single space. Note that
    underscores are therefore kept, and that a value ending in punctuation
    ends with a trailing space afterwards.

    Parameters:
        data - dataframe; it is modified in place
        columns_list - list of names of the string columns to clean

    Return:
        None (the given columns of `data` are overwritten)
    '''

    # Raw string: a plain '\W' literal is an invalid escape sequence and
    # triggers a warning on modern Python. Compiled once for all columns.
    non_word_run = re.compile(r'\W+')

    for column in columns_list:
        data[column] = data[column].apply(
            lambda text: non_word_run.sub(' ', text.lower())
        )

def ThrowStopWords(series_column, stopwords_dict):
    '''Strip stop-words from every string of a text column.

    Each string is split on whitespace, words with a truthy entry in
    `stopwords_dict` are discarded, and the remaining words are joined back
    with single spaces.

    Parameters:
        series_column - iterable of strings (e.g. a dataframe column)
        stopwords_dict - Counter of stop-words; it is indexed by word, so it
                         must return a falsy value for words to keep
                         (a Counter returns 0 for unknown words)

    Return:
        list of cleaned strings, in the same order as the input
    '''

    def keep_meaningful(text):
        # A word survives when its stop-word count is 0.
        kept = [token for token in text.split() if not stopwords_dict[token]]
        return ' '.join(kept)

    return [keep_meaningful(text) for text in series_column]

def TF_IDF(train, test, column, **params):
    '''Encode one text column of train and test with TF-IDF.

    A single TfidfVectorizer is fitted on the train column only and then
    applied to the test column, so both outputs share the same feature space.

    Parameters:
        train, test - 2 dataframes that both contain `column`
        column - name of the text column to encode
        **params - keyword arguments forwarded to TfidfVectorizer

    Return:
        tuple (train_encoded, test_encoded) as produced by the vectorizer
    '''

    tfidf = TfidfVectorizer(**params)

    # Fit on train first; test is only transformed with the fitted vectorizer.
    encoded_train = tfidf.fit_transform(train[column])
    return encoded_train, tfidf.transform(test[column])

def DivideOnFirstLevel(text):
    '''Map a hierarchical category name to the number of its first level.

    Parameters:
        text - category path of the form a|b|c, where a is the 1st level
               category, b the 2nd, etc.

    Return:
        Number (0-3) of the first-level category

    Raises:
        KeyError if the first level is not one of the four known names
    '''

    # Can be extended to the 2nd, 3rd, ... levels if needed.
    first_level_codes = {
        r'Бытовая электроника': 0,
        r'Для дома и дачи': 1,
        r'Личные вещи': 2,
        r'Хобби и отдых': 3,
    }

    # Only the part before the first '|' matters.
    top_level_name = re.split(r'[|]', text, maxsplit=1)[0]
    return first_level_codes[top_level_name]

In [5]:
# Load the training sample; `item_id` becomes the index rather than a column.
df = pd.read_csv('train.csv',index_col='item_id')
# Preview the first rows.
df.head()

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Картина,Гобелен. Размеры 139х84см.,1000.0,19
1,Стулья из прессованной кожи,Продам недорого 4 стула из светлой прессованно...,1250.0,22
2,Домашняя мини баня,"Мини баня МБ-1(мини сауна), предназначена для ...",13000.0,37
3,"Эксклюзивная коллекция книг ""Трансаэро"" + подарок","Продам эксклюзивную коллекцию книг, выпущенную...",4000.0,43
4,Ноутбук aser,Продаётся ноутбук ACER e5-511C2TA. Куплен в ко...,19000.0,1


In [6]:
# Category names are expected in the form 'a|b|c' (a = 1st level, b = 2nd, ...).
category = pd.read_csv('category.csv')
catlevel_1 = category['name'].apply(DivideOnFirstLevel)

# Lookup: row position in category.csv -> first-level class (0..3).
# It is built from the category file alone, so it covers every category even
# when some ids never occur in `df` (sizing it by the ids present in `df`
# would silently truncate the mapping).
# NOTE(review): this assumes category_id equals the row position in
# category.csv - confirm, or key on the file's id column if it has one.
dictlevel_1 = dict(enumerate(catlevel_1))

In [8]:
# Replace the fine-grained category_id with its first-level class.
# NOTE: the column is overwritten in place, so this cell must run exactly once
# per load of `df`; a second run would look up the already-mapped values.
df['category_id'] = df['category_id'].map(lambda category_id: dictlevel_1[category_id])

In [9]:
# Remove fully duplicated rows, then renumber the index from 0
# (the previous index is discarded), reporting the shape before and after.
print(f'Shape df before: {df.shape}')
df = df.drop_duplicates()
df = df.reset_index(drop=True)
print(f'Shape df after: {df.shape}')

Shape df before: (489517, 4)
Shape df after: (489200, 4)


In [11]:
# Title and description are merged into a single text field, separated by a
# space; in experiments this worked better than keeping them apart.
title_with_gap = df['title'] + ' '
df['fulldiscr'] = title_with_gap + df['description']

In [12]:
# Work on a reproducible random half of the rows to speed up training.
print(f'Shape df: {df.shape}')
df_frac = df.sample(frac=0.5, random_state=SEED)
print(f'Shape df_frac: {df_frac.shape}')

Shape df: (489200, 5)
Shape df_frac: (244600, 5)


In [13]:
# Hold out 20% of the sampled rows. The features are whatever remains after
# dropping the raw text columns and the target; the target is category_id.
feature_frame = df_frac.drop(columns=['title', 'description', 'category_id'])
target = df_frac['category_id']

train, test, y_train, y_test = train_test_split(
    feature_frame, target, test_size=0.2, random_state=SEED
)

In [14]:
# Step 1: lower-case the merged text of both splits and strip punctuation
# (done in place by the helper).
for frame in (train, test):
    RemainLettersNumsInLowerCase(frame, ['fulldiscr'])

# Step 2: drop Russian stop-words from the cleaned text of both splits.
for frame in (train, test):
    frame['fulldiscr'] = ThrowStopWords(frame['fulldiscr'], stopwords_rus_dict)

In [25]:
# Count the distinct whitespace-separated tokens in the cleaned training text.
# Tokens are added to a set row by row; concatenating every row into one huge
# string first gives the same count but copies the growing string repeatedly.
vocabulary = set()
for row in train['fulldiscr']:
    vocabulary.update(row.split())

cnt = len(vocabulary)
print(f'Count of got unique words {cnt}')

Count of got unique words 310639


In [33]:
# Two TF-IDF configurations:
#   params1 - only ignores terms found in fewer than 5 documents;
#   params2 - additionally ignores terms found in more than 99% of documents
#             and adds word bigrams.
params1 = dict(min_df=5)
params2 = dict(min_df=5, max_df=0.99, ngram_range=(1, 2))

# The features used below are built with params2.
train_descr_tfidf, test_descr_tfidf = TF_IDF(train, test, 'fulldiscr', **params2)

In [36]:
# Append the raw price as one extra column to the right of the sparse TF-IDF
# matrix. Double brackets select a one-column frame, so `.values` is (n, 1).
train_price_column = train[['price']].values
test_price_column = test[['price']].values

train_all = hstack([train_descr_tfidf, train_price_column])
test_all = hstack([test_descr_tfidf, test_price_column])

**First baseline attempt**

In [42]:
# Baseline XGBoost configuration: multi-class softmax, histogram-based trees
# grown leaf-wise (max_depth 0 with max_leaves 31), L1/L2 regularisation and
# row/column subsampling.
# NOTE(review): this run was labelled as using TF-IDF `params1`, while the
# feature cell in this notebook passes `params2`; confirm which features
# produced the logged scores.
params = {
    'objective': 'multi:softmax',
    'n_jobs': multiprocessing.cpu_count(),
    'num_class': len(y_train.unique()),
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    'max_depth': 0,
    'max_leaves': 31,
    'reg_alpha': 1.5,
    'reg_lambda': 2,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 1,
    'eval_metric': 'merror',
    'random_state': SEED,
}

start = datetime.datetime.now()

dtrain = xgb.DMatrix(scipy.sparse.csc_matrix(train_all), label=y_train)
dtest = xgb.DMatrix(scipy.sparse.csc_matrix(test_all), label=y_test)

# Early stopping watches the last entry ('test'). Because the same split is
# used for the final accuracy, that score is slightly optimistic.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
clf = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=25,
    verbose_eval=25,
)

print(f'Time taken:{datetime.datetime.now() - start}')

[0]	train-merror:0.28501	test-merror:0.28346
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 25 rounds.
[25]	train-merror:0.17444	test-merror:0.17584
[50]	train-merror:0.14408	test-merror:0.14708
[75]	train-merror:0.12335	test-merror:0.12704
[100]	train-merror:0.10922	test-merror:0.11415
[125]	train-merror:0.09902	test-merror:0.10429
[150]	train-merror:0.09166	test-merror:0.09792
[175]	train-merror:0.08533	test-merror:0.09366
[199]	train-merror:0.08041	test-merror:0.08958
Time taken:1:08:39.044626


In [43]:
# Accuracy of the first model on the training and held-out splits
# (this run was labelled as the TF-IDF params1 run).
clf_train_pred = clf.predict(dtrain)
clf_test_pred = clf.predict(dtest)

print(f'train accuracy: {accuracy_score(y_train, clf_train_pred)}')
print(f'test accuracy: {accuracy_score(y_test, clf_test_pred)}')

train accuracy: 0.9195932134096484
test accuracy: 0.9104251839738349


In [51]:
# Persist the first trained booster under the name 'xgb_1'.
clf.save_model('xgb_1')

**Minor modifications and a second attempt**

Here I use fewer trees to speed up training.

In [39]:
# Second attempt: same configuration as the baseline except for a slightly
# smaller column subsample (0.75) and half as many boosting rounds (100).
params = {
    'objective': 'multi:softmax',
    'n_jobs': multiprocessing.cpu_count(),
    'num_class': len(y_train.unique()),
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    'max_depth': 0,
    'max_leaves': 31,
    'reg_alpha': 1.5,
    'reg_lambda': 2,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.75,
    'gamma': 1,
    'eval_metric': 'merror',
    'random_state': SEED,
}

start = datetime.datetime.now()

dtrain = xgb.DMatrix(scipy.sparse.csc_matrix(train_all), label=y_train)
dtest = xgb.DMatrix(scipy.sparse.csc_matrix(test_all), label=y_test)

# Early stopping watches the last entry ('test'). Because the same split is
# used for the final accuracy, that score is slightly optimistic.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
clf2 = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=25,
    verbose_eval=25,
)

print(f'Time taken:{datetime.datetime.now() - start}')

[0]	train-merror:0.28418	test-merror:0.28269
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 25 rounds.
[25]	train-merror:0.17303	test-merror:0.17537
[50]	train-merror:0.14271	test-merror:0.14636
[75]	train-merror:0.12328	test-merror:0.12657
[99]	train-merror:0.10961	test-merror:0.11490
Time taken:1:04:04.572511


In [42]:
# Accuracy of the second model on the training and held-out splits
# (TF-IDF params2 features).
clf2_train_pred = clf2.predict(dtrain)
clf2_test_pred = clf2.predict(dtest)

print(f'train accuracy: {accuracy_score(y_train, clf2_train_pred)}')
print(f'test accuracy: {accuracy_score(y_test, clf2_test_pred)}')

train accuracy: 0.890392477514309
test accuracy: 0.8850981193785773


In [41]:
# Persist the second trained booster under the name 'xgb_2'.
clf2.save_model('xgb_2')

Accuracy is the required metric (not my choice), and the baseline to beat is 0.9 on the test set. The score could be improved considerably by tuning the hyperparameters with, for example, Optuna or Hyperopt, but that is not feasible on my laptop with this much data. The TF-IDF settings should also be tuned to get a better score.

Anyway, the first attempt (test accuracy ≈ 0.91) already beats the baseline with almost no overfitting on the test split created from the 'train' dataset, and I am confident that repeating the same steps on the full 'train' and 'test' CSV files will cause no problems and will not give a worse score.