In [1]:
import sys
import re

sys.path.append('..')

import numpy as np
import pandas as pd
import scipy.stats as sps

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from nltk.stem.snowball import SnowballStemmer

from src.utils.downcasting import downcast_dtypes

sns.set(font_scale=1.2)
%matplotlib inline

In [2]:
# I have found, that this setting is the best
DO_CLEANING = True
DO_STEMMING = True
RES_DIM = 20

random_state = 42

In [3]:
def correct_name(x):
    """Corrects string.
    
    :param x: string
    
    :returns: corrected string
    """
    x = x.lower() # all letters lower case
    x = x.partition('[')[0] # partition by square brackets
    x = x.partition('(')[0] # partition by curly brackets
    x = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', x) # remove special characters
    x = x.replace('  ', ' ') # replace double spaces with single spaces
    x = x.strip() # remove leading and trailing white space
    return x

# Text features

As we found in 1.0-db-EDA.ipynb we can produce text features for items. It will be helpful, because there are a lot of items and we want to treat similar similarly.

## TFiDF + TruncatedSVD

First approach is to use TFiDF + TruncatedSVD. We will use $50$ features and in the future we will be able to select just part of them.

It will be difficult to find optimum hyperparameters because operation of learning TFiDF separately and merging it with train seems too complicated.

May be will be better to find reasonable parameters for some model by hand (for example for random forest) and fix them.

In [4]:
items = pd.read_csv('../data/processed/items.csv')
items.head()

Unnamed: 0,item_name,item_id,item_category_id,item_in_test
0,ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.),0,40,False
1,ABBYY FineReader 12 Professional Edition Full ...,1,76,False
2,В ЛУЧАХ СЛАВЫ (UNV),2,40,False
3,ГОЛУБАЯ ВОЛНА (Univ),3,40,False
4,КОРОБКА (СТЕКЛО),4,40,False


In [5]:
names = items.item_name

Let's clean item names.

In [6]:
if DO_CLEANING:
    names = names.apply(correct_name)

Let's stem item names.

In [7]:
if DO_STEMMING:
    stemmer = SnowballStemmer('russian')
    names = names.apply(lambda x: stemmer.stem(x))

Make TFiDF transformation.

In [8]:
transformer = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0)
tfidf_data = transformer.fit_transform(names)

In [9]:
tfidf_data.shape

(22170, 18601)

In [10]:
svd = TruncatedSVD(n_components=RES_DIM, random_state=random_state)
truncated_data = svd.fit_transform(tfidf_data)

In [11]:
truncated_data.shape

(22170, 20)

Save result on disc.

In [12]:
columns = [f'item_name_tfidf_truncated-svd_{i+1}' for i in range(RES_DIM)]
df = pd.DataFrame(truncated_data, columns=columns)

df = downcast_dtypes(df, to_bytes=16)
df.to_hdf('../data/processed/text/tfidf_truncated-svd.h5', 'table')