In [1]:
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import scipy.stats as sps

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from src.utils.downcasting import downcast_dtypes

# Apply seaborn's theme with all font sizes scaled by 1.2.
sns.set(font_scale=1.2)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Seed handed to stochastic estimators (TruncatedSVD below) for reproducible runs.
random_state: int = 42

# Text features

As we found in `1.0-db-EDA.ipynb`, we can derive text features from item names. This is helpful because there are a lot of items and we want to treat similar items similarly.

## TF-IDF + TruncatedSVD

The first approach is to use TF-IDF followed by TruncatedSVD. We will use $100$ features, and in the future we will be able to select just a subset of them.

It will be difficult to find optimal hyperparameters, because fitting TF-IDF separately and then merging the result with the training data seems too complicated.

It may be better to find reasonable parameters by hand for some model (for example, a random forest) and then fix them.

Here we don't use any text transformation like lemmatization or stemming.

In [3]:
# Number of SVD components to keep; also the number of columns in the saved table.
res_dim: int = 100

In [4]:
# Read the processed item table; its `item_name` column is the text source
# for the features built below.
items = pd.read_csv(filepath_or_buffer='../data/processed/items.csv')
items.head(n=5)

Unnamed: 0,item_name,item_id,item_category_id,item_in_test
0,ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.),0,40,False
1,ABBYY FineReader 12 Professional Edition Full ...,1,76,False
2,В ЛУЧАХ СЛАВЫ (UNV),2,40,False
3,ГОЛУБАЯ ВОЛНА (Univ),3,40,False
4,КОРОБКА (СТЕКЛО),4,40,False


In [5]:
# Unigrams and bigrams; terms that occur in more than half of the item
# names are excluded from the vocabulary (max_df=0.5).
transformer = TfidfVectorizer(
    max_df=0.5,
    ngram_range=(1, 2),
)
tfidf_data = transformer.fit_transform(items['item_name'])

In [6]:
# Dimensions of the TF-IDF matrix: (number of item names, vocabulary size).
tfidf_data.shape

(22170, 64565)

In [7]:
# Reduce the TF-IDF matrix to `res_dim` dense components; the seed is
# fixed so that repeated runs give the same decomposition.
svd = TruncatedSVD(
    random_state=random_state,
    n_components=res_dim,
)
truncated_data = svd.fit_transform(tfidf_data)

In [8]:
# Dimensions after the SVD projection: (number of item names, res_dim).
truncated_data.shape

(22170, 100)

Save the result to disk.

In [9]:
# One column per SVD component, named by its component index.
columns = [f'item_name_tfidf_truncated-svd_{i}' for i in range(res_dim)]
df = pd.DataFrame(truncated_data, columns=columns)

# Project helper; presumably narrows numeric dtypes to reduce file size —
# see src.utils.downcasting for the exact casting rules.
df = downcast_dtypes(df)

# Rows keep the default positional index, i.e. the row order of `items`;
# no item_id column is stored.
# NOTE(review): consumers presumably rely on item_id == row position — confirm.
# `key` is passed by keyword: positional arguments after the path are
# deprecated since pandas 2.1 and become keyword-only in pandas 3.0.
df.to_hdf('../data/processed/text/tfidf_truncated-svd.h5', key='table')