In [1]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# import lightgbm as lgb

In [3]:
NUM_BRANDS = 4004
NUM_CATEGORIES = 1001
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [4]:
# データタイプを指定
types_dict_train = {'train_id':'int64', 'item_condition_id':'int8', 'price':'float64', 'shipping':'int8'}
types_dict_test = {'test_id':'int64', 'item_condition_id':'int8', 'shipping':'int8'}
# tsvファイルからPandas DataFrameへ読み込み
train = pd.read_csv('./input/train.tsv', delimiter='\t', low_memory=True, dtype=types_dict_train)
test = pd.read_csv('./input/test.tsv', delimiter='\t', low_memory=True, dtype=types_dict_test)


In [5]:
# handle_missing
for df in train, test:
    df['category_name'].fillna(value='missing', inplace=True)
    df['brand_name'].fillna(value='missing', inplace=True)
    df['item_description'].fillna(value='missing', inplace=True)

In [7]:
NUM_BRANDS = 4004
NUM_CATEGORIES = 1001
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [8]:
train['brand_name'].value_counts()

missing                632682
PINK                    54088
Nike                    54043
Victoria's Secret       48036
LuLaRoe                 31024
Apple                   17322
FOREVER 21              15186
Nintendo                15007
Lululemon               14558
Michael Kors            13928
American Eagle          13254
Rae Dunn                12305
Sephora                 12172
Coach                   10463
Disney                  10360
Bath & Body Works       10354
Adidas                  10202
Funko                    9237
Under Armour             8461
Sony                     7994
Old Navy                 7567
Hollister                6948
Carter's                 6385
Urban Decay              6210
The North Face           6172
Independent              5902
Too Faced                5794
Xbox                     5709
Brandy Melville          5680
Kate Spade               5411
                        ...  
Brandon Thomas              1
Rugged Ridge                1
Focus     

In [9]:
pop_brand = train['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]

In [11]:
len(pop_brand)

4004

In [12]:
train.loc[~train['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'

In [13]:
train['brand_name'].value_counts()

missing                           633487
PINK                               54088
Nike                               54043
Victoria's Secret                  48036
LuLaRoe                            31024
Apple                              17322
FOREVER 21                         15186
Nintendo                           15007
Lululemon                          14558
Michael Kors                       13928
American Eagle                     13254
Rae Dunn                           12305
Sephora                            12172
Coach                              10463
Disney                             10360
Bath & Body Works                  10354
Adidas                             10202
Funko                               9237
Under Armour                        8461
Sony                                7994
Old Navy                            7567
Hollister                           6948
Carter's                            6385
Urban Decay                         6210
The North Face  

In [15]:
pop_category = train['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]

In [16]:
pop_category

Index(['Women/Athletic Apparel/Pants, Tights, Leggings',
       'Women/Tops & Blouses/T-Shirts', 'Beauty/Makeup/Face',
       'Beauty/Makeup/Lips', 'Electronics/Video Games & Consoles/Games',
       'Beauty/Makeup/Eyes',
       'Electronics/Cell Phones & Accessories/Cases, Covers & Skins',
       'Women/Underwear/Bras', 'Women/Tops & Blouses/Tank, Cami',
       'Women/Tops & Blouses/Blouse',
       ...
       'Home/Bath/Bathroom Shelves', 'Men/Suits/One Button',
       'Handmade/Toys/Doll Clothes', 'Handmade/Quilts/Trim',
       'Handmade/Geekery/Housewares', 'Vintage & Collectibles/Supplies/Zipper',
       'Kids/Diapering/Washcloths & Towels', 'Handmade/Furniture/Other',
       'Handmade/Knitting/Doll', 'Handmade/Knitting/Women'],
      dtype='object', length=1287)

In [18]:
train.loc[~train['category_name'].isin(pop_category), 'category_name'] = 'missing'

In [19]:
train['category_name'].value_counts()

Women/Athletic Apparel/Pants, Tights, Leggings                 60177
Women/Tops & Blouses/T-Shirts                                  46380
Beauty/Makeup/Face                                             34335
Beauty/Makeup/Lips                                             29910
Electronics/Video Games & Consoles/Games                       26557
Beauty/Makeup/Eyes                                             25215
Electronics/Cell Phones & Accessories/Cases, Covers & Skins    24676
Women/Underwear/Bras                                           21274
Women/Tops & Blouses/Tank, Cami                                20284
Women/Tops & Blouses/Blouse                                    20284
Women/Dresses/Above Knee, Mini                                 20082
Women/Jewelry/Necklaces                                        19758
Women/Athletic Apparel/Shorts                                  19528
Beauty/Makeup/Makeup Palettes                                  19103
Women/Shoes/Boots                 

In [20]:
# to_categorical
for df in train, test:
    df['category_name'] = df['category_name'].astype('category')
    df['brand_name'] = df['brand_name'].astype('category')
    df['item_condition_id'] = df['item_condition_id'].astype('category')

In [21]:
nrow_train = train.shape[0]

In [22]:
nrow_train

1482535

In [23]:
y = np.log1p(train["price"])

In [24]:
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(train['name'])

In [28]:
X_name.shape

(1482535, 17740)

In [29]:
cv = CountVectorizer()
X_category = cv.fit_transform(train['category_name'])

In [30]:
X_category.shape

(1482535, 1012)

In [31]:
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')

In [32]:
X_description = tv.fit_transform(train['item_description'])

In [33]:
X_description.shape

(1482535, 3)

In [34]:
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(train['brand_name'])

In [35]:
X_brand.shape

(1482535, 4005)

In [37]:
X_dummies = csr_matrix(pd.get_dummies(train[['item_condition_id', 'shipping']],
                                          sparse=True).values)

In [38]:
X_dummies.shape

(1482535, 6)

In [39]:
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()

In [40]:
sparse_merge.shape

(1482535, 22766)

In [42]:
X = sparse_merge[:nrow_train]

In [43]:
X.shape

(1482535, 22766)

In [45]:
X_test = sparse_merge[nrow_train:]    

In [46]:
X_test.shape

(0, 22766)

In [49]:
type(X_test)

scipy.sparse.csr.csr_matrix