#### データの全体像
train：1,482,535個のユーザーが投稿した商品

test：693,359行で「価格（Price）」の項目がテストデータは含まれていないため、列数は「7」となっている 

* train_id / test _id – ユーザー投稿のID
* name – 投稿のタイトル。タイトルに価格に関する情報がある場合（例：$20）はメルカリが事前に削除をして[rm]と置き換えている。
* item_condition_id – ユーザーが指定した商品の状態(1~5,大きい方が状態が良い)
* category_name – 投稿カテゴリー
* brand_name – ブランドの名前
* price – 訓練データのみ。実際に売られた価格。米ドル表示。今回のチャレンジの予測ターゲットとなる。
* shipping – 送料のフラグ。「1」は販売者負担。「0」は購入者負担。
* item_description – ユーザーが投稿した商品説明の全文。タイトルと同様に価格情報がある場合は[rm]と置き換えられている。

In [1]:
from datetime import datetime
start_real = datetime.now() # 全体の処理時間の計測を開始する

#### ライブラリのインポート

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
# from scipy.sparse import save_npz, load_npz

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from joblib import dump

import gc
import warnings
warnings.simplefilter("ignore")

#### データの読み込み

In [3]:
%%time
# Kaggle Notebook
train = pd.read_csv("/kaggle/input/mercari-dataset/train.tsv", delimiter='\t')
test = pd.read_csv("/kaggle/input/mercari-dataset/test.tsv", delimiter='\t')

# Local
# train = pd.read_csv("train.tsv", delimiter='\t')
# test = pd.read_csv("test.tsv", delimiter='\t')

CPU times: user 10.4 s, sys: 1.42 s, total: 11.9 s
Wall time: 17.7 s


In [4]:
print(train.shape)
print(train.info())
display(train.head())

(1482535, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482529 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB
None


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
print(test.shape)
print(test.info())
display(test.head())

(693359, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   test_id            693359 non-null  int64 
 1   name               693359 non-null  object
 2   item_condition_id  693359 non-null  int64 
 3   category_name      690301 non-null  object
 4   brand_name         397834 non-null  object
 5   shipping           693359 non-null  int64 
 6   item_description   693359 non-null  object
dtypes: int64(3), object(4)
memory usage: 37.0+ MB
None


Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


#### データの前処理

In [6]:
# 3ドル以上2000ドル以下のデータのみを使用する
train = train[(train["price"] >= 3) & (train["price"] <= 2000)]

#### price を対数変換

In [7]:
train['price'] = np.log1p(train['price'])

#### 欠損値処理


In [8]:
print(train.isnull().sum())

train_id                  0
name                      0
item_condition_id         0
category_name          6314
brand_name           632336
price                     0
shipping                  0
item_description          6
dtype: int64


In [9]:
print(test.isnull().sum())

test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64


In [10]:
# 母数217万点に対して category_name の1万点は少ないため、missing に置き換えず単純に drop
train.dropna(subset=['category_name'], inplace=True)
test.dropna(subset=['category_name'], inplace=True)

#### train, test を行方向に結合し、まとめて前処理できるようにする

In [11]:
y = train['price'] # 目的変数
train.drop(['price'], axis=1, inplace=True)

# 列名を id に変更
train.rename(columns={'train_id': 'id'}, inplace=True)
test.rename(columns={'test_id': 'id'}, inplace=True)

# データフレームを結合
train_test_combine = pd.concat([train,test]) # axis=0 で行、axis=1 で列に結合 デフォルトは axis=0
print(train_test_combine.shape)
train_test_combine.head()

(2165645, 7)


Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity


In [12]:
# 欠損値を missing で埋める
# train_test_combine['category_name'].fillna(value='missing', inplace=True)
train_test_combine['brand_name'].fillna(value='missing', inplace=True)
train_test_combine['item_description'].fillna(value='missing', inplace=True)

In [13]:
%%time
# brand_name の重複なしのリストを作成
brand_name_list = set(train_test_combine["brand_name"].values) # set()で重複を削除

def brandfinder(line):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    
    if brand == 'missing':
        for x in namesplit:
            if x in brand_name_list:            
                return x # name(商品名全体)を返すのではなく、x(ブランド名)のみを返す
    if name in brand_name_list:
        return name
    
    return brand

# ブランド名の付替えを実施
train_test_combine['brand_name'] = train_test_combine[['brand_name','name']].apply(brandfinder, axis = 1) # axis=1で行方向

# brandfinderの中身
train_test_combine[['brand_name','name']].head(10)

CPU times: user 1min 49s, sys: 326 ms, total: 1min 49s
Wall time: 1min 49s


Unnamed: 0,brand_name,name
0,MLB,MLB Cincinnati Reds T Shirt Size XL
1,Razer,Razer BlackWidow Chroma Keyboard
2,Target,AVA-VIV Blouse
3,missing,Leather Horse Statues
4,missing,24K GOLD plated rose
5,missing,Bundled items requested for Ruie
6,Acacia Swimwear,Acacia pacific tides santorini top
7,Soffe,Girls cheer and tumbling bundle of 7
8,Nike,Girls Nike Pro shorts
9,missing,Porcelain clown doll checker pants VTG


#### category_name を3等分する

In [14]:
def split_cat(text):
    if text == 'missing':
        return ("No Label", "No Label", "No Label")
    try:
        return text.split("/")
    except:
        return ("No Label", "No Label", "No Label")

train_test_combine['general_cat'] = train_test_combine['category_name'].apply(lambda x: split_cat(x)[0])
train_test_combine['sub_cat1'] = train_test_combine['category_name'].apply(lambda x: split_cat(x)[1])
train_test_combine['sub_cat2'] = train_test_combine['category_name'].apply(lambda x: split_cat(x)[2])
display(train_test_combine.head())

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,general_cat,sub_cat1,sub_cat2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,MLB,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [15]:
# ベクトル化（カウントやTF-IDF）およびエンコーディング（ONE-HOT）の前にメモリをクリアしておく
gc.collect()

0

#### 特徴エンコーディング｜CountVectorizer(), TfidfVectorizer()

In [16]:
%%time
cnt_vec = CountVectorizer()
X_name = cnt_vec.fit_transform(train_test_combine['name'])

tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,3), stop_words='english')
X_descp = tfidf.fit_transform(train_test_combine['item_description'])

print('name vectorization shape:', X_name.shape)
print('item_description vectorization shape:', X_descp.shape)

name vectorization shape: (2165645, 130463)
item_description vectorization shape: (2165645, 50000)
CPU times: user 5min 49s, sys: 8.85 s, total: 5min 58s
Wall time: 5min 57s


#### 特徴エンコーディング｜OneHotEncoder()

In [17]:
%%time
ohe = OneHotEncoder()

# OneHotEncoder は2次元の入力を必要とするため、.reshape(-1, 1) で1次元の Series を2次元に変換
X_brand_name = ohe.fit_transform(train_test_combine['brand_name'].values.reshape(-1, 1))
X_item_condition_id = ohe.fit_transform(train_test_combine['item_condition_id'].values.reshape(-1, 1))
X_shipping = ohe.fit_transform(train_test_combine['shipping'].values.reshape(-1, 1))
X_general_cat = ohe.fit_transform(train_test_combine['general_cat'].values.reshape(-1, 1))
X_sub_cat1 = ohe.fit_transform(train_test_combine['sub_cat1'].values.reshape(-1, 1))
X_sub_cat2 = ohe.fit_transform(train_test_combine['sub_cat2'].values.reshape(-1, 1))

CPU times: user 3.93 s, sys: 81 ms, total: 4.01 s
Wall time: 4.01 s


In [18]:
print('brand encoding shape:', X_brand_name.shape)
print('item condition id encoding shape:', X_item_condition_id.shape)
print('shipping encoding shape:', X_shipping.shape)
print('general_cat encoding shape:', X_general_cat.shape)
print('sub_cat1 encoding shape:', X_sub_cat1.shape)
print('sub_cat2 encoding shape:', X_sub_cat2.shape)

brand encoding shape: (2165645, 5265)
item condition id encoding shape: (2165645, 5)
shipping encoding shape: (2165645, 2)
general_cat encoding shape: (2165645, 10)
sub_cat1 encoding shape: (2165645, 113)
sub_cat2 encoding shape: (2165645, 882)


#### 生成されたスパース行列を結合

In [19]:
combined_matrix_train = (X_name[:len(train)], X_descp[:len(train)], X_brand_name[:len(train)], X_item_condition_id[:len(train)], X_shipping[:len(train)],
                   X_general_cat[:len(train)], X_sub_cat1[:len(train)], X_sub_cat2[:len(train)])
X = hstack(combined_matrix_train).tocsr() # 説明変数

combined_matrix_test = (X_name[len(train):], X_descp[len(train):], X_brand_name[len(train):], X_item_condition_id[len(train):], X_shipping[len(train):],
                   X_general_cat[len(train):], X_sub_cat1[len(train):], X_sub_cat2[len(train):])
test = hstack(combined_matrix_test).tocsr() # テストデータ

In [20]:
print(X.shape) # 説明変数
print(y.shape) # 目的変数
print(test.shape) # テストデータ

(1475344, 186740)
(1475344,)
(690301, 186740)


#### X と y の保存

In [21]:
# %%time
# save_npz('X.npz', X)
# y.to_csv('y.csv', index=False)

#### X と y の読み込み

In [22]:
# Kaggle Notebook
# X = load_npz('/kaggle/input/mercari-preprocessed/X.npz')
# y = pd.read_csv('/kaggle/input/mercari-preprocessed/y.csv')

#### モデルの作成と学習

#### X と y のうち、7割を訓練用 (train)、3割を検証用 (test) に分割する

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [24]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1032740, 186740)
(1032740,)
(442604, 186740)
(442604,)


#### モデルの学習｜LightGBM

In [25]:
# データセットの形式を変更
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [26]:
# ハイパーパラメータの設定
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 15,
    'learning_rate': 1.0,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.6,
    'bagging_freq': 5,
    'verbosity': 1,
    'num_boost_round': 200,
    'early_stopping_rounds': 50,
    'seed': 0
}

In [27]:
%%time
gbm = lgb.train(params, train_data, valid_sets=valid_data)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 733.927786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1067750
[LightGBM] [Info] Number of data points in the train set: 1032740, number of used features: 65802
[LightGBM] [Info] Start training from score 2.981379
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[200]	valid_0's rmse: 0.488143
CPU times: user 2h 58min 3s, sys: 16.1 s, total: 2h 58min 19s
Wall time: 50min 26s


In [64]:
# モデルの保存
tag = 'lgb'
model_file = '{}_mercari.joblib'.format(tag)
dump(gbm, model_file)

['lgb_mercari.joblib']

#### モデルの学習｜Ridge回帰

In [29]:
%%time
ba = 4.3873371550240865
ridge = Ridge(alpha=ba, fit_intercept=True, solver='lsqr', random_state=0)
ridge.fit(X_train, y_train)

CPU times: user 2min 51s, sys: 1min 54s, total: 4min 45s
Wall time: 1min 31s


#### モデルの学習｜スタッキング

In [56]:
%%time
# スタッキング用の予測を取得
gbm_train_pred = gbm.predict(X_train)
gbm_test_pred = gbm.predict(X_test)
ridge_train_pred = br.predict(X_train)
ridge_test_pred = br.predict(X_test)

# スタッキングデータセットの作成
X_meta_train = np.column_stack((gbm_train_pred, ridge_train_pred))
X_meta_test = np.column_stack((gbm_test_pred, ridge_test_pred))

CPU times: user 3min 22s, sys: 156 ms, total: 3min 23s
Wall time: 52.2 s


In [57]:
%%time
# メタモデルの学習
mm = Ridge(alpha=ba, random_state=0)  # メタモデルとしてRidgeを使用
mm.fit(X_meta_train, y_train)

CPU times: user 59 ms, sys: 1.96 ms, total: 61 ms
Wall time: 53.7 ms


#### 訓練データに対する予測｜LightGBM

In [66]:
y_train_pred = gbm.predict(X_train)

In [67]:
# RMSLEを計算する関数
def rmsle(y_true, y_pred):  
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [68]:
lgb_train_rmsle = rmsle(np.expm1(y_train), np.expm1(y_train_pred))
print("RMSLE for train data:", lgb_train_rmsle)

RMSLE for train data: 0.48035921884709926


#### テストデータに対する予測｜LightGBM

In [69]:
y_test_pred = gbm.predict(X_test)

In [70]:
lgb_test_rmsle = rmsle(np.expm1(y_test), np.expm1(y_test_pred))
print("RMSLE for test data:", lgb_test_rmsle)

RMSLE for test data: 0.4881428696741681


#### 訓練データに対する予測｜Ridge回帰

In [37]:
y_train_pred = ridge.predict(X_train)

In [38]:
ridge_train_rmsle = rmsle(np.expm1(y_train), np.expm1(y_train_pred))
print("RMSLE for train data:", ridge_train_rmsle)

RMSLE for train data: 0.4307860204712816


#### テストデータに対する予測｜Ridge回帰

In [39]:
y_test_pred = ridge.predict(X_test)

In [40]:
ridge_test_rmsle = rmsle(np.expm1(y_test), np.expm1(y_test_pred))
print("RMSLE for test data:", ridge_test_rmsle)

RMSLE for test data: 0.4596450671927904


#### 訓練データに対する予測｜スタッキング

In [41]:
y_train_pred = y_train_pred = mm.predict(X_meta_train)

In [42]:
mm_train_rmsle = rmsle(np.expm1(y_train), np.expm1(y_train_pred))
print("RMSLE for train data:", mm_train_rmsle)

RMSLE for train data: 0.4296918563067199


#### テストデータに対する予測｜スタッキング

In [43]:
y_test_pred = y_train_pred = mm.predict(X_meta_test)

In [44]:
mm_test_rmsle = rmsle(np.expm1(y_test), np.expm1(y_test_pred))
print("RMSLE for test data:", mm_test_rmsle)

RMSLE for test data: 0.45744538512360433


#### 訓練データに対する予測｜単純平均

In [45]:
y_train_pred = (gbm.predict(X_train) + ridge.predict(X_train)) / 2

In [46]:
ens_ave_train_rmsle = rmsle(np.expm1(y_train), np.expm1(y_train_pred))
print("RMSLE for train data:", ens_ave_train_rmsle)

RMSLE for train data: 0.44095265664848343


#### テストデータに対する予測｜単純平均

In [47]:
y_test_pred = (gbm.predict(X_test) + ridge.predict(X_test)) / 2

In [48]:
ens_ave_test_rmsle = rmsle(np.expm1(y_test), np.expm1(y_test_pred))
print("RMSLE for test data:", ens_ave_test_rmsle)

RMSLE for test data: 0.45893794311422814


#### 訓練データに対する予測｜重み付き平均

In [73]:
y_train_pred = 0.4*gbm.predict(X_train) + 0.6*ridge.predict(X_train)

In [74]:
ens_wave_train_rmsle = rmsle(np.expm1(y_train), np.expm1(y_train_pred))
print("RMSLE for train data:", ens_wave_train_rmsle)

RMSLE for train data: 0.4364301418447715


#### テストデータに対する予測｜重み付き平均

In [75]:
y_test_pred = 0.4*gbm.predict(X_test) + 0.6*ridge.predict(X_test)

In [76]:
ens_wave_test_rmsle = rmsle(np.expm1(y_test), np.expm1(y_test_pred))
print("RMSLE for test data:", ens_wave_test_rmsle)

RMSLE for test data: 0.4566062065313864


#### 訓練データに対するRMSLEの比較

In [77]:
print(f"lgb_train_rmsle: {lgb_train_rmsle:.3f}")
print(f"ridge_train_rmsle: {ridge_train_rmsle:.3f}")
print(f"mm_train_rmsle: {mm_train_rmsle:.3f}")
print(f"ens_ave_train_rmsle: {ens_ave_train_rmsle:.3f}")
print(f"ens_weighted_ave_train_rmsle: {ens_wave_train_rmsle:.3f}")

lgb_train_rmsle: 0.480
ridge_train_rmsle: 0.431
mm_train_rmsle: 0.430
ens_ave_train_rmsle: 0.441
ens_weighted_ave_train_rmsle: 0.436


#### テストデータに対するRMSLEの比較

In [78]:
print(f"lgb_test_rmsle: {lgb_test_rmsle:.3f}")
print(f"ridge_test_rmsle: {ridge_test_rmsle:.3f}")
print(f"mm_test_rmsle: {mm_test_rmsle:.3f}")
print(f"ens_ave_test_rmsle: {ens_ave_test_rmsle:.3f}")
print(f"ens_weighted_ave_test_rmsle: {ens_wave_test_rmsle:.3f}")

lgb_test_rmsle: 0.488
ridge_test_rmsle: 0.460
mm_test_rmsle: 0.457
ens_ave_test_rmsle: 0.459
ens_weighted_ave_test_rmsle: 0.457


In [55]:
stop_real = datetime.now()
execution_time_real = stop_real - start_real
print(execution_time_real)

1:06:43.357953
