In [1]:
from datetime import datetime
start_real = datetime.now() # 全体の処理時間の計測を開始する

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter("ignore")

#### データの読み込み

In [2]:
train = pd.read_csv("/kaggle/input/mercari-dataset/train.tsv", delimiter='\t')
test = pd.read_csv("/kaggle/input/mercari-dataset/test.tsv", delimiter='\t')

#### データの全体像
train：1,482,535個のユーザーが投稿した商品

test：693,359行で「価格（Price）」の項目がテストデータは含まれていないため、列数は「7」となっている 

* train_id / test _id – ユーザー投稿のID
* name – 投稿のタイトル。タイトルに価格に関する情報がある場合（例：$20）はメルカリが事前に削除をして[rm]と置き換えている。
* item_condition_id – ユーザーが指定した商品の状態(1~5,大きい方が状態が良い)
* category_name – 投稿カテゴリー
* brand_name – ブランドの名前
* price – 訓練データのみ。実際に売られた価格。米ドル表示。今回のチャレンジの予測ターゲットとなる。
* shipping – 送料のフラグ。「1」は販売者負担。「0」は購入者負担。
* item_description – ユーザーが投稿した商品説明の全文。タイトルと同様に価格情報がある場合は[rm]と置き換えられている。

In [3]:
print(train.shape)
print(train.info())
display(train.head())

(1482535, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482529 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB
None


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
print(test.shape)
print(test.info())
display(test.head())

(693359, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   test_id            693359 non-null  int64 
 1   name               693359 non-null  object
 2   item_condition_id  693359 non-null  int64 
 3   category_name      690301 non-null  object
 4   brand_name         397834 non-null  object
 5   shipping           693359 non-null  int64 
 6   item_description   693359 non-null  object
dtypes: int64(3), object(4)
memory usage: 37.0+ MB
None


Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


#### ----------------- price -----------------

#### 3ドル未満のレコードを削除

In [5]:
train = train.drop(train[(train.price < 3.0)].index)
print(train.shape)
print(train["price"].min()) #priceの最小値
print(train["price"].max()) #priceの最大値

(1481661, 8)
3.0
2009.0


1,482,535→1,481,661点となり、874の商品が削除された

#### priceを対数変換して正規化

In [6]:
# 訓練データの'price'を対数変換する
train['price'] = np.log1p(train['price'])

#### ----------------- category_name -----------------

#### general_category, sub_category1, sub_category2 に3等分する

In [7]:
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label") #categoryがない場合はNo Labelを返す

train['general_cat'], train['sub_cat1'], train['sub_cat2'] = zip(*train['category_name'].apply(lambda x: split_cat(x))) #zip(*df)で転置
train.head()
test['general_cat'], test['sub_cat1'], test['sub_cat2'] = zip(*test['category_name'].apply(lambda x: split_cat(x)))
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description,general_cat,sub_cat1,sub_cat2
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7,Women,Jewelry,Rings
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",Other,Office supplies,Shipping Supplies
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...,Vintage & Collectibles,Bags and Purses,Handbag
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...,Women,Sweaters,Cardigan
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...,Other,Books,Religion & Spirituality


#### train,testを縦方向に結合し、まとめて前処理できるようにする
> 変更点  
> trainデータフレームとtestデータフレームをpd.concat([train, test])で結合すると、1列目の列名が異なるため、test_id列が末尾にNaNとして挿入されてしまう。  
> これを避けるために、一度train_idとtest_idをidに変更し、列名を統一してから結合する。

In [8]:
# 列名をidに変更
train.rename(columns={'train_id': 'id'}, inplace=True) # <---
test.rename(columns={'test_id': 'id'}, inplace=True) # <---

# データフレームを結合
train_test_combine = pd.concat([train,test]) #axis=0で行、axis=1で列に結合 デフォルトはaxis=0
print(train_test_combine.shape)
train_test_combine.head()

(2175020, 11)


Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,2.397895,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,3.970292,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,2.397895,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,3.583519,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,3.806662,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


train：1,482,535個の商品  
test：693,359個の商品  
train_test_combine：2,175,894個の商品

#### ----------------- brand_name -----------------

#### 欠損値処理
1. ブランド名の欠損値(NaN)を"missing"に置き換える
2. このままだと欠損値であることに変わりないので、"name(商品名)"が、ブランド名のリストに含まれる場合、ブランド名の"missing"を"name(商品名)"に書き換える

これにより、ブランド名(brand_name)が記載されていない商品でも、商品名(name)からブランド名を取得し、代入することができる

In [9]:
#brand_nameの重複なしのリストを作成
brand_name_list = set(train_test_combine["brand_name"].values) #set()で重複を削除

# 'brand_name'の欠損値NaNを'missing'に置き換える
train['brand_name'].fillna(value='missing', inplace=True)
test['brand_name'].fillna(value='missing', inplace=True)

# 訓練データの'brand_name'が'missing'に一致するレコード数を取得
train_premissing = len(train.loc[train['brand_name'] == 'missing'])
# テストデータの'brand_name'が'missing'に一致するレコード数を取得
test_premissing = len(test.loc[test['brand_name'] == 'missing'])

train_premissing, test_premissing #欠損値の数

(632336, 295525)

brand_name(trainデータ): 63万点のうち、13.7万点がブランド名を表すデータに書き換えられた  
brand_name(testデータ): 29万点のうち、6.4万点がブランド名を表すデータに書き換えられた

In [10]:
%%time
def brandfinder(line):
    brand = line[0] # 第1要素はブランド名
    name = line[1]  # 第2要素は商品名
    namesplit = name.split(' ') # 商品名をスペースで切り分ける
    
    if brand == 'missing':  # ブランド名が'missing'の場合
        for x in namesplit: # 商品名から切り分けた単語を取り出す
            if x in brand_name_list: # 単語がブランドリストに存在すればブランド名を返す              
                return name # 単語がブランドリストに一致したら商品名を返す
    if name in brand_name_list:  # 商品名がブランドリストに存在すれば商品名を返す
        return name
    
    return brand            # どれにも一致しなければブランド名を返す

# ブランド名の付替えを実施
train['brand_name'] = train[['brand_name','name']].apply(brandfinder, axis = 1) #axis=1で行方向
test['brand_name'] = test[['brand_name','name']].apply(brandfinder, axis = 1)

# 書き換えられた'missing'の数を取得
train_found = train_premissing-len(train.loc[train['brand_name'] == 'missing'])
test_found = test_premissing-len(test.loc[test['brand_name'] == 'missing'])

print(train_premissing) # 書き換える前の'missing'の数
print(train_found)      # 書き換えられた'missing'の数
print(test_premissing)  # 書き換える前の'missing'の数
print(test_found)       # 書き換えられた'missing'の数

632336
137342
295525
64154
CPU times: user 1min 46s, sys: 294 ms, total: 1min 47s
Wall time: 1min 47s


In [11]:
train.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,MLB Cincinnati Reds T Shirt Size XL,2.397895,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,3.970292,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,2.397895,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,missing,3.583519,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,missing,3.806662,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


たとえば一行目の "brand_name" は missing → MLB Cincinnati となり、missingが"name(商品名)" に書き換えられたことがわかる

#### ----------------- エンコーディング (文字列→数値データ) -----------------

- name
- category_name
- brand_name
- item_description

の欠損値(NaN)を"missing"で埋める

In [12]:
#fillna()で欠損値を埋める
train_test_combine['name'].fillna(value='missing', inplace=True)
train_test_combine['category_name'].fillna(value='missing', inplace=True)
train_test_combine['general_cat'].fillna(value='missing', inplace=True)
train_test_combine['sub_cat1'].fillna(value='missing', inplace=True)
train_test_combine['sub_cat2'].fillna(value='missing', inplace=True)
train_test_combine['brand_name'].fillna(value='missing', inplace=True)
train_test_combine['item_description'].fillna(value='missing', inplace=True)

カテゴリカルデータ（離散的なカテゴリを持つデータ）である  
- category_name
- general_cat
- sub_cat1
- sub_cat2
- brand_name

を数値に変換

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

#カテゴリ変数を数値に変換
# train_test_combine['name'] = le.fit_transform(train_test_combine['name'])
train_test_combine['category_name'] = le.fit_transform(train_test_combine['category_name'])
train_test_combine['general_cat'] = le.fit_transform(train_test_combine['general_cat'])
train_test_combine['sub_cat1'] = le.fit_transform(train_test_combine['sub_cat1'])
train_test_combine['sub_cat2'] = le.fit_transform(train_test_combine['sub_cat2'])
train_test_combine['brand_name'] = le.fit_transform(train_test_combine['brand_name'])
# train_test_combine['item_description'] = le.fit_transform(train_test_combine['item_description'])

display(train_test_combine.head())

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5263,2.397895,1,No description yet,5,103,774
1,1,Razer BlackWidow Chroma Keyboard,3,86,3887,3.970292,0,This keyboard is in great condition and works ...,1,30,215
2,2,AVA-VIV Blouse,1,1277,4586,2.397895,1,Adorable top with a hint of lace and a key hol...,10,104,97
3,3,Leather Horse Statues,1,503,5263,3.583519,1,New with tags. Leather horses. Retail for [rm]...,3,55,410
4,4,24K GOLD plated rose,1,1204,5263,3.806662,0,Complete with certificate of authenticity,10,58,542


#### name, item_descriptionをエンコード(自然言語処理)
name, item_descriptionは複数の単語で構成されるので、単純なエンコード(数値変換)ではなく、KerasのTokenizerクラスを使ってエンコード

Tokenizerとは

* 英文のテキストデータをトークン(最小単位)に分解する
* 各トークンに通し番号(インデックス)を割り振ることで実数ベクトルに変換する

分かりやすいzenn記事：https://zenn.dev/robes/articles/b6708032855a9c

In [14]:
%%time
from tensorflow.keras.preprocessing.text import Tokenizer

print("Transforming text data to sequences...")
raw_text = np.hstack(
    [train_test_combine.item_description.str.lower(), # 説明文
     train_test_combine.name.str.lower()]           # 商品名
)
print('sequences shape', raw_text.shape)

# 説明文、商品名、カテゴリ名を連結した配列でTokenizerを作る
print("   Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

# Tokenizerで説明文、商品名をそれぞれラベルエンコードする
print("   Transforming text to sequences...")
train_test_combine['seq_item_description'] = tok_raw.texts_to_sequences(train_test_combine.item_description.str.lower())
train_test_combine['seq_name'] = tok_raw.texts_to_sequences(train_test_combine.name.str.lower())

del tok_raw #delは変数を削除する

print(train_test_combine.seq_item_description.head())
print(train_test_combine.seq_name.head())

Transforming text data to sequences...
sequences shape (4350040,)
   Fitting tokenizer...
   Transforming text to sequences...
0                                         [12, 68, 79]
1    [29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...
2    [597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...
3    [5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...
4                             [801, 9, 7022, 11, 2013]
Name: seq_item_description, dtype: object
0    [3882, 8986, 6977, 208, 84, 6, 155]
1            [11402, 27914, 17350, 2666]
2                     [7811, 10858, 666]
3                     [178, 2603, 14008]
4                 [4962, 103, 1032, 280]
Name: seq_name, dtype: object
CPU times: user 4min 11s, sys: 3.05 s, total: 4min 14s
Wall time: 4min 19s


#### 0でパディング(埋める)して配列の長さを揃える

自然言語処理の学習における入力データは固定長化する必要がある

参考サイト；https://moneyforward-dev.jp/entry/2021/10/05/transformers-tokenizer/

In [15]:
%%time
from keras.preprocessing.sequence import pad_sequences
print(pad_sequences(train_test_combine.seq_item_description, maxlen=80),'\n') # 商品説明
print(pad_sequences(train_test_combine.seq_name, maxlen=10))                  # 商品名

[[    0     0     0 ...    12    68    79]
 [    0     0     0 ...    14    49   996]
 [    0     0     0 ...   175     7    59]
 ...
 [    0     0     0 ...    20 63502    88]
 [    0     0     0 ...     4   369    71]
 [    0     0     0 ...     4   593   635]] 

[[    0     0     0 ...    84     6   155]
 [    0     0     0 ... 27914 17350  2666]
 [    0     0     0 ...  7811 10858   666]
 ...
 [    0     0     0 ...   401  1559    88]
 [    0     0     0 ...   325   281  2230]
 [    0     0     0 ...   892   725    69]]
CPU times: user 24 s, sys: 216 ms, total: 24.2 s
Wall time: 24.2 s


In [16]:
train_test_combine.head()

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5263,2.397895,1,No description yet,5,103,774,"[12, 68, 79]","[3882, 8986, 6977, 208, 84, 6, 155]"
1,1,Razer BlackWidow Chroma Keyboard,3,86,3887,3.970292,0,This keyboard is in great condition and works ...,1,30,215,"[29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...","[11402, 27914, 17350, 2666]"
2,2,AVA-VIV Blouse,1,1277,4586,2.397895,1,Adorable top with a hint of lace and a key hol...,10,104,97,"[597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...","[7811, 10858, 666]"
3,3,Leather Horse Statues,1,503,5263,3.583519,1,New with tags. Leather horses. Retail for [rm]...,3,55,410,"[5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...","[178, 2603, 14008]"
4,4,24K GOLD plated rose,1,1204,5263,3.806662,0,Complete with certificate of authenticity,10,58,542,"[801, 9, 7022, 11, 2013]","[4962, 103, 1032, 280]"


#### name, item_descriptionの単語数をカウントした列を追加

> 変更点  
> - **正規表現によるクリーニングの追加**: テキストから句読点や数字、改行、タブ文字を削除し、クリーンなテキストを生成することで、単語数のカウントをより正確にする。  
> - **ストップワードの除去の追加**: 一般的な英語のストップワード（情報検索の際に無視される単語）を除去し、実際の情報量に基づいた単語数をカウントする。また、3文字未満の単語も除外することで、より有意義な単語のみを対象とする。

In [17]:
%%time
import re # <---
import string # <---
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # <---

def wordCount(text):
    """
    Parameters:
      text(str): 商品名、商品の説明文
    """
    try:
        if text == 'No description yet':
            return 1  # <--- 商品名や説明が'No description yet'の場合は1を返す
        else:
            text = text.lower()  # <--- すべて小文字にする
            # <--- 正規表現を使用して句読点や数字を削除
            regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
            text = regex.sub(" ", text)
            # <--- ストップワードを除去
            words = [w for w in text.split() if w and w not in ENGLISH_STOP_WORDS and len(w) >= 3]
            return len(words)  # 単語の数を返す
    except: 
        return 0


# 'name'の各フィールドの単語数を'name_len'に登録
train_test_combine['name_len'] = train_test_combine['name'].apply(lambda x: wordCount(x))
# 'item_description'の各フィールドの単語数を'desc_len'に登録
train_test_combine['desc_len'] = train_test_combine['item_description'].apply(lambda x: wordCount(x))

CPU times: user 1min 7s, sys: 13.5 ms, total: 1min 7s
Wall time: 1min 7s


In [18]:
print(train_test_combine.shape)
print(train_test_combine.info())
display(train_test_combine.head())

(2175020, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 2175020 entries, 0 to 693358
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   name                  object 
 2   item_condition_id     int64  
 3   category_name         int64  
 4   brand_name            int64  
 5   price                 float64
 6   shipping              int64  
 7   item_description      object 
 8   general_cat           int64  
 9   sub_cat1              int64  
 10  sub_cat2              int64  
 11  seq_item_description  object 
 12  seq_name              object 
 13  name_len              int64  
 14  desc_len              int64  
dtypes: float64(1), int64(10), object(4)
memory usage: 265.5+ MB
None


Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name,name_len,desc_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5263,2.397895,1,No description yet,5,103,774,"[12, 68, 79]","[3882, 8986, 6977, 208, 84, 6, 155]",5,1
1,1,Razer BlackWidow Chroma Keyboard,3,86,3887,3.970292,0,This keyboard is in great condition and works ...,1,30,215,"[29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...","[11402, 27914, 17350, 2666]",4,16
2,2,AVA-VIV Blouse,1,1277,4586,2.397895,1,Adorable top with a hint of lace and a key hol...,10,104,97,"[597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...","[7811, 10858, 666]",3,9
3,3,Leather Horse Statues,1,503,5263,3.583519,1,New with tags. Leather horses. Retail for [rm]...,3,55,410,"[5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...","[178, 2603, 14008]",3,17
4,4,24K GOLD plated rose,1,1204,5263,3.806662,0,Complete with certificate of authenticity,10,58,542,"[801, 9, 7022, 11, 2013]","[4962, 103, 1032, 280]",3,3


In [19]:
# 欠損値が残っていないか確認
print(train_test_combine.isnull().sum())

id                           0
name                         0
item_condition_id            0
category_name                0
brand_name                   0
price                   693359
shipping                     0
item_description             0
general_cat                  0
sub_cat1                     0
sub_cat2                     0
seq_item_description         0
seq_name                     0
name_len                     0
desc_len                     0
dtype: int64


In [20]:
print(train_test_combine.shape)
print(train_test_combine.info())
display(train_test_combine.head())

(2175020, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 2175020 entries, 0 to 693358
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   name                  object 
 2   item_condition_id     int64  
 3   category_name         int64  
 4   brand_name            int64  
 5   price                 float64
 6   shipping              int64  
 7   item_description      object 
 8   general_cat           int64  
 9   sub_cat1              int64  
 10  sub_cat2              int64  
 11  seq_item_description  object 
 12  seq_name              object 
 13  name_len              int64  
 14  desc_len              int64  
dtypes: float64(1), int64(10), object(4)
memory usage: 265.5+ MB
None


Unnamed: 0,id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name,name_len,desc_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5263,2.397895,1,No description yet,5,103,774,"[12, 68, 79]","[3882, 8986, 6977, 208, 84, 6, 155]",5,1
1,1,Razer BlackWidow Chroma Keyboard,3,86,3887,3.970292,0,This keyboard is in great condition and works ...,1,30,215,"[29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...","[11402, 27914, 17350, 2666]",4,16
2,2,AVA-VIV Blouse,1,1277,4586,2.397895,1,Adorable top with a hint of lace and a key hol...,10,104,97,"[597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...","[7811, 10858, 666]",3,9
3,3,Leather Horse Statues,1,503,5263,3.583519,1,New with tags. Leather horses. Retail for [rm]...,3,55,410,"[5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...","[178, 2603, 14008]",3,17
4,4,24K GOLD plated rose,1,1204,5263,3.806662,0,Complete with certificate of authenticity,10,58,542,"[801, 9, 7022, 11, 2013]","[4962, 103, 1032, 280]",3,3


#### 前処理後の結合したデータ(train_test_combine)を再びtrain,testデータに分割する
> 変更点  
> idとした列名を、再びtrain_id, test_idに戻す

In [21]:
#train_test_combineからtrainとtestに分割
train = train_test_combine[:len(train)]
train.rename(columns={'id': 'train_id'}, inplace=True) # <---

test = train_test_combine[len(train):]
test.rename(columns={'id': 'test_id'}, inplace=True) # <---


In [22]:
print(train.shape)
print(train.info())
display(train.head())

(1481661, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 1481661 entries, 0 to 1482534
Data columns (total 15 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   train_id              1481661 non-null  int64  
 1   name                  1481661 non-null  object 
 2   item_condition_id     1481661 non-null  int64  
 3   category_name         1481661 non-null  int64  
 4   brand_name            1481661 non-null  int64  
 5   price                 1481661 non-null  float64
 6   shipping              1481661 non-null  int64  
 7   item_description      1481661 non-null  object 
 8   general_cat           1481661 non-null  int64  
 9   sub_cat1              1481661 non-null  int64  
 10  sub_cat2              1481661 non-null  int64  
 11  seq_item_description  1481661 non-null  object 
 12  seq_name              1481661 non-null  object 
 13  name_len              1481661 non-null  int64  
 14  desc_len              148

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name,name_len,desc_len
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5263,2.397895,1,No description yet,5,103,774,"[12, 68, 79]","[3882, 8986, 6977, 208, 84, 6, 155]",5,1
1,1,Razer BlackWidow Chroma Keyboard,3,86,3887,3.970292,0,This keyboard is in great condition and works ...,1,30,215,"[29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...","[11402, 27914, 17350, 2666]",4,16
2,2,AVA-VIV Blouse,1,1277,4586,2.397895,1,Adorable top with a hint of lace and a key hol...,10,104,97,"[597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...","[7811, 10858, 666]",3,9
3,3,Leather Horse Statues,1,503,5263,3.583519,1,New with tags. Leather horses. Retail for [rm]...,3,55,410,"[5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...","[178, 2603, 14008]",3,17
4,4,24K GOLD plated rose,1,1204,5263,3.806662,0,Complete with certificate of authenticity,10,58,542,"[801, 9, 7022, 11, 2013]","[4962, 103, 1032, 280]",3,3


In [23]:
print(test.shape)
print(test.info())
display(test.head())

(693359, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 693359 entries, 0 to 693358
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   test_id               693359 non-null  int64  
 1   name                  693359 non-null  object 
 2   item_condition_id     693359 non-null  int64  
 3   category_name         693359 non-null  int64  
 4   brand_name            693359 non-null  int64  
 5   price                 0 non-null       float64
 6   shipping              693359 non-null  int64  
 7   item_description      693359 non-null  object 
 8   general_cat           693359 non-null  int64  
 9   sub_cat1              693359 non-null  int64  
 10  sub_cat2              693359 non-null  int64  
 11  seq_item_description  693359 non-null  object 
 12  seq_name              693359 non-null  object 
 13  name_len              693359 non-null  int64  
 14  desc_len              693359 non-null  int64

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name,name_len,desc_len
0,0,"Breast cancer ""I fight like a girl"" ring",1,1205,5263,,1,Size 7,10,58,667,"[6, 64]","[1789, 3475, 15, 5555, 51, 4, 266, 263]",6,1
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,899,5263,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",7,72,701,"[439, 802, 5, 64, 34, 9038, 5187, 616, 1024, 7...","[439, 802, 5, 64, 34, 9038, 5187, 616, 1024]",5,30
2,2,Coach bag,1,1000,1093,,1,Brand new coach bag. Bought for [rm] at a Coac...,9,7,382,"[16, 5, 315, 101, 201, 3, 21, 107, 4, 315, 3689]","[315, 101]",2,7
3,3,Floral Kimono,2,1256,5263,,0,-floral kimono -never worn -lightweight and pe...,10,97,166,"[281, 2230, 37, 30, 577, 1, 93, 3, 350, 1844]","[281, 2230]",2,7
4,4,Life after Death,3,860,5263,,1,Rediscovering life after the loss of a loved o...,7,14,662,"[98827, 473, 286, 2, 1504, 11, 4, 1093, 46, 10...","[473, 286, 5470]",2,16


#### 前処理後データの保存

In [24]:
# train.to_csv('train_preprocessed.csv', index=False)
# test.to_csv('test_preprocessed.csv', index=False)

#### -------------------- 前処理終了 --------------------

#### モデルの作成と学習

#### 説明変数と目的変数の数値を分ける
> 変更点  
> エンコーディングする前の列は説明変数に加えない。  
> 情報の重複と多重共線性のリスクを避けるためである。エンコーディングされたデータは元のテキストデータの情報を包含しているため、元の列を残すと過剰な情報がモデルの性能を低下させる可能性がある。

In [25]:
X = train.drop(columns=['train_id', 'name', 'category_name', 'price', 'item_description']) # <---
y = train['price']
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

X: (1481661, 10)


Unnamed: 0,item_condition_id,brand_name,shipping,general_cat,sub_cat1,sub_cat2,seq_item_description,seq_name,name_len,desc_len
0,3,5263,1,5,103,774,"[12, 68, 79]","[3882, 8986, 6977, 208, 84, 6, 155]",5,1
1,3,3887,0,1,30,215,"[29, 2666, 10, 7, 38, 17, 1, 206, 51, 19, 1099...","[11402, 27914, 17350, 2666]",4,16
2,1,4586,1,10,104,97,"[597, 60, 9, 4, 5351, 11, 192, 1, 4, 900, 1299...","[7811, 10858, 666]",3,9
3,1,5263,1,3,55,410,"[5, 9, 61, 178, 6621, 229, 3, 21, 166, 1086, 2...","[178, 2603, 14008]",3,17
4,1,5263,0,10,58,542,"[801, 9, 7022, 11, 2013]","[4962, 103, 1032, 280]",3,3


y: (1481661,)
0    2.397895
1    3.970292
2    2.397895
3    3.583519
4    3.806662
Name: price, dtype: float64


#### Xとyのうち、7割を訓練用(train)、3割を検証用(test)に分割する

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [27]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1037162, 10)
(1037162,)
(444499, 10)
(444499,)


#### モデルの学習

In [28]:
%%time
import lightgbm as lgb

lgbm = lgb.LGBMRegressor(n_estimators=200, max_depth=10, random_state=0, n_jobs=-1)
lgbm.fit(X_train, y_train)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: seq_item_description: object, seq_name: object

#### モデルの特徴重要度

In [None]:
ser_fi = pd.Series(lgbm.feature_importances_, index=X_train.columns)
print(ser_fi.sort_values(ascending=False))

In [None]:
plt.bar(X_train.columns, lgbm.feature_importances_)
plt.ylabel('Importance')
plt.xticks(rotation=90)
plt.show()

#### 訓練データに対する予測

In [None]:
y_train_pred = lgbm.predict(X_train)

In [None]:
y_min = np.append(y_train_pred, y_train).min()
y_max = np.append(y_train_pred, y_train).max()
y_margin = (y_max - y_min) * 0.1
y_min -= y_margin
y_max += y_margin

In [None]:
plt.scatter(y_train_pred, y_train, alpha=0.3)
plt.plot([y_min,y_max], [y_min,y_max])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(y_min, y_max)
plt.ylim(y_min, y_max)
plt.title('Train data')
plt.xlabel('Pred')
plt.ylabel('True')
plt.show()

In [None]:
plt.scatter(y_train_pred, y_train, alpha=0.3)
plt.plot([y_min,y_max], [y_min,y_max])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(0, 400) # <---
plt.ylim(0, 400) # <---
plt.title('Train data')
plt.xlabel('Pred')
plt.ylabel('True')
plt.show()

In [None]:
from sklearn.metrics import mean_squared_log_error

# RMSLE関数の定義
def rmsle(y_true, y_pred):
    # クリップして無限大や非常に大きな値を避ける
    y_true_clipped = np.clip(y_true, 0, np.log(np.finfo(np.float64).max))
    y_pred_clipped = np.clip(y_pred, 0, np.log(np.finfo(np.float64).max))
    
    return np.sqrt(mean_squared_log_error(y_true_clipped, y_pred_clipped))

In [None]:
# モデルの評価
print("RMSLE for train data:", rmsle(np.expm1(y_train), np.expm1(y_train_pred)))

#### テストデータに対する予測

In [None]:
y_test_pred = lgbm.predict(X_test)

In [None]:
y_min = np.append(y_test_pred, y_test).min()
y_max = np.append(y_test_pred, y_test).max()
y_margin = (y_max - y_min) * 0.1
y_min -= y_margin
y_max += y_margin

In [None]:
plt.scatter(y_test_pred, y_test, alpha=0.3)
plt.plot([y_min,y_max], [y_min,y_max])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(y_min, y_max)
plt.ylim(y_min, y_max)
plt.title('Test data')
plt.xlabel('Pred')
plt.ylabel('True')
plt.show()

In [None]:
plt.scatter(y_test_pred, y_test, alpha=0.3)
plt.plot([y_min,y_max], [y_min,y_max])
plt.gca().set_aspect('equal', adjustable='box')
plt.xlim(0, 400) # <---
plt.ylim(0, 400) # <---
plt.title('Test data')
plt.xlabel('Pred')
plt.ylabel('True')
plt.show()

In [None]:
# モデルの評価
print("RMSLE for test data:", rmsle(np.expm1(y_test), np.expm1(y_test_pred)))

In [None]:
stop_real = datetime.now()
execution_time_real = stop_real - start_real
print(execution_time_real)