In [1]:
import pandas as pd
import numpy as np

#### 拡張子tsv.7zをコマンドプロンプトで解凍する必要がある
(7zを解凍する実行ファイルパス) e (解凍したい.tsv.7zのファイルパス)

"C:\Program Files\7-Zip\7z.exe" e C:\Users\1612h\Kaggle_PG\mercari-price-suggestion-challenge\train.tsv.7z

"C:\Program Files\7-Zip\7z.exe" e C:\Users\1612h\Kaggle_PG\mercari-price-suggestion-challenge\test.tsv.7z

"C:\Program Files\7-Zip\7z.exe" e C:\Users\1612h\Kaggle_PG\mercari-price-suggestion-challenge\sample_submission.csv.7z

## データの読み込み
delimiterは区切り文字の指定 tsvはタブ区切りだから\t

low_memoryはメモリの使用量を減らす

In [3]:
train = pd.read_csv("train.tsv", delimiter='\t', low_memory=True)
test = pd.read_csv("test.tsv", delimiter='\t', low_memory=True)

## 前処理・特徴量エンジニアリング
train：1,482,535個のユーザーが投稿した商品

test：693,359行で「価格（Price）」の項目がテストデータは含まれていないため、列数は「７」となっている 

* train_id / test _id – ユーザー投稿のID
* name – 投稿のタイトル。タイトルに価格に関する情報がある場合（例：$20）はメルカリが事前に削除をして[rm]と置き換えています。
* item_condition_id – ユーザーが指定した商品の状態
* category_name – 投稿カテゴリー
* brand_name – ブランドの名前
* price – 訓練データのみ。実際に売られた価格。米ドル表示。今回のチャレンジの予測ターゲットとなります。
* shipping – 送料のフラグ。「1」は販売者負担。「0」は購入者負担。
* item_description – ユーザーが投稿した商品説明の全文。タイトルと同様に価格情報がある場合は[rm]と置き換えられています。

In [4]:
print(train.shape)
train.head()

(1482535, 8)


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
print(test.shape)
test.head()

(693359, 7)


Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


### trainの統計量を表示　transposeで行と列を入れ替え

* train_id：uniqueが0で重複無し
* name：uniqueが多く、商品名が重複している
* brand_name：nameと同じで半分ほど重複している
* price：mean(平均値)が26.7ドル　最小値が0ドル、最大値が2009ドル
* item_description：top(最頻値)はNo description yet(商品説明なし)が82,489個で全体の5.6%

In [6]:
train.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
train_id,1482535.0,,,,741267.0,427971.135004,0.0,370633.5,741267.0,1111900.5,1482534.0
name,1482535.0,1225273.0,Bundle,2232.0,,,,,,,
item_condition_id,1482535.0,,,,1.90738,0.903159,1.0,1.0,2.0,3.0,5.0
category_name,1476208.0,1287.0,"Women/Athletic Apparel/Pants, Tights, Leggings",60177.0,,,,,,,
brand_name,849853.0,4809.0,PINK,54088.0,,,,,,,
price,1482535.0,,,,26.737516,38.586066,0.0,10.0,17.0,29.0,2009.0
shipping,1482535.0,,,,0.447274,0.497212,0.0,0.0,0.0,1.0,1.0
item_description,1482529.0,1281425.0,No description yet,82489.0,,,,,,,


### 機械学習では学習のために文字列(object)ではなく、数値に変換する

In [6]:
train.dtypes

train_id               int64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
price                float64
shipping               int64
item_description      object
dtype: object

name, category_name, brand_name, item_descriptionをカテゴリに変換する

In [7]:
#trainデータ
train.name = train.name.astype("category")
train.category_name = train.category_name.astype("category")
train.brand_name = train.brand_name.astype("category")
train.item_description = train.item_description.astype("category")

#testデータ
test.name = test.name.astype("category")
test.category_name = test.category_name.astype("category")
test.brand_name = test.brand_name.astype("category")
test.item_description = test.item_description.astype("category")

objectがcategoryに変換されたか確認

In [8]:
train.dtypes

train_id                int64
name                 category
item_condition_id       int64
category_name        category
brand_name           category
price                 float64
shipping                int64
item_description     category
dtype: object

### unique(固有の値)・欠損値を確認

category_name, brand_nameが固有の値が少なく、重複が多い

In [9]:
train.apply(lambda x: x.nunique())

train_id             1482535
name                 1225273
item_condition_id          5
category_name           1287
brand_name              4809
price                    828
shipping                   2
item_description     1281425
dtype: int64

category_name, brand_nameの欠損値が多い

In [10]:
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          6
dtype: int64

category_nameの欠損値の割合：0.43％

brand_nameの欠損値の割合：42.7％  特にこっちの欠損値を考慮するべき！

In [11]:
#category_nameの欠損値の割合(%)
train.category_name.isnull().sum() / train.shape[0] * 100

0.42676901388500105

In [12]:
train.brand_name.isnull().sum() / train.shape[0] * 100

42.675687251902986

## train,testを結合してまとめて処理する

trainにはprice列がありtestにはないため、price列を削除してから結合

In [13]:
train_test_combine = pd.concat([train.drop(["price"],axis=1), test], axis=0) #axis=0で行、axis=1で列に結合　デフォルトはaxis=0

In [14]:
train_test_combine.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description,test_id
0,0.0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet,
1,1.0,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,
2,2.0,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,
3,3.0,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,
4,4.0,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,


train：1,482,535個の商品

test：693,359個の商品

train_test_combine：2,175,894個の商品

In [15]:
train_test_combine.shape

(2175894, 8)

In [16]:
train_test_combine.dtypes


train_id             float64
name                  object
item_condition_id      int64
category_name         object
brand_name            object
shipping               int64
item_description      object
test_id              float64
dtype: object

categoryに変換したはずだが、objectに戻っていたため、categoryに変換する

In [17]:
train_test_combine.name = train_test_combine.name.astype("category")
train_test_combine.category_name = train_test_combine.category_name.astype("category")
train_test_combine.brand_name = train_test_combine.brand_name.astype("category")
train_test_combine.item_description = train_test_combine.item_description.astype("category")

categoryに変換されたか確認

In [18]:
train_test_combine.dtypes

train_id              float64
name                 category
item_condition_id       int64
category_name        category
brand_name           category
shipping                int64
item_description     category
test_id               float64
dtype: object

In [19]:
train_test_combine.train_id = train_test_combine.train_id.fillna(pd.Series(train_test_combine.index))
train_test_combine.test_id = train_test_combine.test_id.fillna(pd.Series(train_test_combine.index))

train_test_combine.train_id = train_test_combine.train_id.astype(np.int64)
train_test_combine.test_id = train_test_combine.test_id.astype(np.int64)

train_test_combine.name = train_test_combine.name.cat.codes
train_test_combine.category_name = train_test_combine.category_name.cat.codes
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes
train_test_combine.item_description = train_test_combine.item_description.cat.codes

print('前処理完了"')

df_train = train_test_combine.iloc[:train.shape[0],:]
df_test = train_test_combine.iloc[train.shape[0]:,:]

# #df_trainでtest_idを削除
# df_train = df_train.drop(["test_id"], axis=1)
# #df_testでtrain_idを削除
# df_test = df_test.drop(["train_id"], axis=1)

# df_test = df_test[["test_id"] + [col for col in df_test.columns if col != "test_id"]]

df_train["price"] = train.price
print("学習開始")

前処理完了"
学習開始


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["price"] = train.price


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(["price"], axis=1), df_train.price, test_size=0.2, random_state=42)

# clf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
# clf.fit(X_train, y_train)

# print(clf.score(X_train, y_train))

# 保存したモデルをロードする
import pickle
loaded_model = pickle.load(open("model_randomforest.pkl", 'rb'))
result = loaded_model.score(X_train, y_train)
print(result)

# 作成したランダムフォレストのモデル「m」に「df_test」を入れて予測する
preds = loaded_model.predict(df_test)
# 予測値 predsをnp.exp()で処理
np.exp(preds)
# Numpy配列からpandasシリーズへ変換
preds = pd.Series(np.exp(preds))
# テストデータのIDと予測値を連結
submit = pd.concat([df_test.id, preds], axis=1)
# カラム名をメルカリの提出指定の名前をつける
submit.columns = ['test_id', 'price']
# 提出ファイルとしてCSVへ書き出し
submit.to_csv('submit_rf_base.csv', index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]