In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.float_format', lambda x:'%.5f' % x)
import numpy as np

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")
  from numpy.core.umath_tests import inner1d


In [2]:
# データタイプを指定
types_dict_train = {'train_id':'int64', 'item_condition_id':'int8', 'price':'float64', 'shipping':'int8'}
types_dict_test = {'test_id':'int64', 'item_condition_id':'int8', 'shipping':'int8'}
# tsvファイルからPandas DataFrameへ読み込み
train = pd.read_csv('./input/train.tsv', delimiter='\t', low_memory=True, dtype=types_dict_train)
test = pd.read_csv('./input/test.tsv', delimiter='\t', low_memory=True, dtype=types_dict_test)


In [3]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)
# trainの基本統計量を表示
display_all(train.describe(include='all').transpose())

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
train_id,1482535.0,,,,741267.0,427971.135,0.0,370633.5,741267.0,1111900.5,1482534.0
name,1482535.0,1225273.0,Bundle,2232.0,,,,,,,
item_condition_id,1482535.0,,,,1.90738,0.90316,1.0,1.0,2.0,3.0,5.0
category_name,1476208.0,1287.0,"Women/Athletic Apparel/Pants, Tights, Leggings",60177.0,,,,,,,
brand_name,849853.0,4809.0,PINK,54088.0,,,,,,,
price,1482535.0,,,,26.73752,38.58607,0.0,10.0,17.0,29.0,2009.0
shipping,1482535.0,,,,0.44727,0.49721,0.0,0.0,0.0,1.0,1.0
item_description,1482531.0,1281426.0,No description yet,82489.0,,,,,,,


In [4]:
# train, test のカテゴリ名、商品説明、投稿タイトル、ブランド名のデータタイプを「category」へ変換する
for df in train, test:
    df.category_name = df.category_name.astype('category')
    df.item_description = df.item_description.astype('category')
    df.name = df.name.astype('category')
    df.brand_name = df.brand_name.astype('category')

# dtypesで念のためデータ形式を確認しましょう
train.dtypes, test.dtypes

(train_id                int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 price                 float64
 shipping                 int8
 item_description     category
 dtype: object, test_id                 int64
 name                 category
 item_condition_id        int8
 category_name        category
 brand_name           category
 shipping                 int8
 item_description     category
 dtype: object)

In [5]:
# trainの中のユニークな値を確認する
train.apply(lambda x: x.nunique())
# testの中のユニークな値を確認する
test.apply(lambda x: x.nunique())

test_id              693359
name                 601117
item_condition_id         5
category_name          1223
brand_name             3900
shipping                  2
item_description     609555
dtype: int64

In [6]:
# trainの欠損データの個数と%を確認
train.isnull().sum(),train.isnull().sum()/train.shape[0]
# testの欠損データの個数と%を確認
test.isnull().sum(),test.isnull().sum()/test.shape[0]

(test_id                   0
 name                      0
 item_condition_id         0
 category_name          3058
 brand_name           295525
 shipping                  0
 item_description          0
 dtype: int64, test_id             0.00000
 name                0.00000
 item_condition_id   0.00000
 category_name       0.00441
 brand_name          0.42622
 shipping            0.00000
 item_description    0.00000
 dtype: float64)

In [7]:
# train, test dataの文字列を「.cat.codes」で数値へ変換する
for df in train, test:
    df.name = df.name.cat.codes
    df.category_name = df.category_name.cat.codes
    df.brand_name = df.brand_name.cat.codes
    df.item_description = df.item_description.cat.codes
# データの中身とデータ形式を表示して確認しましょう
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,640809,3,808,-1,10.0,1,806610
1,1,903932,3,86,3557,52.0,0,1090878
2,2,91532,1,1254,4180,10.0,1,115289
3,3,561143,1,485,-1,35.0,1,782305
4,4,45483,1,1181,-1,44.0,0,366652


In [8]:
train.dtypes

train_id               int64
name                   int32
item_condition_id       int8
category_name          int16
brand_name             int16
price                float64
shipping                int8
item_description       int32
dtype: object

In [9]:
# price（価格）をlog関数で処理
train['price'] = train['price'].apply(lambda x: np.log(x) if x>0 else x)
# trainを表示して確認
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,640809,3,808,-1,2.30259,1,806610
1,1,903932,3,86,3557,3.95124,0,1090878
2,2,91532,1,1254,4180,2.30259,1,115289
3,3,561143,1,485,-1,3.55535,1,782305
4,4,45483,1,1181,-1,3.78419,0,366652


In [10]:
# x ＝ price以外の全ての値、y = price（ターゲット）で切り分ける
x_train, y_train = train.drop(['price'], axis=1), train.price
# モデルの作成
m = RandomForestRegressor(n_jobs=-1, min_samples_leaf=5, n_estimators=200)
m.fit(x_train, y_train)
# スコアを表示
m.score(x_train, y_train)

0.7401606513428847

In [12]:
# 作成したランダムフォレストのモデル「m」に「test」を入れて予測する
preds = m.predict(test)
# 予測値 predsをnp.exp()で処理
np.exp(preds)
# Numpy配列からpandasシリーズへ変換
preds = pd.Series(np.exp(preds))
# テストデータのIDと予測値を連結
submit = pd.concat([test.test_id, preds], axis=1)
# カラム名をメルカリの提出指定の名前をつける
submit.columns = ['test_id', 'price']
# 提出ファイルとしてCSVへ書き出し
submit.to_csv('submit_rf_base.csv', index=False)