In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

NUM_BRANDS = 4004
NUM_CATEGORIES = 1001
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [19]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/78/7e/bc87e7951cfaa998cffaf39e6c721f5bd04efb2e139486206356edb289a5/lightgbm-2.2.1-py2.py3-none-manylinux1_x86_64.whl (1.1MB)
[K    100% |████████████████████████████████| 1.1MB 7.1MB/s 
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.1


In [0]:
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [0]:
import math

def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [0]:
  train = pd.read_table('../input/train.tsv', engine='c')
  test = pd.read_table('../input/test.tsv', engine='c')
  print('[{}] Finished to load data'.format(time.time() - start_time))
  print('Train shape: ', train.shape)
  print('Test shape: ', test.shape)

  nrow_train = train.shape[0]
  y = np.log1p(train["price"])
  merge: pd.DataFrame = pd.concat([train, test])
  submission: pd.DataFrame = test[['test_id']]

  del train
  del test
  gc.collect()

In [0]:
'''
import zipfile

with zipfile.ZipFile('drive/My Drive/Kaggle/mercari/input/train.zip') as existing_zip:
  existing_zip.extractall('drive/My Drive/Kaggle/mercari/input')
'''  

In [6]:
with timer('load data'):
  train = pd.read_table('drive/My Drive/Kaggle/mercari/input/train.tsv', engine='c')
  test = pd.read_table('drive/My Drive/Kaggle/mercari/input/test.tsv', engine='c')

print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

nrow_train = train.shape[0]
y = np.log1p(train["price"])
merge: pd.DataFrame = pd.concat([train, test])
submission: pd.DataFrame = test[['test_id']]

del train
del test
gc.collect()

[load data] done in 13 s
Train shape:  (1482535, 8)
Test shape:  (693359, 7)


25

In [8]:
with timer('handle_missing_inplace'):
  merge['category_name'].fillna(value='missing', inplace=True)
  merge['brand_name'].fillna(value='missing', inplace=True)
  merge['item_description'].fillna(value='missing', inplace=True)

[handle_missing_inplace] done in 1 s


In [9]:
with timer('cutting'):
  pop_brand = merge['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
  merge.loc[~merge['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
  pop_category = merge['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
  merge.loc[~merge['category_name'].isin(pop_category), 'category_name'] = 'missing'

[cutting] done in 1 s


In [10]:
with timer('to category'):
  merge['category_name'] = merge['category_name'].astype('category')
  merge['brand_name'] = merge['brand_name'].astype('category')
  merge['item_condition_id'] = merge['item_condition_id'].astype('category')

[to category] done in 1 s


In [11]:
with timer('count vectorize `name`') :
  cv = CountVectorizer(min_df=NAME_MIN_DF)
  X_name = cv.fit_transform(merge['name'])

[count vectorize `name`] done in 13 s


In [12]:
with timer('count vectorize `category_name`'):
  cv = CountVectorizer()
  X_category = cv.fit_transform(merge['category_name'])

[count vectorize `category_name`] done in 11 s


In [13]:
with timer('TFIDF vectorize `item_description`'):
  tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                       ngram_range=(1, 3),
                       stop_words='english')
  X_description = tv.fit_transform(merge['item_description'])

[TFIDF vectorize `item_description`] done in 271 s


In [14]:
with timer('label binarize `brand_name`'):
  lb = LabelBinarizer(sparse_output=True)
  X_brand = lb.fit_transform(merge['brand_name'])

[label binarize `brand_name`] done in 153 s


In [15]:
with timer('dummies on `item_condition_id` and `shipping`'):
  X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                        sparse=True).values)

[dummies on `item_condition_id` and `shipping`] done in 5 s


In [16]:
with timer('create sparse merge'):
  sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()

[create sparse merge] done in 17 s


In [0]:
X = sparse_merge[:nrow_train]
X_valid = sparse_merge[nrow_train:]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =0)

### LightGBM2

In [0]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [0]:
params_lgb2 = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [37]:
with timer('LightGBM Model (2) trainning'):
  my_lgb = lgb.train(params_lgb2, lgb_train, valid_sets=lgb_eval)
  #my_lgb = lgb.train(params_lgb2, train_set=lgb_train, num_boost_round=3200, verbose_eval=100)

[1]	valid_0's rmse: 0.69929
[2]	valid_0's rmse: 0.676095
[3]	valid_0's rmse: 0.666899
[4]	valid_0's rmse: 0.657889
[5]	valid_0's rmse: 0.65031
[6]	valid_0's rmse: 0.64461
[7]	valid_0's rmse: 0.640748
[8]	valid_0's rmse: 0.636389
[9]	valid_0's rmse: 0.633213
[10]	valid_0's rmse: 0.629421
[11]	valid_0's rmse: 0.626785
[12]	valid_0's rmse: 0.623969
[13]	valid_0's rmse: 0.621393
[14]	valid_0's rmse: 0.6191
[15]	valid_0's rmse: 0.61698
[16]	valid_0's rmse: 0.615056
[17]	valid_0's rmse: 0.612732
[18]	valid_0's rmse: 0.61074
[19]	valid_0's rmse: 0.609138
[20]	valid_0's rmse: 0.60756
[21]	valid_0's rmse: 0.605642
[22]	valid_0's rmse: 0.604147
[23]	valid_0's rmse: 0.60285
[24]	valid_0's rmse: 0.600571
[25]	valid_0's rmse: 0.59925
[26]	valid_0's rmse: 0.597954
[27]	valid_0's rmse: 0.596742
[28]	valid_0's rmse: 0.595562
[29]	valid_0's rmse: 0.5943
[30]	valid_0's rmse: 0.593114
[31]	valid_0's rmse: 0.592196
[32]	valid_0's rmse: 0.591019
[33]	valid_0's rmse: 0.589964
[34]	valid_0's rmse: 0.589046
[

In [0]:
# テストデータを予測する
y_pred_gbm = my_lgb.predict(X_test, num_iteration=my_lgb.best_iteration)
y_pred_gbm = np.expm1(y_pred_gbm)

In [0]:
# y_true
y_true = y_test.values
y_true = np.expm1(y_true)

In [49]:
# RMSLE
rmsle_gbm = rmsle(y_true, y_pred_gbm)	
print('LightGBM (2)_RMSLE: {}'.format(rmsle_gbm))

LightGBM (2)_RMSLE: 0.5492494669054053


### LightGBM3

In [51]:
with timer('LightGBM Model (3) trainning'):
  #my_lgb = lgb.train(params_lgb2, lgb_train, valid_sets=lgb_eval)
  my_lgb3 = lgb.train(params_lgb2, train_set=lgb_train, num_boost_round=3200, verbose_eval=100)

[LightGBM Model (3) trainning] done in 331 s


In [52]:
# テストデータを予測する
y_pred_gbm3 = my_lgb3.predict(X_test, num_iteration=my_lgb3.best_iteration)
y_pred_gbm3 = np.expm1(y_pred_gbm3)

# RMSLE
rmsle_gbm3 = rmsle(y_true, y_pred_gbm3)	
print('LightGBM (3)_RMSLE: {}'.format(rmsle_gbm3))

LightGBM (3)_RMSLE: 0.4720880439739264


### Ridge regression

In [53]:
with timer('Ridge model training'):
  my_ridge = Ridge(solver="sag", fit_intercept=True, random_state=205)
  my_ridge.fit(X_train, y_train)

[Ridge model training] done in 106 s


In [54]:
# テストデータを予測する
y_pred_ridge = my_ridge.predict(X_test)
y_pred_ridge = np.expm1(y_pred_ridge)

# RMSLE
rmsle_ridge = rmsle(y_true, y_pred_ridge)	
print('Ridge Regression RMSLE: {}'.format(rmsle_ridge))

Ridge Regression RMSLE: 0.49699035105364286


### Ensemble

In [0]:
# テストデータを予測する
y_pred_gbm_ridge = 0.57*my_lgb3.predict(X_test)
y_pred_gbm_ridge += 0.43*my_ridge.predict(X=X_test)
y_pred_gbm_ridge = np.expm1(y_pred_gbm_ridge)

In [58]:
# RMSLE
rmsle_gbm_ridge = rmsle(y_true, y_pred_gbm_ridge)	
print('LightGBM + Ridge Regression RMSLE: {}'.format(rmsle_gbm_ridge))

LightGBM + Ridge Regression RMSLE: 0.4712881698964148


In [59]:
# Test
# テストデータを予測する
y_pred_gbm_ridge2 = 0.55*my_lgb3.predict(X_test)
y_pred_gbm_ridge2 += 0.45*my_ridge.predict(X=X_test)
y_pred_gbm_ridge2 = np.expm1(y_pred_gbm_ridge2)
# RMSLE
rmsle_gbm_ridge2 = rmsle(y_true, y_pred_gbm_ridge2)	
print('LightGBM + Ridge Regression RMSLE: {}'.format(rmsle_gbm_ridge2))

LightGBM + Ridge Regression RMSLE: 0.47168441398621935
