In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install lightgbm



In [0]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

NUM_BRANDS = 4004
NUM_CATEGORIES = 1001
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 3

In [0]:
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [0]:
import math

def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [6]:
'''
import zipfile

with zipfile.ZipFile('drive/My Drive/Kaggle/mercari/input/train.zip') as existing_zip:
  existing_zip.extractall('drive/My Drive/Kaggle/mercari/input')
'''  

"\nimport zipfile\n\nwith zipfile.ZipFile('drive/My Drive/Kaggle/mercari/input/train.zip') as existing_zip:\n  existing_zip.extractall('drive/My Drive/Kaggle/mercari/input')\n"

In [7]:
with timer('load data'):
  train = pd.read_table('drive/My Drive/Kaggle/mercari/input/train.tsv', engine='c')
  test = pd.read_table('drive/My Drive/Kaggle/mercari/input/test.tsv', engine='c')

print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

nrow_train = train.shape[0]
y = np.log1p(train["price"])
merge: pd.DataFrame = pd.concat([train, test])
submission: pd.DataFrame = test[['test_id']]

del train
del test
gc.collect()

[load data] done in 18 s
Train shape:  (1482535, 8)
Test shape:  (693359, 7)


25

In [8]:
with timer('handle_missing_inplace'):
  merge['category_name'].fillna(value='missing', inplace=True)
  merge['brand_name'].fillna(value='missing', inplace=True)
  merge['item_description'].fillna(value='missing', inplace=True)

[handle_missing_inplace] done in 1 s


In [9]:
with timer('cutting'):
  pop_brand = merge['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
  merge.loc[~merge['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
  pop_category = merge['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
  merge.loc[~merge['category_name'].isin(pop_category), 'category_name'] = 'missing'

[cutting] done in 1 s


In [10]:
with timer('to category'):
  merge['category_name'] = merge['category_name'].astype('category')
  merge['brand_name'] = merge['brand_name'].astype('category')
  merge['item_condition_id'] = merge['item_condition_id'].astype('category')

[to category] done in 1 s


In [11]:
with timer('count vectorize `name`') :
  cv = CountVectorizer(min_df=NAME_MIN_DF)
  X_name = cv.fit_transform(merge['name'])

[count vectorize `name`] done in 14 s


In [12]:
with timer('count vectorize `category_name`'):
  cv = CountVectorizer()
  X_category = cv.fit_transform(merge['category_name'])

[count vectorize `category_name`] done in 12 s


In [13]:
with timer('TFIDF vectorize `item_description`'):
  tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                       ngram_range=(1, 3),
                       stop_words='english')
  X_description = tv.fit_transform(merge['item_description'])

[TFIDF vectorize `item_description`] done in 323 s


In [14]:
with timer('label binarize `brand_name`'):
  lb = LabelBinarizer(sparse_output=True)
  X_brand = lb.fit_transform(merge['brand_name'])

[label binarize `brand_name`] done in 152 s


In [15]:
with timer('dummies on `item_condition_id` and `shipping`'):
  X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                        sparse=True).values)

[dummies on `item_condition_id` and `shipping`] done in 5 s


In [16]:
with timer('create sparse merge'):
  sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()

[create sparse merge] done in 19 s


In [0]:
X = sparse_merge[:nrow_train]
X_valid = sparse_merge[nrow_train:]

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =0)

In [0]:
# y_true
y_true = y_test.values
y_true = np.expm1(y_true)

In [20]:
!pip install fastFM

Collecting fastFM
[?25l  Downloading https://files.pythonhosted.org/packages/f5/15/fdbb9b9455efa48ffb07b9880a1e567e0c7a7de0acc4aa7f1c5ba9ce4f2c/fastFM-0.2.11-cp36-cp36m-manylinux1_x86_64.whl (483kB)
[K    100% |████████████████████████████████| 491kB 6.4MB/s 
Collecting cython (from fastFM)
[?25l  Downloading https://files.pythonhosted.org/packages/64/3f/cac281f3f019b825bbc03fa8cb7eb03d9c355f4aa9eef978279a4966cb21/Cython-0.29-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
[K    100% |████████████████████████████████| 2.1MB 6.9MB/s 
[?25hInstalling collected packages: cython, fastFM
Successfully installed cython-0.29 fastFM-0.2.11


In [0]:
from fastFM.als import FMRegression
import random

In [24]:
with timer('FMRegression'):
  my_fm = FMRegression(init_stdev=0.0001, rank=128, l2_reg_w=20, l2_reg_V=400,
                                      n_iter=7, random_state=random.randint(0, 1000))
  my_fm.fit(X_train, y_train)

[FMRegression] done in 1109 s


In [27]:
# テストデータを予測する
y_pred_fm = my_fm.predict(X_test)
y_pred_fm = np.expm1(y_pred_fm)

# RMSLE
rmsle_fm = rmsle(y_true, y_pred_fm)	
print('Ridge Regression RMSLE: {}'.format(rmsle_fm))

Ridge Regression RMSLE: 0.46696661250056853


In [28]:
dtrain, dvalid = train_test_split(X, random_state=666, train_size=0.99)
print(dtrain.shape)
print(dvalid.shape)



(1467709, 26293)
(14826, 26293)


In [32]:
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Activation, Input

Using TensorFlow backend.


In [0]:
def define_model(data, nodes1, nodes2, drop1, drop2):
    x = Input(shape = (data.shape[1], ), dtype = 'float32', sparse = True)     
    d1 = Dense(nodes1, activation='relu')(x)
    d2 = Dropout(drop1)(d1)
    d3 = Dense(nodes2, activation='sigmoid')(d2)
    d4 = Dropout(drop2)(d3)
    out= Dense(1, activation = 'linear')(d4)
    model = Model(x,out)
    return model

In [0]:
nodes1 = 64
nodes2 = 32
drop1 =  0.30
drop2 =  0.25

print("Training Model...")
odel = define_model(X_train, nodes1, nodes2, drop1, drop2)
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=1, mode='auto')
model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(x=X_train, y=y_train,
          batch_size=600,
          callbacks=[monitor],
          validation_data=(x_val, y_val),
          epochs=10, verbose=0)
    
tpoint2 = time.time()
print("Time Training: {}".format(hms_string(tpoint2-tpoint1)))
    
pred = model.predict(x=X_test, batch_size=8000, verbose=0)


tpoint3 = time.time()
print("Time for Predicting: {}".format(hms_string(tpoint3-tpoint2)))

df_test["price"] = np.expm1(pred)
df_test[["test_id", "price"]].to_csv("submission_NN.csv", index = False)

elapsed_time = time.time() - start_time
print("Total Time: {}".format(hms_string(elapsed_time)))