In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import re
import time

from subprocess import check_output
#print(check_output(["ls", "data"]).decode("utf8"))
import os

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin
start_time = time.time()

In [7]:
train=pd.read_csv('data/train.tsv', sep='\t', encoding='utf-8')
test=pd.read_csv('data/test.tsv', sep='\t', encoding='utf-8')
sample = pd.read_csv('data/sample_submission.csv', sep='\t', encoding='utf-8')

In [8]:
class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].median() for c in X], index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [9]:
# price to logprice + 1
train["logprice"] = np.log(train["price"]+1)

# Push "Other/Other/Other" into NaN category name
train.loc[train["category_name"].isnull(), ["category_name"]] = "Other/Other/Other" #Merge into others
test.loc[test["category_name"].isnull(), ["category_name"]] = "Other/Other/Other" #Merge into others

# make 1st / 2nd level category label
train["1st_category"] = train["category_name"].str.extract('([^/]+)/[^/]+/[^/]+')
train["2nd_category"] = train["category_name"].str.extract('([^/]+/[^/]+)/[^/]+')
test["1st_category"] = test["category_name"].str.extract('([^/]+)/[^/]+/[^/]+')
test["2nd_category"] = test["category_name"].str.extract('([^/]+/[^/]+)/[^/]+')

In [10]:
feature_columns_to_use = ['item_condition_id','2nd_category', 'brand_name','shipping']
nonnumeric_columns = ['2nd_category', 'brand_name']

In [11]:
# combine for NaN filling
big_X = train[feature_columns_to_use].append(test[feature_columns_to_use])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [12]:
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

big_X_imputed.head(3)

Unnamed: 0,item_condition_id,2nd_category,brand_name,shipping
0,3,87,3337,1
1,3,10,3889,0
2,1,134,4588,1


In [13]:
big_X_imputed.__class__ # pandas.core.frame.DataFrame

trainX = big_X_imputed[0:train.shape[0]]
testX = big_X_imputed[train.shape[0]::]

trainX["price"] = train["price"]

trainX, validX = np.split(trainX.sample(frac=1), [int(.75*trainX.shape[0])])
c_ignors = ['price', 'train']
col = [c for c in trainX.columns if c not in c_ignors]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
dtrain = xgb.DMatrix(trainX[col], trainX['price'])
dvalid  = xgb.DMatrix(validX[col],  validX['price'])

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [15]:
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
params = {'min_child_weight': 20, 'eta': 0.015, 'colsample_bytree': 0.48, 'max_depth': 14,
            'subsample': 0.91, 'lambda': 2.01, 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear','tree_method': 'hist'}
model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=10, early_stopping_rounds=20)


[0]	train-rmse:46.4184	valid-rmse:46.4967
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 20 rounds.
[10]	train-rmse:44.2266	valid-rmse:44.3221
[20]	train-rmse:42.5982	valid-rmse:42.7101
[30]	train-rmse:41.2605	valid-rmse:41.3859
[40]	train-rmse:40.1368	valid-rmse:40.2747
[50]	train-rmse:39.3646	valid-rmse:39.5126
[60]	train-rmse:38.7419	valid-rmse:38.8964
[70]	train-rmse:38.1534	valid-rmse:38.3181
[80]	train-rmse:37.7173	valid-rmse:37.8894
[90]	train-rmse:37.4269	valid-rmse:37.6045
[100]	train-rmse:37.1377	valid-rmse:37.3212
[110]	train-rmse:36.9002	valid-rmse:37.0893
[120]	train-rmse:36.6925	valid-rmse:36.8868
[130]	train-rmse:36.5236	valid-rmse:36.7231
[140]	train-rmse:36.335	valid-rmse:36.5405
[150]	train-rmse:36.239	valid-rmse:36.447
[160]	train-rmse:36.0935	valid-rmse:36.3068
[170]	train-rmse:36.0088	valid-rmse:36.2243
[180]	train-rmse:35.8966	valid-rmse:36.1165
[190]	train-rmse:35.803	valid-rms

In [16]:
#submission = pd.DataFrame({ 'test_id': test['test_id'], 'price': predictions })
#submission.to_csv("submission.csv", index=False)

In [17]:
test['price'] = model.predict(xgb.DMatrix(testX[col]), ntree_limit=model.best_ntree_limit)
test.loc[test['price'] < 0, 'price'] = 0
test['test_id'] = test['test_id'].astype(int)
test[['test_id', 'price']].to_csv("output.csv", index = False)
print("Finished ...")
tt = (time.time() - start_time)/60
print("Total time %s min" % tt)

Finished ...
Total time 8.689324649175008 min
