In [1]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

In [2]:
NUM_BRANDS = 4000
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 50000

In [3]:
def handle_missing_inplace(dataset):
    """Replace missing values in the text feature columns with 'missing'.

    Fills NaN in the 'category_name', 'brand_name' and 'item_description'
    columns of ``dataset``. The frame is modified in place and nothing is
    returned.

    Each column is reassigned rather than calling ``fillna(..., inplace=True)``
    on the selected column: that chained in-place form acts on an intermediate
    object and is not guaranteed to write back to ``dataset`` (pandas warns
    about it in 2.x, and it has no effect under Copy-on-Write).
    """
    for column in ('category_name', 'brand_name', 'item_description'):
        dataset[column] = dataset[column].fillna('missing')


def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse rare brands and categories into the 'missing' bucket, in place.

    Keeps the ``num_brands`` most frequent values of 'brand_name' and the
    ``num_categories`` most frequent values of 'category_name'. The existing
    'missing' placeholder is never counted as one of the kept values. Every
    other value in those two columns is overwritten with 'missing'.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with 'brand_name' and 'category_name' columns; modified in place.
    num_brands : int, optional
        Number of brands to keep. Defaults to the module-level NUM_BRANDS.
    num_categories : int, optional
        Number of categories to keep. Defaults to the module-level
        NUM_CATEGORIES.
    """
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    # The category cap used to reuse NUM_BRANDS, which left NUM_CATEGORIES
    # unused; each column now gets its own limit.
    for column, keep in (('brand_name', num_brands),
                         ('category_name', num_categories)):
        counts = dataset[column].value_counts()  # ordered by descending frequency
        popular = counts[counts.index != 'missing'].index[:keep]
        dataset.loc[~dataset[column].isin(popular), column] = 'missing'


def to_categorical(dataset):
    """Cast the categorical feature columns to pandas 'category' dtype.

    Reassigns 'category_name', 'brand_name' and 'item_condition_id' on
    ``dataset`` itself; returns nothing.
    """
    for field in ('category_name', 'brand_name', 'item_condition_id'):
        dataset[field] = dataset[field].astype('category')

## Load Data

In [4]:
# Read the raw train/test TSV files and report how long the load took.
start_time = time.time()
train, test = (
    pd.read_table(tsv_path, engine='c')
    for tsv_path in ('input/train.tsv', 'input/test.tsv')
)
print('[{}] Finished to load data'.format(time.time() - start_time))

[10.720398187637329] Finished to load data


In [7]:
# Mean listing price for each category_name value.
mean_price = train['price'].groupby(train['category_name']).mean()

In [36]:
# Number of training rows per category, as a two-column frame
# (category_name, counts).
categorized_price = (
    train.groupby("category_name")
    .size()
    .reset_index(name="counts")
)

In [50]:
# How many categories have more than 5 listings.
categorized_price.loc[categorized_price['counts'] > 5].shape[0]

1044

In [58]:
# NOTE: sort_values returns a new frame; the result is neither assigned nor the
# last expression of the cell, so this line has no lasting effect.
categorized_price.sort_values(by="counts")

# Collect the categories that sell rarely (fewer than 5 listings).
small_group_category = categorized_price.loc[categorized_price['counts'] < 5, 'category_name']
small_group_category_list = small_group_category.tolist()

# Training rows that belong to one of those rare categories.
small_group_category_item = train.loc[train['category_name'].isin(small_group_category_list)]

In [73]:
# Partition the listings on the placeholder text 'No description yet'.
# `withcomment` is defined here, next to `nocomment`, because the cells that
# follow use it before the later cell that originally created it; on a fresh
# Restart & Run All those cells would otherwise raise NameError.
nocomment = train[train.item_description == 'No description yet']
withcomment = train[train.item_description != 'No description yet']

In [78]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [96]:
# Mean price per category for listings that have a description, as a flat
# two-column table (category_name, price).
withcomment.groupby("category_name", as_index=False)['price'].mean()

Unnamed: 0,category_name,price
0,Beauty/Bath & Body/Bath,18.340768
1,Beauty/Bath & Body/Bathing Accessories,22.253968
2,Beauty/Bath & Body/Cleansers,13.623547
3,Beauty/Bath & Body/Other,20.950355
4,Beauty/Bath & Body/Scrubs & Body Treatments,24.907945
5,Beauty/Bath & Body/Sets,21.081991
6,Beauty/Fragrance/Candles & Home Scents,21.125784
7,Beauty/Fragrance/Kids,8.519481
8,Beauty/Fragrance/Men,28.532879
9,Beauty/Fragrance/Other,26.555556


In [89]:
# Per-category mean price of listings with a description minus the mean price
# of listings whose description is the placeholder 'No description yet'.
# Categories present in only one of the two groups come out as NaN.
# (The original `tempList[]` was a syntax error; this expression reproduces the
# output shown for this cell.)
_has_placeholder = train['item_description'] == 'No description yet'
tempList = (train[~_has_placeholder].groupby("category_name")['price'].mean()
            - train[_has_placeholder].groupby("category_name")['price'].mean())
tempList

category_name
Beauty/Bath & Body/Bath                         -3.441840
Beauty/Bath & Body/Bathing Accessories           7.253968
Beauty/Bath & Body/Cleansers                     1.223547
Beauty/Bath & Body/Other                        11.783688
Beauty/Bath & Body/Scrubs & Body Treatments     11.434260
Beauty/Bath & Body/Sets                         -0.251342
Beauty/Fragrance/Candles & Home Scents           1.009994
Beauty/Fragrance/Kids                            1.219481
Beauty/Fragrance/Men                            -0.794990
Beauty/Fragrance/Other                          22.888889
Beauty/Fragrance/Sets                            8.918523
Beauty/Fragrance/Women                           1.742015
Beauty/Hair Care/Conditioners                    2.951751
Beauty/Hair Care/Hair & Scalp Treatments        -0.675095
Beauty/Hair Care/Hair Color                     -0.241679
Beauty/Hair Care/Hair Loss Products             -5.315134
Beauty/Hair Care/Hair Perms & Texturizers             NaN


In [81]:
# Mean price per category among listings that have a description.
withcomment.groupby("category_name")['price'].agg('mean')

category_name
Beauty/Bath & Body/Bath                          18.340768
Beauty/Bath & Body/Bathing Accessories           22.253968
Beauty/Bath & Body/Cleansers                     13.623547
Beauty/Bath & Body/Other                         20.950355
Beauty/Bath & Body/Scrubs & Body Treatments      24.907945
Beauty/Bath & Body/Sets                          21.081991
Beauty/Fragrance/Candles & Home Scents           21.125784
Beauty/Fragrance/Kids                             8.519481
Beauty/Fragrance/Men                             28.532879
Beauty/Fragrance/Other                           26.555556
Beauty/Fragrance/Sets                            25.835189
Beauty/Fragrance/Women                           23.790733
Beauty/Hair Care/Conditioners                    21.220982
Beauty/Hair Care/Hair & Scalp Treatments         18.669732
Beauty/Hair Care/Hair Color                      13.337268
Beauty/Hair Care/Hair Loss Products              23.129310
Beauty/Hair Care/Hair Perms & Texturizers 

In [80]:
# Mean price per category among listings without a description.
nocomment['price'].groupby(nocomment['category_name']).mean()

category_name
Beauty/Bath & Body/Bath                          21.782609
Beauty/Bath & Body/Bathing Accessories           15.000000
Beauty/Bath & Body/Cleansers                     12.400000
Beauty/Bath & Body/Other                          9.166667
Beauty/Bath & Body/Scrubs & Body Treatments      13.473684
Beauty/Bath & Body/Sets                          21.333333
Beauty/Fragrance/Candles & Home Scents           20.115789
Beauty/Fragrance/Kids                             7.300000
Beauty/Fragrance/Men                             29.327869
Beauty/Fragrance/Other                            3.666667
Beauty/Fragrance/Sets                            16.916667
Beauty/Fragrance/Women                           22.048718
Beauty/Hair Care/Conditioners                    18.269231
Beauty/Hair Care/Hair & Scalp Treatments         19.344828
Beauty/Hair Care/Hair Color                      13.578947
Beauty/Hair Care/Hair Loss Products              28.444444
Beauty/Hair Care/Other                    

In [76]:
# Listings whose description is anything other than the placeholder text.
withcomment = train.loc[train['item_description'].ne('No description yet')]

In [77]:
# Summary statistics of the numeric columns for listings with a description.
withcomment.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1400046.0,1400046.0,1400046.0,1400046.0
mean,741323.8,1.902169,27.02052,0.4495545
std,427939.7,0.9044373,39.04876,0.4974489
min,1.0,1.0,0.0,0.0
25%,370641.2,1.0,10.0,0.0
50%,741366.5,2.0,17.0,0.0
75%,1111921.0,3.0,30.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [71]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [7]:
# Remember how many rows are training rows so the stacked frame can be split
# again later, and take log1p of the price as the regression target.
nrow_train = len(train)
y = np.log1p(train["price"])

# Stack train on top of test; keep the test ids for the submission frame.
merge: pd.DataFrame = pd.concat([train, test])
submission: pd.DataFrame = test[['test_id']]

# Drop the originals and ask the garbage collector to reclaim them.
del train, test
gc.collect()

In [None]:
# Fit a Ridge regressor (SAG solver) and predict on the test matrix.
# NOTE(review): X and X_test are not defined at notebook scope in the visible
# cells (they only exist inside main()), and this cell has no execution count.
model = Ridge(alpha=3, solver="sag", fit_intercept=True, random_state=205)
model.fit(X, y)
predsR = model.predict(X_test)

The code that ran on the Kaggle platform (so far)

In [None]:
def _fit_predict_ridge(X, y, X_test, solver, seed, label, start_time):
    """Fit a Ridge model (alpha=3) with the given solver and seed, predict X_test, and print timings."""
    model = Ridge(solver=solver, fit_intercept=True, random_state=seed, alpha=3)
    model.fit(X, y)
    print('[{}] Finished to train ridge {}'.format(time.time() - start_time, label))
    preds = model.predict(X=X_test)
    print('[{}] Finished to predict ridge {}'.format(time.time() - start_time, label))
    return preds


def _fit_predict_lgb(X, y, X_test, params, split_seed, num_boost_round, early_stopping_rounds):
    """Train LightGBM on a 90/10 split of (X, y) with early stopping and return predictions for X_test."""
    train_X, valid_X, train_y, valid_y = train_test_split(
        X, y, test_size=0.1, random_state=split_seed)
    # max_bin is passed through the Dataset `params` dict rather than as a
    # keyword argument of lgb.Dataset, which newer LightGBM releases reject.
    dataset_params = {'max_bin': 8192}
    d_train = lgb.Dataset(train_X, label=train_y, params=dataset_params)
    d_valid = lgb.Dataset(valid_X, label=valid_y, params=dataset_params)
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords are
    # believed to be removed from lgb.train in LightGBM 4.x (replaced by
    # callbacks); confirm against the installed version before upgrading.
    model = lgb.train(params, train_set=d_train, num_boost_round=num_boost_round,
                      valid_sets=[d_train, d_valid],
                      early_stopping_rounds=early_stopping_rounds, verbose_eval=500)
    return model.predict(X_test)


def main():
    """Run the full pipeline: load data, build sparse features, train, and write the submission.

    Steps:
      1. Read 'input/train.tsv' and 'input/test.tsv'.
      2. Stack train and test, fill missing text fields, collapse rare brands
         and categories, and cast the categorical columns.
      3. Vectorize `name` and `category_name` (counts), `item_description`
         (TF-IDF), `brand_name` (label binarizer) and the
         `item_condition_id` / `shipping` dummies into one sparse matrix.
      4. Fit two Ridge models and two LightGBM models on log1p(price).
      5. Blend the four predictions with fixed weights, invert the log
         transform, and write 'submission_lgbm_ridge_8.csv'.

    Progress lines with the elapsed seconds are printed after each step.
    """
    start_time = time.time()

    train = pd.read_table('input/train.tsv', engine='c')
    test = pd.read_table('input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    # .copy() so that adding the 'price' column below writes to an independent
    # frame instead of a slice of `test` (avoids SettingWithCopyWarning).
    submission: pd.DataFrame = test[['test_id']].copy()

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished to handle missing'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished to cut'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Finished to convert categorical'.format(time.time() - start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

    cv = CountVectorizer()
    X_category = cv.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 3),
                         stop_words='english')
    X_description = tv.fit_transform(merge['item_description'])
    print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))

    dummies = pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True)
    # Cast to float64 so csr_matrix always receives a numeric array; newer
    # pandas versions can emit boolean dummy columns which, mixed with the
    # integer `shipping` column, may otherwise produce an object array.
    X_dummies = csr_matrix(dummies.values.astype(np.float64))
    print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
    print('[{}] Finished to create sparse merge'.format(time.time() - start_time))

    # Rows were stacked train-first, so a positional split recovers each part.
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]

    predsR = _fit_predict_ridge(X, y, X_test, solver="sag", seed=205,
                                label='sag', start_time=start_time)
    predsR2 = _fit_predict_ridge(X, y, X_test, solver="lsqr", seed=145,
                                 label='lsqrt', start_time=start_time)

    params = {
        'learning_rate': 0.76,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 99,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 110,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    predsL = _fit_predict_lgb(X, y, X_test, params, split_seed=144,
                              num_boost_round=7500, early_stopping_rounds=500)
    print('[{}] Finished to predict lgb 1'.format(time.time() - start_time))

    predsL2 = _fit_predict_lgb(X, y, X_test, params2, split_seed=101,
                               num_boost_round=3000, early_stopping_rounds=50)
    print('[{}] Finished to predict lgb 2'.format(time.time() - start_time))

    # Fixed-weight blend of the four models (weights sum to 1.0).
    preds = predsR2*0.15 + predsR*0.15 + predsL*0.5 + predsL2*0.2

    # Predictions are in log1p space; expm1 maps them back to prices.
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_lgbm_ridge_8.csv", index=False)


if __name__ == '__main__':
    main()