## 关于Mercari Price Suggestion Challenge比赛的总结（二）

#### 这篇主要是利用FTRL+FM+LGB model来做predict，用到一个[wordbatch](https://github.com/anttttti/Wordbatch)的包(Parallel text feature extraction for machine learning.)，最后的public score是0.42450

### 准备工作

#### 导入相关包

In [1]:
import gc
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import sys
import wordbatch

from wordbatch.extractors import WordBag, WordHash
from wordbatch.models import FTRL, FM_FTRL

from nltk.corpus import stopwords
import re

#### 定义全局变量参数

In [2]:
# Upper bound on the number of distinct brand names kept by cutting();
# brands outside the most frequent NUM_BRANDS are replaced with 'missing'.
NUM_BRANDS = 4500
# Upper bound on the number of distinct values kept per category level by cutting().
NUM_CATEGORIES = 1250

# When True, the modelling cells hold out 5% of the training rows and print a dev RMSLE.
develop = False
# Set `develop = True` to enable the hold-out evaluation.

#### 定义rmsle误差函数（这里的Y和Y_pred尚未取对数，需要在函数内部取对数）

In [3]:
def rmsle(y, y0):
    """Return the root mean squared logarithmic error between `y` and `y0`.

    Both arguments hold raw (non-log) values of equal length; log1p is
    applied to each inside this function before the squared differences
    are averaged.
    """
    assert len(y) == len(y0)
    log_gap = np.log1p(y) - np.log1p(y0)
    mean_squared_gap = np.mean(np.power(log_gap, 2))
    return np.sqrt(mean_squared_gap)

#### 定义分割三级类别（category_name）字段的函数

In [4]:
def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Args:
        text: Category path such as "Men/Tops/T-shirts". Values without a
            ``split`` method (for example the float NaN pandas uses for a
            missing category) are treated as unlabeled.

    Returns:
        A 3-tuple ``(general_cat, subcat_1, subcat_2)``. Levels beyond the
        third are dropped and absent levels are filled with "No Label", so
        the result can always be unpacked into three columns.
    """
    placeholder = "No Label"
    try:
        levels = text.split("/")
    except AttributeError:
        # Only the "not a string" case is handled here; any other error
        # is a real problem and is allowed to propagate.
        return (placeholder, placeholder, placeholder)
    levels = levels[:3]
    levels += [placeholder] * (3 - len(levels))
    return tuple(levels)

#### 处理相关字段的缺失值用“missing”填充

In [5]:
def handle_missing_inplace(dataset):
    """Replace missing values in the text/category columns with 'missing'.

    The columns 'general_cat', 'subcat_1', 'subcat_2', 'brand_name' and
    'item_description' of `dataset` are modified in place; nothing is
    returned.

    Each filled column is assigned back to the frame rather than calling
    ``fillna(..., inplace=True)`` on ``dataset[col]``: that chained form
    operates on an intermediate object and is not guaranteed to update
    `dataset` in current pandas versions.
    """
    columns = ('general_cat', 'subcat_1', 'subcat_2', 'brand_name', 'item_description')
    for column in columns:
        dataset[column] = dataset[column].fillna('missing')

#### cutting函数：统计brand_name中除'missing'以外出现频率最高的前NUM_BRANDS个值，以及general_cat、subcat_1、subcat_2中除'missing'以外出现频率最高的前NUM_CATEGORIES个值，然后将各字段中不在对应高频集合内的值替换为'missing'

In [6]:
def cutting(dataset, num_brands=None, num_categories=None):
    """Collapse rare brand and category values into 'missing', in place.

    For 'brand_name' the `num_brands` most frequent values other than
    'missing' are kept; for 'general_cat', 'subcat_1' and 'subcat_2' the
    `num_categories` most frequent values other than 'missing' are kept.
    Every other value in those columns is overwritten with 'missing'.

    Args:
        dataset: DataFrame containing the four columns; modified in place.
        num_brands: Number of brand names to keep. Defaults to the
            module-level NUM_BRANDS when None.
        num_categories: Number of values to keep per category column.
            Defaults to the module-level NUM_CATEGORIES when None.
    """
    # Globals are resolved at call time so that callers using the
    # one-argument form keep the original behaviour.
    if num_brands is None:
        num_brands = NUM_BRANDS
    if num_categories is None:
        num_categories = NUM_CATEGORIES

    limits = (
        ('brand_name', num_brands),
        ('general_cat', num_categories),
        ('subcat_1', num_categories),
        ('subcat_2', num_categories),
    )
    for column, limit in limits:
        # value_counts() is sorted by descending frequency, so slicing the
        # index yields the most common values.
        counts = dataset[column].value_counts()
        popular = counts[counts.index != 'missing'].index[:limit]
        dataset.loc[~dataset[column].isin(popular), column] = 'missing'

#### 转换字段数据类型为category类型

In [7]:
def to_categorical(dataset):
    """Cast the category-level and item-condition columns to 'category' dtype.

    The columns 'general_cat', 'subcat_1', 'subcat_2' and
    'item_condition_id' of `dataset` are reassigned in place; nothing is
    returned.
    """
    for column in ('general_cat', 'subcat_1', 'subcat_2', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')

#### 定义文本正则的函数,首先获取停用词表，然后定义正则函数，并制定规则过滤词句
##### 原始：text = 'Jia wEn QI @#)(:">)'
##### 结果：u'jia wen qi'

In [8]:
# Define helpers for text normalization
# The NLTK corpus reader is imported under an alias so that binding the name
# `stopwords` to a dict below does not overwrite it; this keeps the cell
# re-runnable (calling `.words()` on the dict would raise AttributeError).
from nltk.corpus import stopwords as nltk_stopwords

# English stop words stored as dict keys for fast membership tests.
stopwords = {x: 1 for x in nltk_stopwords.words('english')}
# Matches every run of characters that are not ASCII letters or digits.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')


def normalize_text(text):
    """Lower-case `text`, strip punctuation and drop uninformative tokens.

    Runs of non-alphanumeric characters become single spaces, the result is
    lower-cased and split on spaces, and tokens that are a single character
    long or appear in `stopwords` are discarded.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = non_alphanums.sub(' ', text).lower().strip().split(" ")
    return u" ".join([token for token in tokens if len(token) > 1 and token not in stopwords])

In [9]:
print(stopwords)

{u'all': 1, u'just': 1, u"don't": 1, u'being': 1, u'over': 1, u'both': 1, u'through': 1, u'yourselves': 1, u'its': 1, u'before': 1, u'o': 1, u'don': 1, u'hadn': 1, u'herself': 1, u'll': 1, u'had': 1, u'should': 1, u'to': 1, u'only': 1, u'won': 1, u'under': 1, u'ours': 1, u'has': 1, u"should've": 1, u"haven't": 1, u'do': 1, u'them': 1, u'his': 1, u'very': 1, u"you've": 1, u'they': 1, u'not': 1, u'during': 1, u'now': 1, u'him': 1, u'nor': 1, u"wasn't": 1, u'd': 1, u'did': 1, u'didn': 1, u'this': 1, u'she': 1, u'each': 1, u'further': 1, u"won't": 1, u'where': 1, u"mustn't": 1, u"isn't": 1, u'few': 1, u'because': 1, u"you'd": 1, u'doing': 1, u'some': 1, u'hasn': 1, u"hasn't": 1, u'are': 1, u'our': 1, u'ourselves': 1, u'out': 1, u'what': 1, u'for': 1, u"needn't": 1, u'below': 1, u're': 1, u'does': 1, u"shouldn't": 1, u'above': 1, u'between': 1, u'mustn': 1, u't': 1, u'be': 1, u'we': 1, u'who': 1, u"mightn't": 1, u"doesn't": 1, u'were': 1, u'here': 1, u'shouldn': 1, u'hers': 1, u"aren't": 1,

### 开始整个流程的处理和训练

In [10]:
# Reference point for the elapsed-seconds values printed by later cells.
start_time = time.time()
from time import gmtime, strftime
# Print the current UTC timestamp.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Read the tab-separated train and test files from the working directory.
train = pd.read_table('./train.tsv', engine='c')
test = pd.read_table('./test.tsv', engine='c')

2018-02-20 10:06:26


#### 取部分数据做测试

In [11]:
# Keep only the first 2000 training rows and the first 200 test rows.
# NOTE(review): every later result is computed on this sample; remove or guard
# these two lines to run on the full data.
train = train[:2000]
test = test[:200]

In [12]:
print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

[7.25683999062] Finished to load data
('Train shape: ', (2000, 8))
('Test shape: ', (200, 7))


#### 从训练集中移除price小于1.0的样本；这些样本去掉price字段后保存在dftt中，稍后与训练集、测试集一起并入merge参与特征提取（但不作为训练样本）

In [13]:
# Row offset at which the test rows begin once the frames are stacked as
# [kept train rows, low-price train rows, test rows]. It equals the size of
# the training frame before filtering, so it must be read before the drop.
nrow_test = train.shape[0]
# Listings priced below 1.0 are excluded from the supervised training rows.
low_price = train.price < 1.0
# Build `dftt` as its own frame without `price`; deleting a column from a
# slice of `train` would trigger pandas' SettingWithCopy hazard.
dftt = train[low_price].drop('price', axis=1)
train = train.drop(dftt.index)

In [14]:
dftt

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description
1325,1325,Alabama Crimson Tide Quality Lanyard,1,Sports & Outdoors/Fan Shop/NCAA,,1,TOP QUALITY THICK LANYARD Reversible sides wit...


In [15]:
# Number of training rows that remain after the low-price rows were dropped.
nrow_train = train.shape[0]
# The models are fitted on log1p(price); predictions are therefore on the same scale.
y = np.log1p(train["price"])
# Stack kept train rows, the price-less low-price rows and the test rows, in that
# order. Later cells rely on this order when slicing by nrow_train / nrow_test.
merge = pd.concat([train, dftt, test])
# Frame holding only the test ids.
submission = test[['test_id']]

In [16]:
submission

Unnamed: 0,test_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [17]:
print(y)

0       2.397895
1       3.970292
2       2.397895
3       3.583519
4       3.806662
5       4.094345
6       4.174387
7       1.945910
8       2.995732
9       2.197225
10      2.197225
11      3.555348
12      2.833213
13      1.609438
14      3.784190
15      2.484907
16      1.945910
17      3.401197
18      3.258097
19      3.332205
20      2.639057
21      3.044522
22      5.958425
23      2.197225
24      2.397895
25      4.204693
26      2.639057
27      3.218876
28      1.791759
29      2.833213
          ...   
1970    3.713572
1971    7.317876
1972    1.945910
1973    2.833213
1974    4.553877
1975    2.397895
1976    2.564949
1977    2.833213
1978    4.465908
1979    1.791759
1980    2.197225
1981    2.833213
1982    2.639057
1983    2.197225
1984    3.737670
1985    3.135494
1986    2.197225
1987    5.056246
1988    3.912023
1989    2.079442
1990    2.639057
1991    2.639057
1992    2.708050
1993    2.484907
1994    2.708050
1995    2.944439
1996    2.708050
1997    2.3025

In [18]:
# head(-5) returns every row except the last five, not the first five.
# NOTE(review): head(5) may have been intended — confirm.
merge.head(-5)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0
5,,Women/Other/Other,3,"Banana republic bottoms, Candies skirt with ma...",Bundled items requested for Ruie,59.0,0,,5.0
6,Acacia Swimwear,Women/Swimwear/Two-Piece,3,Size small but straps slightly shortened to fi...,Acacia pacific tides santorini top,64.0,0,,6.0
7,Soffe,Sports & Outdoors/Apparel/Girls,3,You get three pairs of Sophie cheer shorts siz...,Girls cheer and tumbling bundle of 7,6.0,1,,7.0
8,Nike,Sports & Outdoors/Apparel/Girls,3,Girls Size small Plus green. Three shorts total.,Girls Nike Pro shorts,19.0,0,,8.0
9,,Vintage & Collectibles/Collectibles/Doll,3,I realized his pants are on backwards after th...,Porcelain clown doll checker pants VTG,8.0,0,,9.0


In [19]:
# `merge` now holds every row, so drop the names of the raw frames and ask the
# garbage collector to reclaim what it can.
del train
del test
gc.collect()

835

#### 分割category_name这个字段得到'general_cat'、'subcat_1'以及'subcat_2'三个字段，然后去除'category_name'这个字段

In [20]:
# Expand `category_name` into one column per hierarchy level, then discard the
# combined column.
category_levels = merge['category_name'].apply(split_cat)
merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = zip(*category_levels)
merge.drop('category_name', axis=1, inplace=True)
print('[{}] Split categories completed.'.format(time.time() - start_time))

[7.68763279915] Split categories completed.


In [21]:
merge.head(5)

Unnamed: 0,brand_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
0,,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts
1,Razer,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts
2,Target,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse
3,,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,Home Décor,Home Décor Accents
4,,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,Jewelry,Necklaces


#### 处理缺失值

In [22]:
# Fill missing category levels, brand names and descriptions with 'missing'.
handle_missing_inplace(merge)
print('[{}] Handle missing completed.'.format(time.time() - start_time))

[7.71670389175] Handle missing completed.


In [23]:
merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2200 entries, 0 to 199
Data columns (total 11 columns):
brand_name           2200 non-null object
item_condition_id    2200 non-null int64
item_description     2200 non-null object
name                 2200 non-null object
price                1999 non-null float64
shipping             2200 non-null int64
test_id              200 non-null float64
train_id             2000 non-null float64
general_cat          2200 non-null object
subcat_1             2200 non-null object
subcat_2             2200 non-null object
dtypes: float64(3), int64(2), object(6)
memory usage: 206.2+ KB


In [24]:
# Replace infrequent brand and category values with 'missing' (modifies `merge`).
cutting(merge)
print('[{}] Cut completed.'.format(time.time() - start_time))

[7.75743579865] Cut completed.


In [25]:
merge.head(5)

Unnamed: 0,brand_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,general_cat,subcat_1,subcat_2
0,missing,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,Men,Tops,T-shirts
1,Razer,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,Electronics,Computers & Tablets,Components & Parts
2,Target,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,Women,Tops & Blouses,Blouse
3,missing,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,Home,Home Décor,Home Décor Accents
4,missing,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,Women,Jewelry,Necklaces


In [26]:
# Convert the category levels and item_condition_id to pandas 'category' dtype.
to_categorical(merge)
print('[{}] Convert categorical completed'.format(time.time() - start_time))

[7.79162478447] Convert categorical completed


#### 针对‘name’用wordbatch.WordBatch生成hash向量

In [27]:
# Wordbatch pipeline for the `name` column: text is cleaned with normalize_text
# and passed to the WordBag extractor configured below, using 8 processes.
# NOTE(review): the extractor settings appear to request hashed 1- and 2-grams
# (weights 1.5 and 1.0) in a 2**29-wide space with binary term weights and no
# normalization or IDF — confirm against the Wordbatch documentation.
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                              "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                              "idf": None,
                                                              }), procs=8)

In [28]:
merge['name']

0           MLB Cincinnati Reds T Shirt Size XL
1              Razer BlackWidow Chroma Keyboard
2                                AVA-VIV Blouse
3                         Leather Horse Statues
4                          24K GOLD plated rose
5              Bundled items requested for Ruie
6            Acacia pacific tides santorini top
7          Girls cheer and tumbling bundle of 7
8                         Girls Nike Pro shorts
9        Porcelain clown doll checker pants VTG
10                              Smashbox primer
11                       New vs pi k body mists
12                           Black Skater dress
13                         Sharpener and eraser
14           HOLD for Dogs2016 Minnetonka boots
15                  Sephora tarte birthday gift
16                            Glitter Eyeshadow
17          New: Baby K'tan active baby carrier
18          Too Faced Limited "Merry Macaroons"
19               Cream/ Beige Front Cross Shirt
20              Torrid Nautical Peplum T

In [29]:
wb.dictionary_freeze = True
X_name = wb.fit_transform(merge['name'])
del wb
# Keep only the columns that are non-zero in at least two rows.
name_doc_freq = X_name.getnnz(axis=0)
X_name = X_name[:, name_doc_freq > 1]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

Normalize text
Extract wordbags
[14.6450269222] Vectorize `name` completed.


#### 针对'general_cat'、'subcat_1'以及'subcat_2'，用CountVectorizer（词频计数）生成稀疏向量

In [30]:
# Token-count encode each category level. The same CountVectorizer instance is
# re-fitted for every column, so after this cell its vocabulary reflects only
# `subcat_2`; the three result matrices are independent of each other.
# CountVectorizer tokenizes the text, so a multi-word category value can
# activate more than one column in a row.
wb = CountVectorizer()
X_category1 = wb.fit_transform(merge['general_cat'])
X_category2 = wb.fit_transform(merge['subcat_1'])
X_category3 = wb.fit_transform(merge['subcat_2'])
print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

[14.7122879028] Count vectorize `categories` completed.


In [31]:
print(X_category1)

  (0, 7)	1
  (1, 2)	1
  (2, 13)	1
  (3, 4)	1
  (4, 13)	1
  (5, 13)	1
  (6, 13)	1
  (7, 10)	1
  (7, 11)	1
  (8, 10)	1
  (8, 11)	1
  (9, 1)	1
  (9, 12)	1
  (10, 0)	1
  (11, 0)	1
  (12, 13)	1
  (13, 9)	1
  (14, 13)	1
  (15, 0)	1
  (16, 0)	1
  (17, 5)	1
  (18, 0)	1
  (19, 13)	1
  (20, 13)	1
  (21, 13)	1
  :	:
  (2177, 5)	1
  (2178, 5)	1
  (2179, 2)	1
  (2180, 5)	1
  (2181, 13)	1
  (2182, 13)	1
  (2183, 13)	1
  (2184, 5)	1
  (2185, 1)	1
  (2185, 12)	1
  (2186, 13)	1
  (2187, 7)	1
  (2188, 13)	1
  (2189, 0)	1
  (2190, 5)	1
  (2191, 7)	1
  (2192, 13)	1
  (2193, 13)	1
  (2194, 5)	1
  (2195, 10)	1
  (2195, 11)	1
  (2196, 2)	1
  (2197, 0)	1
  (2198, 13)	1
  (2199, 9)	1


In [32]:
#### 针对'item_description'，用wordbatch.WordBatch来生成hash向量

In [33]:
# Wordbatch pipeline for `item_description`, again cleaning text with
# normalize_text and using 8 processes.
# NOTE(review): compared with the `name` pipeline the settings use a 2**28-wide
# hash space, equal n-gram weights, "l2" normalization and tf=1.0 — confirm the
# exact meaning of these options against the Wordbatch documentation.
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                              "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                              "idf": None}), procs=8)

In [34]:
wb.dictionary_freeze = True
X_description = wb.fit_transform(merge['item_description'])
del wb
# Keep only the columns that are non-zero in at least two rows.
description_doc_freq = X_description.getnnz(axis=0)
X_description = X_description[:, description_doc_freq > 1]
print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

Normalize text
Extract wordbags
[18.5308198929] Vectorize `item_description` completed.


#### 针对'brand_name'字段将品牌名二值稀疏化

In [35]:
# One-hot encode the brand name: one column per distinct brand, one non-zero per row.
lb = LabelBinarizer(sparse_output=True)  # sparse_output=True yields a sparse (CSR) matrix
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

[18.5465028286] Label binarize `brand_name` completed.


In [36]:
print(X_brand)
X_brand

  (0, 340)	1
  (1, 259)	1
  (2, 292)	1
  (3, 340)	1
  (4, 340)	1
  (5, 340)	1
  (6, 5)	1
  (7, 281)	1
  (8, 228)	1
  (9, 340)	1
  (10, 280)	1
  (11, 316)	1
  (12, 342)	1
  (13, 269)	1
  (14, 308)	1
  (15, 293)	1
  (16, 321)	1
  (17, 340)	1
  (18, 301)	1
  (19, 21)	1
  (20, 302)	1
  (21, 316)	1
  (22, 266)	1
  (23, 340)	1
  (24, 97)	1
  :	:
  (2175, 340)	1
  (2176, 178)	1
  (2177, 111)	1
  (2178, 308)	1
  (2179, 340)	1
  (2180, 109)	1
  (2181, 340)	1
  (2182, 316)	1
  (2183, 308)	1
  (2184, 101)	1
  (2185, 340)	1
  (2186, 340)	1
  (2187, 224)	1
  (2188, 340)	1
  (2189, 340)	1
  (2190, 228)	1
  (2191, 60)	1
  (2192, 180)	1
  (2193, 340)	1
  (2194, 340)	1
  (2195, 2)	1
  (2196, 241)	1
  (2197, 316)	1
  (2198, 205)	1
  (2199, 340)	1


<2200x344 sparse matrix of type '<type 'numpy.int64'>'
	with 2200 stored elements in Compressed Sparse Row format>

In [37]:
# One-hot encode `item_condition_id` (category dtype) while `shipping`, which is
# numeric, passes through get_dummies unchanged.
# The dense values are cast to int64 before building the CSR matrix: pandas
# versions whose get_dummies emits boolean indicator columns can otherwise hand
# scipy an object-dtype array (bool mixed with the integer `shipping` column),
# which csr_matrix cannot convert. On versions that already return integers the
# cast leaves the values unchanged.
X_dummies = csr_matrix(
    pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values.astype(np.int64)
)

In [38]:
# Inspection only: rebuild the same dummy frame to look at its column layout.
# `AA` is just displayed; the feature matrix itself is X_dummies.
AA = pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True)
AA

Unnamed: 0,shipping,item_condition_id_1,item_condition_id_2,item_condition_id_3,item_condition_id_4,item_condition_id_5
0,1,0,0,1,0,0
1,0,0,0,1,0,0
2,1,1,0,0,0,0
3,1,1,0,0,0,0
4,0,1,0,0,0,0
5,0,0,0,1,0,0
6,0,0,0,1,0,0
7,1,0,0,1,0,0
8,0,0,0,1,0,0
9,0,0,0,1,0,0


In [39]:
AA.values

array([[1, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 0, 0]])

In [40]:
print(X_dummies)
X_dummies

  (0, 0)	1
  (0, 3)	1
  (1, 3)	1
  (2, 0)	1
  (2, 1)	1
  (3, 0)	1
  (3, 1)	1
  (4, 1)	1
  (5, 3)	1
  (6, 3)	1
  (7, 0)	1
  (7, 3)	1
  (8, 3)	1
  (9, 3)	1
  (10, 0)	1
  (10, 2)	1
  (11, 1)	1
  (12, 2)	1
  (13, 0)	1
  (13, 1)	1
  (14, 3)	1
  (15, 0)	1
  (15, 1)	1
  (16, 0)	1
  (16, 1)	1
  :	:
  (2184, 1)	1
  (2185, 0)	1
  (2185, 2)	1
  (2186, 0)	1
  (2186, 1)	1
  (2187, 0)	1
  (2187, 1)	1
  (2188, 3)	1
  (2189, 0)	1
  (2189, 1)	1
  (2190, 0)	1
  (2190, 3)	1
  (2191, 2)	1
  (2192, 4)	1
  (2193, 2)	1
  (2194, 3)	1
  (2195, 0)	1
  (2195, 3)	1
  (2196, 0)	1
  (2196, 1)	1
  (2197, 3)	1
  (2198, 0)	1
  (2198, 2)	1
  (2199, 0)	1
  (2199, 3)	1


<2200x6 sparse matrix of type '<type 'numpy.int64'>'
	with 3189 stored elements in Compressed Sparse Row format>

In [41]:
print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

[18.6549220085] Get dummies on `item_condition_id` and `shipping` completed.


In [42]:
print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)

((2200, 6), (2200, 5810), (2200, 344), (2200, 14), (2200, 116), (2200, 397), (2200, 1584))


In [43]:
# Concatenate all feature groups column-wise into one CSR matrix. Rows keep the
# order of `merge`, so train and test rows can be recovered by row slicing.
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
print('[{}] Create sparse merge completed'.format(time.time() - start_time))

[18.6794970036] Create sparse merge completed


In [44]:
sparse_merge

<2200x8271 sparse matrix of type '<type 'numpy.float64'>'
	with 63386 stored elements in Compressed Sparse Row format>

In [45]:
# Remove features with document frequency <=1
print(sparse_merge.shape)
# Document frequency of each column = number of rows where it is non-zero.
doc_freq = sparse_merge.getnnz(axis=0)
mask = doc_freq > 1
sparse_merge = sparse_merge[:, mask]
# Rows [0, nrow_train) are the supervised training rows; rows from nrow_test
# onward are the test rows.
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)

(2200, 8271)
(2200, 7966)


#### 分割数据集为训练集和验证集

In [46]:
gc.collect()
# By default train on every training row; in develop mode hold out 5% of the
# rows (fixed random_state) as a validation set.
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

#### FTRL模型

In [47]:
# Wordbatch FTRL model fitted on the log1p(price) targets; D is set to the
# number of feature columns.
model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
model.fit(train_X, train_y)
print('[{}] Train FTRL completed'.format(time.time() - start_time))
if develop:
    # valid_y and preds are on the log1p scale; expm1 restores prices because
    # rmsle() applies log1p itself.
    preds = model.predict(X=valid_X)
    print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
# Test-set predictions of the FTRL model (log1p scale).
predsF = model.predict(X_test)
print('[{}] Predict FTRL completed'.format(time.time() - start_time))

[832.665651798] Train FTRL completed
[832.666449785] Predict FTRL completed


#### FM_FTRL模型

In [48]:
# Wordbatch FM_FTRL model fitted on the same features and log1p(price)
# targets; D is set to the number of feature columns.
model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                D_fm=200, e_noise=0.0001, iters=17, inv_link="identity", threads=4)

model.fit(train_X, train_y)
# The progress message names the model trained in this cell (it previously
# said "ridge v2", which does not exist in this notebook).
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
if develop:
    # valid_y and preds are on the log1p scale; expm1 restores prices because
    # rmsle() applies log1p itself.
    preds = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

# Test-set predictions of the FM_FTRL model (log1p scale).
predsFM = model.predict(X_test)
print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

[836.134282827] Train ridge v2 completed
[836.13704586] Predict FM_FTRL completed


#### lgb模型

In [49]:
# LightGBM training parameters passed to lgb.train below.
# NOTE(review): 'application' is presumably the alias of LightGBM's objective
# setting, and a learning rate of 0.6 is unusually high — confirm both against
# the LightGBM parameter documentation.
params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.65,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

In [50]:
# Remove features with document frequency <=100
print(sparse_merge.shape)
# Keep only the columns that are non-zero in more than 100 rows. This narrows
# `sparse_merge` itself, so re-running the cell prunes the already reduced matrix.
mask = sparse_merge.getnnz(axis=0) > 100
sparse_merge = sparse_merge[:, mask]
# Rows [0, nrow_train) are the supervised training rows; rows from nrow_test
# onward are the test rows.
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)

(2200, 7966)
(2200, 65)


In [51]:
sparse_merge

<2200x65 sparse matrix of type '<type 'numpy.float64'>'
	with 15820 stored elements in Compressed Sparse Row format>

In [52]:
print(sparse_merge)

  (0, 0)	1.0
  (0, 3)	1.0
  (0, 10)	-0.577350258827
  (0, 30)	-0.577350258827
  (0, 40)	0.577350258827
  (0, 43)	1.0
  (0, 47)	1.0
  (0, 56)	1.0
  (0, 60)	1.0
  (0, 62)	-1.5
  (1, 3)	1.0
  (1, 12)	-0.169030845165
  (1, 19)	-0.169030845165
  (1, 20)	0.169030845165
  (1, 33)	0.169030845165
  (1, 36)	0.169030845165
  (1, 45)	1.0
  (2, 0)	1.0
  (2, 1)	1.0
  (2, 8)	0.192450091243
  (2, 16)	-0.192450091243
  (2, 38)	0.192450091243
  (2, 48)	1.0
  (2, 52)	1.0
  (2, 56)	1.0
  :	:
  (2196, 30)	-0.577350258827
  (2196, 40)	0.577350258827
  (2196, 45)	1.0
  (2196, 49)	1.0
  (2197, 3)	1.0
  (2197, 44)	1.0
  (2197, 54)	1.0
  (2198, 0)	1.0
  (2198, 2)	1.0
  (2198, 9)	-0.185695335269
  (2198, 24)	0.185695335269
  (2198, 35)	0.185695335269
  (2198, 48)	1.0
  (2199, 0)	1.0
  (2199, 3)	1.0
  (2199, 7)	0.0871305540204
  (2199, 12)	-0.116174072027
  (2199, 17)	0.0871305540204
  (2199, 22)	0.0871305540204
  (2199, 26)	-0.116174072027
  (2199, 27)	-0.0871305540204
  (2199, 29)	-0.0871305540204
  (2199, 31)	

In [53]:
# Rebuild the train/validation split on the reduced feature matrix; the same
# random_state as before is used for the 5% hold-out in develop mode.
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

#### 转换成lgb需要的格式

In [54]:
# Wrap the matrices in LightGBM Dataset objects. The watchlist always contains
# the training set and additionally the validation set in develop mode.
d_train = lgb.Dataset(train_X, label=train_y)
watchlist = [d_train]
if develop:
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]

In [55]:
# Train up to 7000 boosting rounds, reporting the metric every 1000 rounds.
# NOTE(review): when `develop` is False the watchlist holds only the training
# set, so early stopping is judged on training error (or ignored, depending on
# the LightGBM version) — confirm this is intended.
model = lgb.train(params, train_set=d_train, num_boost_round=7000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

if develop:
    # valid_y and preds are on the log1p scale; expm1 restores prices because
    # rmsle() applies log1p itself.
    preds = model.predict(valid_X)
    print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

# Test-set predictions of the LightGBM model (log1p scale).
predsL = model.predict(X_test)

print('[{}] Predict LGB completed.'.format(time.time() - start_time))


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


Training until validation scores don't improve for 1000 rounds.
[1000]	training's rmse: 0.574884
[2000]	training's rmse: 0.541089
[3000]	training's rmse: 0.520081
[4000]	training's rmse: 0.504528
[5000]	training's rmse: 0.491559
[6000]	training's rmse: 0.481849
[7000]	training's rmse: 0.4726
[1092.82839584] Predict LGB completed.
