In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import os
import pickle
import gc
import xgboost as xgb
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

In [2]:
max_num_features = 10
pad_size = 1
boundary_letter = -1 
#It's to differentiate the words in one sliding window :) Because all characters are greater than 0 so using -1 will 
#help decision trees to distinguish bondaries better.
space_letter = 0
round_num = 50

In [3]:
train_file_name = "C:/Users/CHICHI/Desktop/文本正则化/input/en_train.csv"
test_file_name = "C:/Users/CHICHI/Desktop/文本正则化/input/en_test_2.csv"
model_file_name = "C:/Users/CHICHI/Desktop/文本正则化/model_vars/train16.v2.6.model"
model_dump_name = "C:/Users/CHICHI/Desktop/文本正则化/model_vars/dump.train16.v2.6.txt"
class_pred_file_name = "C:/Users/CHICHI/Desktop/文本正则化/output/class_pred_16.v2.6.csv"
all_pred_file_name = "C:/Users/CHICHI/Desktop/文本正则化/output/train_pred.v2.6.csv"
valid_compare_file_name = "C:/Users/CHICHI/Desktop/文本正则化/output/valid_compare_16.v2.6.csv"
train_compare_file_name = "C:/Users/CHICHI/Desktop/文本正则化/output/train_compare_16.v2.6.csv"

In [4]:
param = {'objective': 'multi:softprob',
         'eta': '0.2',
         'max_depth': 11,
         'silent': True,
         'num_class': 16,
         'eval_metric': 'merror'}

In [5]:
labels = ['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM',
          'DECIMAL', 'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC',
          'DIGIT', 'FRACTION', 'TELEPHONE', 'ADDRESS']


In [6]:
weight_dict = {'PLAIN': 0.01,
               'PUNCT': 1,
               'DATE': 1,
               'LETTERS': 1,
               'CARDINAL': 1,
               'VERBATIM': 1,
               'DECIMAL': 1,
               'MEASURE': 1,
               'MONEY': 1,
               'ORDINAL': 1,
               'TIME': 1,
               'ELECTRONIC': 1,
               'DIGIT': 1,
               'FRACTION': 1,
               'TELEPHONE': 1,
               'ADDRESS': 1}


In [7]:
def context_window_transform(data, pad_size):
    """每个词加上前面一个和后面一个词，中间用-1隔开"""
    pre = np.zeros(max_num_features, dtype=int)
    pre = [pre for x in np.arange(pad_size)]
    data = pre + data + pre
    # print(data)
    new_data = []
    for i in np.arange(len(data) - pad_size * 2):
        if np.all(data[i + pad_size] == 0):
            continue
        row = []
        for x in data[i: i + pad_size * 2 + 1]:
            row += [boundary_letter]
            row += x.tolist()
        row += [boundary_letter]
        new_data.append(row)
    return new_data
"""
(1)
It groups previous word and posterior word into a fixed size array. Because the decision tree will predict more accurately 
with the context of the word than without context, same as a recurrent neural network. And it has to be a fixed-size array 
because XGboost doesn't accept variable input length.
(2)
It use ASCII encoding instead of other encodings (eg. TF-IDF) because I think labeling the words doesn't require decision 
tree to understand the meaning of each words, that means a simple ASCII encoding will do the trick.
(3)
And tt requires every word to include its previous and posterior words (if a word doesn't have a previous word or posterior 
word, then use 0 to represent those missing words).Because we need to tell decision tree that it needs to make predictions 
based mostly on the middle word. Say we have arrays like this: array A(a cat is), B(cat is 10), C(is 10 years) D(10 years old). 
We need XGBoost to tell use A is LETTER, B is LETTER and C is NUMBER. And XGBoost can see the difference between A and C in that 
C's middle word is not the same type as A's.
"""

"\n(1)\nIt groups previous word and posterior word into a fixed size array. Because the decision tree will predict more accurately \nwith the context of the word than without context, same as a recurrent neural network. And it has to be a fixed-size array \nbecause XGboost doesn't accept variable input length.\n(2)\nIt use ASCII encoding instead of other encodings (eg. TF-IDF) because I think labeling the words doesn't require decision \ntree to understand the meaning of each words, that means a simple ASCII encoding will do the trick.\n(3)\nAnd tt requires every word to include its previous and posterior words (if a word doesn't have a previous word or posterior \nword, then use 0 to represent those missing words).Because we need to tell decision tree that it needs to make predictions \nbased mostly on the middle word. Say we have arrays like this: array A(a cat is), B(cat is 10), C(is 10 years) D(10 years old). \nWe need XGBoost to tell use A is LETTER, B is LETTER and C is NUMBER. A

In [8]:
def get_feas(data_df):
    # 特征工程
    feas = []
    # 1.token的长度
    fea_len = data_df["before"].apply(lambda token: len(str(token))).values
    feas.append(fea_len)
    # 2.是否是每句话的第一个token
    fea_start = data_df["token_id"].apply(lambda token_id: 1 if int(token_id) == 0 else 0)
    feas.append(fea_start)
    fea_x = np.transpose(np.vstack(feas))
    return fea_x


In [9]:
def train(with_valid=True, save=True):
    print("open data files ...")
    train_df = pd.read_csv(train_file_name)

    print("data processing...")
    x_data = []
    # 将类别数字化
    # labels = train_df["class"].unique()
    class2index = dict(zip(labels, range(len(labels))))
    y_data = list(map(lambda c: class2index[c], train_df['class'].values))# 'map' object is not subscriptable, 需要转换成list
    gc.collect()
    # 每个目标词用组成这个词的所有字符的ascii码表示，并padding
    for x, token_id in zip(train_df['before'].values, train_df["token_id"].values):
        if token_id == 0:
            x_row_before = np.zeros(max_num_features, dtype=int)
            x_data.append(x_row_before)
        x_row = np.ones(max_num_features, dtype=int) * space_letter

        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)

    fea_x = get_feas(train_df)

    del train_df
    gc.collect()

    x_data_context = np.array(context_window_transform(x_data, pad_size))
    del x_data
    gc.collect()
    # x_data_context_a = np.array(x_data_context)
    x_data_context_a = np.hstack([x_data_context, fea_x])
    y_data_a = np.array(list(y_data))

    # 计算每个类别的权重
    print(np.unique(y_data_a))
    index_weight_dict = dict([(class2index[k], v)for k, v in weight_dict.items()])
    class_weights = class_weight.compute_class_weight("balanced", np.arange(16), y_data_a)# array不接收迭代器，需要转换成list
    weights = np.array(map(lambda y: class_weights[y], y_data_a))
    # print("weights: ", weights[:100])
    print('Total number of samples:', len(x_data_context))

    print('x_data sample:')
    print(x_data_context[0])
    print('y_data sample:')
    print(y_data[0])
    print('labels:')
    print(labels)

    del x_data_context
    del y_data
    gc.collect()

    if with_valid:
        x_train, x_valid, y_train, y_valid = train_test_split(x_data_context_a, y_data_a,
                                                              test_size=0.01, random_state=2017)
        del x_data_context_a
        del y_data_a
        gc.collect()

        print("forming dmatrix...")
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_valid, label=y_valid)
        watchlist = [(dvalid, 'valid'), (dtrain, 'train')]

        del x_train
        del y_train
        gc.collect()

        print("training start...")
        print("params: ", param)
        # print("loading model ...")
        model = xgb.train(param, dtrain, round_num, watchlist,
                          # xgb_model="C:/Users/CHICHI/Desktop/文本正则化/output/train16.v2.5.model",
                          early_stopping_rounds=10,
                          verbose_eval=10)
    else:
        dtrain = xgb.DMatrix(x_data_context_a, label=y_data_a)
        watchlist = [(dtrain, 'train')]
        del x_data_context_a
        del y_data_a
        gc.collect()
        model = xgb.train(param, dtrain, round_num, watchlist, early_stopping_rounds=20,
                          verbose_eval=10)
    if save:
        model.save_model(model_file_name)
        model.dump_model(model_dump_name)


In [21]:
train()

open data files ...
data processing...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
Total number of samples: 9918441
x_data sample:
[ -1   0   0   0   0   0   0   0   0   0   0  -1  66 114 105 108 108  97
 110 116  97 105  -1 105 115   0   0   0   0   0   0   0   0  -1]
y_data sample:
0
labels:
['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM', 'DECIMAL', 'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC', 'DIGIT', 'FRACTION', 'TELEPHONE', 'ADDRESS']
forming dmatrix...
training start...
params:  {'objective': 'multi:softprob', 'eta': '0.2', 'max_depth': 11, 'silent': True, 'num_class': 16, 'eval_metric': 'merror'}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	valid-merror:0.00888	train-merror:0.00844
Multiple eval metrics have been p

In [47]:
def test():
    test_df = pd.read_csv(test_file_name)
    # 每个目标词用组成这个词的所有字符的ascii码表示，并padding
    print("loading test data ...")
    x_data = []
    for x, token_id in zip(test_df['before'].values, test_df["token_id"].values):
        if token_id == 0:
            x_row_before = np.zeros(max_num_features, dtype=int)
            x_data.append(x_row_before)
        x_row = np.ones(max_num_features, dtype=int) * space_letter

        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)

    fea_x = get_feas(test_df)
    x_data_context = np.array(context_window_transform(x_data, pad_size))
    # x_data_context_a = np.array(x_data_context)
    x_data_context_a = np.hstack([x_data_context, fea_x])
    dtest = xgb.DMatrix(x_data_context_a)
    print("loading model ...")
    bst = xgb.Booster(param)  # init model
    bst.load_model(model_file_name)
    print("start predicting ...")
    # ypred = bst.predict(dtest)

    yprob = bst.predict(dtest)
    ypred = np.argmax(yprob, axis=1)
    ymax_prob = np.max(yprob, axis=1)
    print("ypred:", np.shape(ypred))
    print("ymax_prob:", np.shape(ymax_prob))
    # print(test_df.shape)
    # print(test_df["sentence_id"].values.shape, test_df["sentence_id"].values.dtype)
    source_word = test_df.before
    sentence_id = test_df.sentence_id
    token_id = test_df.token_id
    ids_a = [str(""+str(e[0])+"_"+str(e[1])) for e in list(zip(sentence_id,token_id))]
    print("ids_a: ", ids_a)
    test_df["id"] = ids_a
    class_df = test_df[["id", "before"]]
    class_df["class_pred"] = ypred
    class_df["max_prob"] = ymax_prob
    class_df.to_csv(class_pred_file_name, index=False)


In [48]:
test()

loading test data ...
loading model ...
start predicting ...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


ypred: (956046,)
ymax_prob: (956046,)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_df["class_pred"] = ypred
