In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
max_num_features = 10
pad_size = 1
boundary_letter = -1
space_letter = 0
# max_data_size = 320000
self_classes = ["PLAIN", "PUNCT"]

In [3]:
param = {'objective': 'multi:softmax',
             'eta': '0.3',
             'max_depth': 10,
             'silent': 1,
             'nthread': -1,
             # 'num_class':num_class,
             'num_class': 2,
             'eval_metric': 'merror'}

In [4]:
def context_window_transform(data, pad_size):
    """每个词加上前面一个和后面一个词，中间用-1隔开"""
    pre = np.zeros(max_num_features)
    pre = [pre for x in np.arange(pad_size)]
    data = pre + data + pre
    neo_data = []
    for i in np.arange(len(data) - pad_size * 2):
        row = []
        for x in data[i : i + pad_size * 2 + 1]:
            row += [boundary_letter]
            row += x.tolist()
        row += [boundary_letter]
        neo_data.append(row)
    return neo_data


In [15]:
def train():
    print("open data files ...")
    train_df = pd.read_csv('C:/Users/CHICHI/Desktop/文本正则化/input/en_train.csv')

    print("data processing...")
    x_data = []
    # 将类别数字化
    # y_data = pd.factorize(train_df['class'])
    # labels = y_data[1]
    # y_data = y_data[0]
    labels = train_df["class"].unique()
    class2index = dict(zip(labels, range(len(labels))))
    for k in class2index:
        if k in self_classes:
            class2index[k] = 0
        else:
            class2index[k] = 1
    y_data = list(map(lambda c: class2index[c], train_df['class'].values))
    gc.collect()
    # 每个目标词用组成这个词的所有字符的ascii码表示，并padding
    for x in train_df['before'].values:
        x_row = np.ones(max_num_features, dtype=int) * space_letter
        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)
    del train_df
    gc.collect()

    x_data_context = np.array(context_window_transform(x_data, pad_size))
    del x_data
    gc.collect()
    x_data_context_a = np.array(x_data_context)
    y_data_a = np.array(y_data)

    print('Total number of samples:', len(x_data_context))

    print('x_data sample:')
    print(x_data_context[0])
    print('y_data sample:')
    print(y_data[0])
    print('labels:')
    print(labels)

    del x_data_context
    del y_data
    gc.collect()

    x_train, x_valid, y_train, y_valid= train_test_split(x_data_context_a, y_data_a,
                                                          test_size=0.1, random_state=2017)
    del x_data_context_a
    del y_data_a
    gc.collect()

    print("forming dmatrix...")
    num_class = len(labels)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)
    watchlist = [(dvalid, 'valid'), (dtrain, 'train')]

    del x_train
    del y_train
    gc.collect()

    print("training start...")
    model = xgb.train(param, dtrain, 10, watchlist, early_stopping_rounds=20,
                      verbose_eval=10)
    ypred = model.predict(dvalid)
    model.save_model('C:/Users/CHICHI/Desktop/文本正则化/model_vars/train.model')
    model.dump_model('C:/Users/CHICHI/Desktop/文本正则化/model_vars/dump.train.txt')
    accuracy_score(ypred, y_valid)
    print(classification_report(ypred,y_valid))

In [None]:
train()

open data files ...
data processing...
Total number of samples: 9918441
x_data sample:
[ -1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  -1.  66. 114.
 105. 108. 108.  97. 110. 116.  97. 105.  -1. 105. 115.   0.   0.   0.
   0.   0.   0.   0.   0.  -1.]
y_data sample:
0
labels:
['PLAIN' 'PUNCT' 'DATE' 'LETTERS' 'CARDINAL' 'VERBATIM' 'DECIMAL'
 'MEASURE' 'MONEY' 'ORDINAL' 'TIME' 'ELECTRONIC' 'DIGIT' 'FRACTION'
 'TELEPHONE' 'ADDRESS']
forming dmatrix...
training start...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	valid-merror:0.00684	train-merror:0.00677
Multiple eval metrics have been passed: 'train-merror' will be used for early stopping.

Will train until train-merror hasn't improved in 20 rounds.


In [13]:
from sklearn.metrics import accuracy_score,f1_score,auc,classification_report,roc_auc_score

def test(model_file='C:/Users/CHICHI/Desktop/文本正则化/model_vars/train.model'):
    test_df = pd.read_csv('C:/Users/CHICHI/Desktop/文本正则化/input/en_test_2.csv')
    # 每个目标词用组成这个词的所有字符的ascii码表示，并padding
    x_data = []
    for x in test_df['before'].values:
        x_row = np.ones(max_num_features, dtype=int) * space_letter
        for xi, i in zip(list(str(x)), np.arange(max_num_features)):
            x_row[i] = ord(xi)
        x_data.append(x_row)

    x_data_context = np.array(context_window_transform(x_data, pad_size))
    x_data_context_a = np.array(x_data_context)
    dtest = xgb.DMatrix(x_data_context_a)

    bst = xgb.Booster(param)  # init model
    bst.load_model(model_file)
    ypred = bst.predict(dtest)
    print("ypred:", type(ypred), np.shape(ypred))
    print(test_df.shape)
    # test_df["id"] = test_df[["sentence_id", "token_id"]].apply(lambda row: axis=1)
    print(test_df["sentence_id"].values.shape, test_df["sentence_id"].values.dtype)
    ids_a = np.array(map(lambda tup: str(tup[0]) + "_" + str(tup[1]),
                         zip(test_df["sentence_id"].values,
                             test_df["token_id"].values)))
    print("ids_a: ", ids_a.shape)
    test_df["id"] = ids_a
    # test_df.drop(["sentence_id", "token_id"])
    class_df = test_df[["id", "before"]]
    class_df["class_pred"] = ypred
    # class_df = class_df[["id", "before", "class_pred"]]
    class_df.to_csv("C:/Users/CHICHI/Desktop/文本正则化/output/class_pred.csv", index=False)

In [14]:
test()

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


ypred: <class 'numpy.ndarray'> (956046,)
(956046, 3)
(956046,) int64
ids_a:  ()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_df["class_pred"] = ypred


TypeError: Expected sequence or array-like, got <class 'xgboost.core.DMatrix'>