# -*- coding: utf-8 -*-
"""
Long text classification by using LightGBM model.
Author: StrongXGP
Date: 2018/07/13
"""
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Global variables
PAD_STR = '<PAD>'
SEQUENCE_LENGTH = 3000
EMBEDDING_SIZE = 300


def load_char_samples_and_labels(data_path, has_header=True, is_train=True):
    """Load the characters of each sample (document), plus labels for training data."""
    if has_header:
        start_index = 1
    else:
        start_index = 0
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()[start_index:]
    char_samples = [line.split(',')[1] for line in lines]
    char_samples = [char_sample.split() for char_sample in char_samples]
    if is_train:
        labels = [int(line.split(',')[3]) for line in lines]
    else:
        labels = []
    return char_samples, labels
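
# Note on the expected input layout (an assumption inferred from the column
# indices used above, not stated in this file): each CSV row is presumed to
# follow the DataGrand competition format
#
#     id,article,word_seg,class
#
# where column 1 ("article") holds a document's space-separated anonymized
# character ids and column 3 ("class") holds its 1-based label, e.g. the
# hypothetical row: 0,1044 7057 4399,520 196,14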


def preprocess(data, sequence_length=3000):
    """Truncate or pad the characters of each sample to a fixed length."""
    res = []
    for sample in data:
        if len(sample) > sequence_length:
            # Truncate long samples so every sample has exactly
            # `sequence_length` characters
            res.append(sample[:sequence_length])
        else:
            # Right-pad short samples with the special padding string
            sample += [PAD_STR] * (sequence_length - len(sample))
            res.append(sample)
    return res
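
# A toy illustration of `preprocess` (hypothetical input, shown doctest-style
# for reference only): long samples are truncated, short ones right-padded.
#
#     >>> preprocess([['a', 'b', 'c'], ['a']], sequence_length=2)
#     [['a', 'b'], ['a', '<PAD>']]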


def generate_char_mapping(char_vectors_path):
    """Generate the mapping from each character to its corresponding vector."""
    char_to_vec_map = {PAD_STR: np.zeros(EMBEDDING_SIZE, dtype=np.float32)}
    with open(char_vectors_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()[1:]  # the first line is a header, not a vector
    lines = [line.split() for line in lines]
    for line in lines:
        char = line[0]
        if char not in char_to_vec_map:
            char_to_vec_map[char] = np.array(line[1:], dtype=np.float32)
    return char_to_vec_map
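
# The vector file is assumed (judging by the skipped first line above) to be
# in the standard word2vec text format: a "vocab_size dimension" header line
# followed by one "token v1 v2 ... v300" line per character, e.g. with
# hypothetical values:
#
#     9000 300
#     1044 0.0213 -0.1170 ... 0.0841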


def generate_features(sample, char_to_vec_map):
    """Generate features by summing the character vectors of all characters in the sample."""
    # Re-seeding on every call makes the random vectors drawn for
    # out-of-vocabulary characters reproducible across samples
    np.random.seed(10)
    res = []
    for char in sample:
        if char in char_to_vec_map:
            res.append(char_to_vec_map[char])
        else:
            res.append(np.random.normal(size=(EMBEDDING_SIZE,)))
    matrix = np.concatenate(res).reshape([len(sample), -1])
    features = np.sum(matrix, axis=0)
    return features
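
# In effect this is sum pooling over character embeddings: each document is
# reduced to a single 300-d vector. For a sample containing only known
# characters, the function is equivalent to the one-liner (sketch only):
#
#     np.sum([char_to_vec_map[c] for c in sample], axis=0)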


def main():
    # Load data and process each sample to a fixed length
    # ============================================================================
    print("Load data...")
    train_data_file = "../../raw_data/train_set.csv"
    test_data_file = "../../raw_data/test_set.csv"
    char_samples_train, labels_train = load_char_samples_and_labels(train_data_file, has_header=True, is_train=True)
    char_samples_test, _ = load_char_samples_and_labels(test_data_file, has_header=True, is_train=False)

    print("Process each sample to a fixed length...")
    char_samples_train = preprocess(char_samples_train, sequence_length=SEQUENCE_LENGTH)
    char_samples_test = preprocess(char_samples_test, sequence_length=SEQUENCE_LENGTH)

    # Generate the mapping from each character to its corresponding vector
    # ============================================================================
    print("Generate the mapping from characters to their vectors...")
    char_vectors_path = "../../word_vectors/all/datagrand-char-300d.txt"
    char_to_vec_map = generate_char_mapping(char_vectors_path)

    # Extract features and split the data into training, validation and test sets
    # ============================================================================
    print("Extract features...")
    num_train = len(char_samples_train)
    char_samples = char_samples_train + char_samples_test
    feature_vectors = []
    for char_sample in char_samples:
        feature_vectors.append(generate_features(char_sample, char_to_vec_map))

    print("Split data into training, validation and test sets...")
    feature_vectors_train = feature_vectors[:num_train]
    feature_vectors_test = feature_vectors[num_train:]
    X = pd.DataFrame(feature_vectors_train, dtype=np.float32)
    y = pd.Series(labels_train, dtype=np.int32) - 1  # shift labels from 1-based to 0-based
    indices_shuffled = np.random.permutation(np.arange(num_train))
    X_shuffled, y_shuffled = X.iloc[indices_shuffled], y.iloc[indices_shuffled]
    X_train, X_val, y_train, y_val = train_test_split(X_shuffled, y_shuffled, train_size=0.8, random_state=42)
    X_test = pd.DataFrame(feature_vectors_test, dtype=np.float32)  # built for completeness; unused in this tuning script

    # Free the memory held by large intermediate objects
    del char_samples_train, char_samples_test, char_samples, char_to_vec_map
    del feature_vectors_train, feature_vectors_test, feature_vectors
    del X, y, X_shuffled, y_shuffled
    gc.collect()

    # Train the LightGBM model
    # ============================================================================
    lgb_train = lgb.Dataset(X_train.values, y_train.values)
    lgb_val = lgb.Dataset(X_val.values, y_val.values, reference=lgb_train)
    num_classes = max(labels_train)  # labels run from 1 to C, so the maximum is the class count
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        'num_leaves': 15,
        'max_depth': 4,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        # 'bagging_fraction': 0.8,
        # 'bagging_freq': 5,
        'verbose': 0
    }
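    # Tree-size note: with max_depth=4 a fully grown binary tree has at most
    # 2**4 = 16 leaves, so num_leaves=15 sits just under that ceiling; both
    # limits together keep each tree deliberately small.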
    num_boost_round = 2000
    feature_names = ['embed_' + str(col) for col in range(EMBEDDING_SIZE)]

    print("Start training...")
    start_time = time.time()
    # `early_stopping_rounds` as a keyword of `lgb.train` matches the LightGBM
    # API of this script's era; recent LightGBM releases expect
    # `callbacks=[lgb.early_stopping(30)]` instead
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=num_boost_round,
                    valid_sets=lgb_val,
                    feature_name=feature_names,
                    early_stopping_rounds=30)
    print("Training finished! ^_^")
    print("Total seconds: %ds" % (time.time() - start_time))

    # Calculate the F1 score and accuracy on the training and validation sets
    probs_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
    preds_train = np.argmax(probs_train, axis=1)
    score_train = f1_score(y_train, preds_train, average='weighted')
    acc_train = accuracy_score(y_train, preds_train)
    print("The F1 score on the training set after %d rounds is: %f" % (gbm.best_iteration, score_train))
    print("The accuracy on the training set after %d rounds is: %f" % (gbm.best_iteration, acc_train))

    probs_val = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    preds_val = np.argmax(probs_val, axis=1)
    score_val = f1_score(y_val, preds_val, average='weighted')
    acc_val = accuracy_score(y_val, preds_val)
    print("The F1 score on the validation set after %d rounds is: %f" % (gbm.best_iteration, score_val))
    print("The accuracy on the validation set after %d rounds is: %f" % (gbm.best_iteration, acc_val))


if __name__ == '__main__':
    main()