In [1]:
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Global variable
PAD_STR = '<PAD>'
SEQUENCE_LENGTH = 3000
EMBEDDING_SIZE = 300

In [3]:
def load_char_samples_and_labels(data_path, has_header=True, is_train=True):
    """Load characters of each sample (document)."""
    if has_header:
        start_index = 1
    else:
        start_index = 0

    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()[start_index:]
        char_samples = [line.split(',')[1] for line in lines]
        char_samples = [char_sample.split() for char_sample in char_samples]

    if is_train:
        labels = [int(line.split(',')[3]) for line in lines]
    else:
        labels = []

    return char_samples, labels

In [4]:
def generate_char_mapping(char_vectors_path):
    """Generate the mapping from characters to its corresponding vectors."""
    char_to_vec_map = {PAD_STR: np.zeros(EMBEDDING_SIZE, dtype=np.float32)}
    with open(char_vectors_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()[1:]
        lines = [line.split() for line in lines]
        for line in lines:
            word = line[0]
            if word not in char_to_vec_map:
                char_to_vec_map[word] = np.array(line[1:], dtype=np.float32)
    return char_to_vec_map

In [5]:
def preprocess(data, sequence_length=3000):
    """Process the characters of each sample to a fixed length."""
    res = []
    for sample in data:
        if len(sample) > sequence_length:
            sample = sample[:sequence_length - 1]
            res.append(sample)
        else:
            str_added = [PAD_STR] * (sequence_length - len(sample))
            sample += str_added
            res.append(sample)
    return res

In [6]:
def generate_features(sample, char_to_vec_map):
    """Generate features by adding character vectors of each character in the sample."""
    res = []
    for char in sample:
        if char in char_to_vec_map:
            res.append(char_to_vec_map[char])
        else:
            res.append(np.random.normal(size=(EMBEDDING_SIZE, )))
    matrix = np.concatenate(res).reshape([len(sample), -1])
    features = np.sum(matrix, axis=0)
    return features

In [7]:
# Load data and process to a fixed length
train_data_file = "../raw_data/train_demo.csv"
test_data_file = "../raw_data/test_demo.csv"
char_samples_train, labels_train = load_char_samples_and_labels(train_data_file, has_header=True, is_train=True)
char_samples_test, _ = load_char_samples_and_labels(test_data_file, has_header=True, is_train=False)

char_samples_train = preprocess(char_samples_train, sequence_length=SEQUENCE_LENGTH)
char_samples_test = preprocess(char_samples_test, sequence_length=SEQUENCE_LENGTH)

In [8]:
# Load the mapping from characters to its corresponding vectors
char_vectors_path = "../word_vectors/demo/demo-300d.txt"
char_to_vec_map = generate_char_mapping(char_vectors_path)

In [9]:
# Extract features
num_train = len(char_samples_train)
char_samples = char_samples_train + char_samples_test
feature_vectors = []
for char_sample in char_samples:
    feature_vector = generate_features(char_sample, char_to_vec_map)
    feature_vectors.append(feature_vector)

In [10]:
# Split data into training, validation and testing set
feature_vectors_train = feature_vectors[:num_train]
feature_vectors_test = feature_vectors[num_train:]

X = pd.DataFrame(feature_vectors_train, dtype=np.float32)
y = pd.Series(labels_train, dtype=np.int32) - 1
indices_shuffled = np.random.permutation(np.arange(num_train))
X_shuffled, y_shuffled = X.iloc[indices_shuffled], y.iloc[indices_shuffled]
X_train, X_val, y_train, y_val = train_test_split(X_shuffled, y_shuffled, train_size=0.8, random_state=42)
X_test = pd.DataFrame(feature_vectors_test, dtype=np.float32)

del char_samples_train, char_samples_test, char_samples, char_to_vec_map
del feature_vectors_train, feature_vectors_test, feature_vectors
del X, y, X_shuffled, y_shuffled
gc.collect()



54

In [11]:
# Set parameters
lgb_train = lgb.Dataset(X_train.values, y_train.values)
lgb_val = lgb.Dataset(X_val.values, y_val.values, reference=lgb_train)

num_classes = max(labels_train)
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': num_classes,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'max_depth': 7,
    'learning_rate': 0.05, 
#     'feature_fraction': 0.9, 
#     'bagging_fraction': 0.8, 
#     'bagging_freq': 5, 
    'verbose': 0
}
num_boost_round = 500
feature_names = ['embed_' + str(col) for col in range(EMBEDDING_SIZE)]

In [12]:
# Train the LightGBM model
start_time = time.time()
gbm = lgb.train(params, 
                lgb_train, 
                num_boost_round=num_boost_round, 
                valid_sets=lgb_val, 
                feature_name=feature_names, 
                early_stopping_rounds=30)
print("Total seconds: %ds" % (time.time() - start_time))

[1]	valid_0's multi_logloss: 2.82129
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's multi_logloss: 2.72134
[3]	valid_0's multi_logloss: 2.64534
[4]	valid_0's multi_logloss: 2.57801
[5]	valid_0's multi_logloss: 2.51604
[6]	valid_0's multi_logloss: 2.46173
[7]	valid_0's multi_logloss: 2.41326
[8]	valid_0's multi_logloss: 2.36911
[9]	valid_0's multi_logloss: 2.32873
[10]	valid_0's multi_logloss: 2.29039
[11]	valid_0's multi_logloss: 2.2538
[12]	valid_0's multi_logloss: 2.22104
[13]	valid_0's multi_logloss: 2.19101
[14]	valid_0's multi_logloss: 2.16055
[15]	valid_0's multi_logloss: 2.13408
[16]	valid_0's multi_logloss: 2.10699
[17]	valid_0's multi_logloss: 2.08174
[18]	valid_0's multi_logloss: 2.05754
[19]	valid_0's multi_logloss: 2.03453
[20]	valid_0's multi_logloss: 2.01347
[21]	valid_0's multi_logloss: 1.99254
[22]	valid_0's multi_logloss: 1.97098
[23]	valid_0's multi_logloss: 1.95259
[24]	valid_0's multi_logloss: 1.93324
[25]	valid_0's multi_logloss: 1.9168

In [13]:
# Calculate the f1 score of validation set
probs_val = gbm.predict(X_val, num_iteration=gbm.best_iteration)
preds_val = np.argmax(probs_val, axis=1)
score_val = f1_score(y_val, preds_val, average='weighted')
print("The f1 score of validation set after %d epochs is: %f" % (num_boost_round, score_val))

The f1 score of validation set after 500 epochs is: 0.574371


In [14]:
# Save model
gbm.save_model("2018-07-15_lgb_300d.txt")

In [15]:
# Make submission
df_test = pd.read_csv(test_data_file)
submission = pd.DataFrame()
submission['id'] = df_test['id']
probs_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
preds_test = np.argmax(probs_test, axis=1) + 1
submission['class'] = preds_test
submission.to_csv("2018-07-15_submission.csv", index=False)