In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Best-effort GPU setup: ask TensorFlow to grow memory on the first GPU on
# demand. Any failure is reported and then ignored so the notebook can still
# run (e.g. on a CPU-only machine), but it is no longer swallowed silently and
# KeyboardInterrupt / SystemExit are no longer caught.
try:
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if physical_devices:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    else:
        print("Not enough GPU hardware devices available")
except Exception as err:
    print('Could not enable GPU memory growth:', repr(err))

# Notebook-wide settings: hyper-parameters plus the data/model locations.
p_config = dict(
    batch_size=128,
    vocab_size=8000,
    embedding_dim=1024,
    epochs=2,
    units=256,
    max_inp=1200,  # length that token sequences are padded to when loaded
    train_data='/home/peihongyue/data/tianchi_nlp/train_set.csv',
    test_data='/home/peihongyue/data/tianchi_nlp/test_a_sample_submit.csv',
    model_data='/home/peihongyue/data/tianchi_nlp/model/',
)

In [2]:
def load_data(path):
    """Read the labelled training file into padded token ids and labels.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split on tab characters: the first field is
    the integer label and the second field is a whitespace-separated list of
    integer token ids.

    Args:
        path: Location of the tab-separated training file.

    Returns:
        A tuple ``(x_array, y_array)``. ``x_array`` is the result of
        ``pad_sequences`` with ``maxlen=p_config['max_inp']`` and post-padding;
        ``y_array`` is a NumPy array holding one integer label per sample.

    Raises:
        ValueError: If a label or token on a non-blank line is not an integer.
        IndexError: If a non-blank line has no tab-separated second field.
    """
    y_array = []
    x_array = []
    with open(path) as f:
        f.readline()  # skip the header row
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) used to
            # crash with int('') -- skip them instead.
            if not line.strip():
                continue
            fields = line.split('\t')
            y_array.append(int(fields[0]))
            # split() with no argument tolerates repeated spaces and the
            # trailing newline, which split(' ') turned into empty tokens.
            x_array.append([int(tok) for tok in fields[1].split()])
    x_array = tf.keras.preprocessing.sequence.pad_sequences(x_array, maxlen=p_config['max_inp'], padding='post')
    return x_array, np.array(y_array)

In [3]:
def build_model(max_inp=None, vocab_size=None, embedding_dim=None, units=None, num_classes=14):
    """Build the Embedding -> LSTM -> Dense classifier and print its summary.

    Every size argument is optional. When left as ``None`` it is read from
    ``p_config`` so the model's input length always matches the length the
    loaders pad to, instead of relying on duplicated literals.

    Args:
        max_inp: Number of token ids per input sequence
            (default ``p_config['max_inp']``).
        vocab_size: Size of the embedding vocabulary
            (default ``p_config['vocab_size']``).
        embedding_dim: Width of each embedding vector
            (default ``p_config['embedding_dim']``).
        units: Number of LSTM units (default ``p_config['units']``).
        num_classes: Width of the final softmax layer.

    Returns:
        The (uncompiled) ``tf.keras.Sequential`` model.
    """
    if max_inp is None:
        max_inp = p_config['max_inp']
    if vocab_size is None:
        vocab_size = p_config['vocab_size']
    if embedding_dim is None:
        embedding_dim = p_config['embedding_dim']
    if units is None:
        units = p_config['units']

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(max_inp,)),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(units),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.summary()
    return model

In [4]:
def train(model, x_train, y_train, x_test, y_test,
          callbacks=None, class_weight=None, batch_size=None, epochs=20):
    """Compile ``model`` and fit it, validating on ``(x_test, y_test)``.

    The original version silently depended on notebook globals named
    ``callback`` and ``class_weight`` that are created in a later cell. They
    can now be passed explicitly; when omitted, the function still falls back
    to those globals (if they exist) so existing calls keep working.

    Args:
        model: Object exposing Keras-style ``compile`` and ``fit`` methods.
        x_train: Training inputs.
        y_train: Training labels (integer class ids, as required by the
            sparse categorical cross-entropy loss).
        x_test: Validation inputs passed to ``fit`` as ``validation_data``.
        y_test: Validation labels passed to ``fit`` as ``validation_data``.
        callbacks: List of callbacks for ``fit``. Defaults to ``[callback]``
            when a global ``callback`` is defined, otherwise to ``[]``.
        class_weight: Mapping of class id to loss weight. Defaults to the
            global ``class_weight`` when defined, otherwise ``None``.
        batch_size: Mini-batch size; defaults to ``p_config['batch_size']``.
        epochs: Maximum number of epochs.

    Returns:
        The same ``model`` object, after fitting.
    """
    notebook_globals = globals()
    if callbacks is None:
        legacy_callback = notebook_globals.get('callback')
        callbacks = [] if legacy_callback is None else [legacy_callback]
    if class_weight is None:
        # The parameter shadows the global of the same name, hence the lookup.
        class_weight = notebook_globals.get('class_weight')
    if batch_size is None:
        batch_size = p_config['batch_size']

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_test, y_test),
        callbacks=callbacks,
        class_weight=class_weight,
    )
    return model

In [5]:
def load_test(path):
    """Read the unlabelled test file into a padded array of token ids.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is parsed as a whitespace-separated list of
    integer token ids.

    Args:
        path: Location of the test file.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=p_config['max_inp']``
        and post-padding, one row per non-blank line.

    Raises:
        ValueError: If a token on a non-blank line is not an integer.
    """
    x_array = []
    with open(path) as f:
        f.readline()  # skip the header row
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) used to
            # crash with int('') -- skip them instead.
            if not line.strip():
                continue
            # split() with no argument tolerates repeated spaces, which
            # split(' ') turned into empty tokens.
            x_array.append([int(tok) for tok in line.split()])
    x_array = tf.keras.preprocessing.sequence.pad_sequences(x_array, maxlen=p_config['max_inp'], padding='post')
    return x_array

In [6]:
# Read the training path from the shared config instead of repeating the literal.
train_data = p_config['train_data']

x_array, y_array = load_data(train_data)
print(x_array.shape)
print(y_array.shape)

# Stop training once validation accuracy has failed to improve for 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

# Hard-coded number of samples per label; kept under its own name so the raw
# counts are not overwritten by the derived weights below.
class_counts = {0: 38918, 1: 36945, 2: 31425, 3: 22133, 4: 15016, 5: 12232, 6: 9985, 7: 8841, 8: 7847, 9: 5878,
                10: 4920, 11: 3131, 12: 1821, 13: 908}
c_sum = sum(class_counts.values())
# Inverse-frequency weighting: weight = total / (2 * count), so rarer labels
# contribute more to the loss.
class_weight = {key: (1 / val) * (c_sum) / 2.0 for key, val in class_counts.items()}

# A fixed random_state makes the 70/30 split identical on every run
# (it does not seed TensorFlow itself).
x_train, x_test, y_train, y_test = train_test_split(x_array, y_array, test_size=0.3, random_state=42)
model = build_model()

(200000, 1200)
(200000,)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1200, 1024)        8192000   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               1311744   
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dense_1 (Dense)              (None, 14)                1806      
Total params: 9,538,446
Trainable params: 9,538,446
Non-trainable params: 0
_________________________________________________________________


In [7]:
model = train(model, x_train, y_train, x_test, y_test)

# Turn the per-class softmax scores into one predicted label per row.
y_pred = model.predict(x_test)
y_pred = tf.argmax(y_pred, axis=1).numpy()
print(y_pred)
print(y_test)

# f1_score takes the ground truth first and the predictions second.
# NOTE: (x_test, y_test) is also the validation set passed to fit() for early
# stopping, so this is a validation score rather than a held-out test score.
print(f1_score(y_test, y_pred, average='macro'))

W0802 17:39:13.051182 140636824545024 data_adapter.py:1091] sample_weight modes were coerced from
  ...
    to  
  ['...']
W0802 17:39:13.294522 140636824545024 data_adapter.py:1091] sample_weight modes were coerced from
  ...
    to  
  ['...']


Train on 140000 samples, validate on 60000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
[1 4 0 ... 2 4 0]
[1 4 0 ... 2 3 0]
0.9023111959019949


In [8]:
# Load the test file and predict one label per row.
test_data = '/home/peihongyue/data/tianchi_nlp/test_a.csv'
test_x = load_test(test_data)
# argmax over axis 1 selects the index of the highest-scoring class in each row.
y_pred = tf.argmax(model.predict(test_x), axis=1).numpy()

In [9]:
# Quick look at the predicted labels for the test file.
print(y_pred)

[1 2 8 ... 1 3 1]


In [10]:
# Write the predictions as a one-column file: a 'label' header line followed
# by one predicted label per line.
with open('/home/peihongyue/data/tianchi_nlp/ans1.csv', 'w') as out_file:
    out_file.write('label' + '\n')
    out_file.writelines(str(label) + '\n' for label in y_pred)

In [11]:
# Display the first 10 padded test sequences as a sanity check on the loader.
test_x[:10]

array([[5399, 3117, 1070, ...,    0,    0,    0],
       [6819,  648, 3523, ..., 3215, 5791, 2662],
       [2673, 5076, 6835, ...,    0,    0,    0],
       ...,
       [3770, 2461, 2151, ...,    0,    0,    0],
       [6235, 6248, 5620, ...,    0,    0,    0],
       [1141, 4411, 6902, ...,    0,    0,    0]], dtype=int32)