Loading dataset...
Training set shape: (11391, 3)
Loading finished.
Processing dataset...


In [1]:
from config import config
from dataset import CustomDataset, process_data, train_test_split
from model import create_model_rnn, create_model_cnn

import os
import time
import tensorflow as tf
import numpy as np
import pandas as pd


os.environ['CUDA_VISIBLE_DEVICES'] = config.CUDA_VISIBLE_DEVICES    # specify GPU usage   

In [2]:
    ## Loading data
    print('Loading dataset...')
    if config.ALREADY_SPLIT:
        train_df = pd.read_csv(config.TRAIN_FILE) 
        val_df = pd.read_csv(config.VALIDATION_FILE)    
        print('Training set shape: '+ str(train_df.shape))
        print('Validaiton set shape: '+ str(val_df.shape))
        print('Loading finished.')
    else:
        data_df = process_data(config.INPUT_FILE, config.CLS2IDX, True)     # DataFrame, only used labeled data
        train_df, test_df = train_test_split(
            data_df, 
            test_size=config.TEST_SIZE, 
            shuffle=True, 
            random_state=config.RANDOM_STATE)
        train_df, val_df = train_test_split(
            train_df, 
            test_size=config.VALIDATION_SIZE, 
            shuffle=True, 
            random_state=config.RANDOM_STATE)  
        print('Training set shape: '+ str(train_df.shape))
        print('Validaiton set shape: '+ str(val_df.shape))
        print('Test set shape: '+ str(test_df.shape))
        print('Loading finished.')
        print('Saving training set & validation set & test set to local...')
        train_df.to_csv(config.TRAIN_FILE, index=False)
        val_df.to_csv(config.VALIDATION_FILE, index=False)
        test_df.to_csv(config.TEST_FILE, index=False)
        print('Saving finished.')

Loading dataset...
Training set shape: (11391, 3)
Validaiton set shape: (2847, 3)
Test set shape: (2512, 3)
Loading finished.
Saving training set & validation set & test set to local...
Saving finished.


In [3]:
## Processing data
print('Processing dataset...')
t1 = time.time()
train_set = CustomDataset(
    train_df[config.CONTENT_FIELD], 
    train_df[config.LABEL_FIELD], 
)
val_set = CustomDataset(
    val_df[config.CONTENT_FIELD], 
    val_df[config.LABEL_FIELD], 
)
t2 = time.time()
print('Processing finished, time consumption in ' + str(t2-t1) + 's.')


Processing dataset...
Processing finished, time consumption in 130.7117145061493s.


In [4]:
train_set[0]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.681 seconds.
Prefix dict has been built successfully.


AttributeError: 'int' object has no attribute 'decode'

In [None]:
from config import config
import gensim


word2vec = gensim.models.KeyedVectors.load_word2vec_format(config.WORD2VEC_FILE, binary=False)
word2vec.add_vector('<PAD>', (np.random.rand(len(word2vec[0])) - 0.5) * 2)
word2vec.add_vector('<OOV>', (np.random.rand(len(word2vec[0])) - 0.5) * 2)

def segmentation(x):
    seg_list = jieba.cut(x)
    token_id_list = [word2vec.key_to_index[x] if x in word2vec.key_to_index.keys() else word2vec.key_to_index['<OOV>'] for x in seg_list] 
    return token_id_list




In [15]:
sents = pd.read_csv(config.TRAIN_FILE)['content'].values.astype('str')
sents = list(map(lambda x: segmentation(x), sents))

In [16]:
sents

[[32624,
  5,
  12,
  400,
  1,
  0,
  1019,
  60,
  63,
  19492,
  67,
  0,
  64,
  259870,
  1216,
  0,
  13,
  5534,
  2,
  32624,
  6,
  29761,
  5,
  79,
  4295,
  14,
  128,
  1798,
  976,
  1874,
  1,
  618,
  101,
  17515,
  185,
  1,
  194,
  0,
  65,
  7194,
  19390,
  1,
  526,
  36,
  14,
  259870,
  0,
  16268,
  164,
  22,
  30326,
  2],
 [259870,
  485,
  0,
  135,
  40,
  4331,
  7,
  259870,
  2635,
  10,
  453,
  107,
  63,
  1105,
  30,
  4348,
  1,
  21,
  0,
  799,
  259870,
  2196,
  128957,
  42,
  80,
  3,
  321],
 [45,
  5347,
  2721,
  1,
  54761,
  152,
  321,
  25052,
  124,
  6190,
  1304,
  259870,
  121,
  0,
  2083,
  3804,
  1829,
  23475,
  5,
  393,
  1792,
  2120,
  1,
  1889,
  0,
  721,
  1254,
  4746,
  1303,
  1078,
  33407,
  33407,
  33905,
  1,
  403,
  0,
  8,
  9,
  17,
  6241,
  1,
  5993,
  2],
 [259870,
  3,
  7,
  387,
  0,
  1715,
  3447,
  0,
  255,
  739,
  44,
  149,
  7,
  110855,
  1,
  7537,
  0,
  350,
  27,
  128,
  4365,
  0,
 

In [14]:
np.array(sents)

  """Entry point for launching an IPython kernel.


array([list([32624, 5, 12, 400, 1, 0, 1019, 60, 63, 19492, 67, 0, 64, 259870, 1216, 0, 13, 5534, 2, 32624, 6, 29761, 5, 79, 4295, 14, 128, 1798, 976, 1874, 1, 618, 101, 17515, 185, 1, 194, 0, 65, 7194, 19390, 1, 526, 36, 14, 259870, 0, 16268, 164, 22, 30326, 2]),
       list([259870, 485, 0, 135, 40, 4331, 7, 259870, 2635, 10, 453, 107, 63, 1105, 30, 4348, 1, 21, 0, 799, 259870, 2196, 128957, 42, 80, 3, 321]),
       list([45, 5347, 2721, 1, 54761, 152, 321, 25052, 124, 6190, 1304, 259870, 121, 0, 2083, 3804, 1829, 23475, 5, 393, 1792, 2120, 1, 1889, 0, 721, 1254, 4746, 1303, 1078, 33407, 33407, 33905, 1, 403, 0, 8, 9, 17, 6241, 1, 5993, 2]),
       ...,
       list([556, 2491, 1, 2821, 32879, 54, 154, 4768, 7, 18, 0, 252, 154, 4670, 7, 18, 2, 82805, 1, 2607, 2067, 5, 201, 9, 12542, 4, 5867, 1, 2, 8, 94, 8, 124, 841, 529, 11702, 1, 18, 10130, 1184, 0, 83, 8, 15115, 16383, 2, 2402, 60, 964, 12, 57, 0, 72, 104, 8, 1, 38130, 0, 8, 31, 1277, 70, 60, 37, 841, 341, 2, 15552, 5520, 1, 18, 581