In [1]:
def lstm():
    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    VALIDATION_SPLIT = 0.16
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels

    print('(2) doc to var...')
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    print('(3) split data set...')
    p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
    p2 = int(len(data) * (1 - TEST_SPLIT))
    x_train = data[:p1]
    y_train = labels[:p1]
    x_val = data[p1:p2]
    y_val = labels[p1:p2]
    x_test = data[p2:]
    y_test = labels[p2:]
    print('train docs: ' + str(len(x_train)))
    print('val docs: ' + str(len(x_val)))
    print('test docs: ' + str(len(x_test)))

    print('(5) training model...')
    from keras.layers import Dense, Input, Flatten, Dropout, GlobalAveragePooling1D
    from keras.layers import LSTM, Embedding
    from keras.models import Sequential

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    # model.add(GlobalAveragePooling1D())
    model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=6, batch_size=128)
    # model.save('lstm.h5')

    print('(6) testing model...')
    print(model.metrics_names)
    print(model.evaluate(x_test, y_test))

    print('(7) prediction ...')
    import pickle
    lstm_preds = model.predict(x_test)

    return  lstm_preds

def cnn():
    # coding:utf-8

    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    VALIDATION_SPLIT = 0.16
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels

    print('(2) doc to var...')
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    print('(3) split data set...')
    p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
    p2 = int(len(data) * (1 - TEST_SPLIT))
    x_train = data[:p1]
    y_train = labels[:p1]
    x_val = data[p1:p2]
    y_val = labels[p1:p2]
    x_test = data[p2:]
    y_test = labels[p2:]
    print('train docs: ' + str(len(x_train)))
    print('val docs: ' + str(len(x_val)))
    print('test docs: ' + str(len(x_test)))

    print('(5) training model...')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import Conv1D, MaxPooling1D, Embedding
    from keras.models import Sequential

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.2))
    model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(3))
    model.add(Flatten())
    model.add(Dense(EMBEDDING_DIM, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',
                  # optimizer='rmsprop',
                  optimizer='adam',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=4, batch_size=128)
    # model.save('cnn.h5')

    print('(6) testing model...')
    print(model.metrics_names)
    print(model.evaluate(x_test, y_test))

    cnn_preds = model.predict(x_test)
    return cnn_preds

def pre_lstm():
    # coding:utf-8

    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 128
    VALIDATION_SPLIT = 0.16
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels

    print('(2) doc to var...')
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    print('(3) split data set...')
    p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
    p2 = int(len(data) * (1 - TEST_SPLIT))
    x_train = data[:p1]
    y_train = labels[:p1]
    x_val = data[p1:p2]
    y_val = labels[p1:p2]
    x_test = data[p2:]
    y_test = labels[p2:]
    print('train docs: ' + str(len(x_train)))
    print('val docs: ' + str(len(x_val)))
    print('test docs: ' + str(len(x_test)))

    print('(4) load word2vec as embedding...')
    import gensim
    from keras.utils import plot_model
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=True)
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    not_in_model = 0
    in_model = 0
    for word, i in word_index.items():
        if str(word) in w2v_model:
            in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[str(word)], dtype='float32')
        else:
            not_in_model += 1
    print(str(not_in_model) + ' words not in w2v model')
    from keras.layers import Embedding
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('(5) training model...')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import LSTM, Embedding
    from keras.models import Sequential

    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=6, batch_size=128)
    # model.save('word_vector_lstm.h5')

    print('(6) testing model...')
    print(model.metrics_names)
    print(model.evaluate(x_test, y_test))

    pre_lstm_preds = model.predict(x_test)

    return pre_lstm_preds

def pre_cnn():
    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 128
    VALIDATION_SPLIT = 0.16
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels

    print('(2) doc to var...')
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    print('(3) split data set...')
    # split the data into training set, validation set, and test set
    p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
    p2 = int(len(data) * (1 - TEST_SPLIT))
    x_train = data[:p1]
    y_train = labels[:p1]
    x_val = data[p1:p2]
    y_val = labels[p1:p2]
    x_test = data[p2:]
    y_test = labels[p2:]
    print('train docs: ' + str(len(x_train)))
    print('val docs: ' + str(len(x_val)))
    print('test docs: ' + str(len(x_test)))

    print('(4) load word2vec as embedding...')
    import gensim
    from keras.utils import plot_model
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=True)
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    not_in_model = 0
    in_model = 0
    for word, i in word_index.items():
        if str(word) in w2v_model:
            in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[str(word)], dtype='float32')
        else:
            not_in_model += 1
    print(str(not_in_model) + ' words not in w2v model')
    from keras.layers import Embedding
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('(5) training model...')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
    from keras.models import Sequential

    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.2))
    model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(3))
    model.add(Flatten())
    model.add(Dense(EMBEDDING_DIM, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=4, batch_size=128)
    # model.save('word_vector_cnn.h5')

    print('(6) testing model...')
    print(model.metrics_names)
    print(model.evaluate(x_test, y_test))

    print('(7) prediction file...')
    import pickle
    pre_cnn_preds = model.predict(x_test)

    return pre_cnn_preds

def mlp():
    # coding:utf-8

    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    VALIDATION_SPLIT = 0.16
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels

    print('(2) doc to var...')
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = tokenizer.sequences_to_matrix(sequences, mode='tfidf')
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    print('(3) split data set...')
    p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
    p2 = int(len(data) * (1 - TEST_SPLIT))
    x_train = data[:p1]
    y_train = labels[:p1]
    x_val = data[p1:p2]
    y_val = labels[p1:p2]
    x_test = data[p2:]
    y_test = labels[p2:]
    print('train docs: ' + str(len(x_train)))
    print('val docs: ' + str(len(x_val)))
    print('test docs: ' + str(len(x_test)))

    print('(5) training model...')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import LSTM, Embedding
    from keras.models import Sequential
    from keras.utils import plot_model

    model = Sequential()
    model.add(Dense(256, input_shape=(len(word_index) + 1,), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=4, batch_size=128)
    # model.save('mlp.h5')

    print('(6) testing model...')
    print(model.metrics_names)
    print(model.evaluate(x_test, y_test))
    mlp_preds = model.predict(x_test)

    return mlp_preds
def bys():
    # coding:utf-8

    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_text = train_texts + test_texts

    print('(2) doc to var...')
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    count_v0 = CountVectorizer()
    counts_all = count_v0.fit_transform(all_text)
    count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_train = count_v1.fit_transform(train_texts)
    print("the shape of train is " + repr(counts_train.shape))
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test_texts)
    print("the shape of test is " + repr(counts_test.shape))

    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels

    print('(3) Naive Bayes...')
    from sklearn.naive_bayes import MultinomialNB
    from sklearn import metrics
    clf = MultinomialNB(alpha=1)
    clf.fit(x_train, y_train)

    # preds = clf.predict(x_test)
    # preds = preds.tolist()

    test_acc = clf.score(x_test, y_test)
    train_acc = clf.score(x_train, y_train)
    print('test acc:{}'.format(test_acc))
    print('train acc:{}'.format(train_acc))

    bys_preds = clf.predict(x_test)
    return bys_preds
def svm():
    # coding:utf-8

    VECTOR_DIR = 'wiki.zh.vector.bin'

    MAX_SEQUENCE_LENGTH = 100
    EMBEDDING_DIM = 200
    TEST_SPLIT = 0.2

    print('(1) load texts...')
    train_texts = open('train_contents.txt', encoding="utf8").read().split('\n')
    train_labels = open('train_labels.txt').read().split('\n')
    test_texts = open('test_contents.txt', encoding="utf8").read().split('\n')
    test_labels = open('test_labels.txt').read().split('\n')
    all_text = train_texts + test_texts

    print('(2) doc to var...')
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    count_v0 = CountVectorizer();
    counts_all = count_v0.fit_transform(all_text);
    count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_);
    counts_train = count_v1.fit_transform(train_texts);
    print("the shape of train is " + repr(counts_train.shape))
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_);
    counts_test = count_v2.fit_transform(test_texts);
    print("the shape of test is " + repr(counts_test.shape))

    tfidftransformer = TfidfTransformer();
    train_data = tfidftransformer.fit(counts_train).transform(counts_train);
    test_data = tfidftransformer.fit(counts_test).transform(counts_test);

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels

    print('(3) SVM...')
    from sklearn.svm import SVC
    # svclf = SVC(c=1.0,kernel = 'linear')
    # kernels = ['linear','poly','rbf','sigmoid']
    kernels = ['linear']

    for kernel in kernels:
        print('-' * 20 + kernel)
        svclf = SVC(C=1.0, kernel=kernel)
        svclf.fit(x_train, y_train)

        test_acc = svclf.score(x_test, y_test)
        train_acc = svclf.score(x_train, y_train)
        print('test acc:{}'.format(test_acc))
        print('train acc:{}'.format(train_acc))

        svm_preds = svclf.predict(x_test)
    return svm_preds


lstm_preds = lstm()
cnn_preds = cnn()
pre_lstm_preds = pre_lstm()
pre_cnn_preds = pre_cnn()
mlp_preds = mlp()
bys_preds = bys()
svm_preds = svm()

(1) load texts...
(2) doc to var...


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 65606 unique tokens.
Shape of data tensor: (21924, 100)
Shape of label tensor: (21924, 12)
(3) split data set...
train docs: 14031
val docs: 3508
test docs: 4385
(5) training model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          13121400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                2412      
Total params: 13,444,612
Trainable params: 13,444,612
Non-trainable params: 0
_________________________________________________________________
['loss', 'acc']
Train on 14031 samples, validate on 3508 samples
Epoch 1



9213 words not in w2v model
(5) training model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          8397696   
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               263200    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 12)                2412      
Total params: 8,663,308
Trainable params: 265,612
Non-trainable params: 8,397,696
_________________________________________________________________
['loss', 'acc']
Train on 14031 samples, validate on 3508 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
(6) testing model...
['loss', 'acc']
[0.531274515900128, 0.8513112884834664]
(1) lo

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(3) Naive Bayes...
test acc:0.8501387604070305
train acc:0.9561363636363637
(1) load texts...
(2) doc to var...
the shape of train is (17600, 62418)
the shape of test is (4324, 62418)
(3) SVM...
--------------------linear
test acc:0.8443570767807586
train acc:0.9908522727272727


In [53]:
import pandas as pd
all_pred1 = pd.DataFrame([list(lstm_preds1),list(cnn_preds1),list(pre_lstm_preds1),list(pre_cnn_preds1),list(mlp_preds1),list(bys_preds),list(svm_preds)])
# pred = all_pred.mode(axis=1).values()

In [None]:
lstm_preds = lstm()
cnn_preds = cnn()
pre_lstm_preds = pre_lstm()
pre_cnn_preds = pre_cnn()
mlp_preds = mlp()
bys_preds = bys()
svm_preds = svm()

In [32]:
lstm_preds1 = list(map(lambda x:x.argmax(),lstm_preds))  
cnn_preds1 = list(map(lambda x:x.argmax(),cnn_preds))  
pre_lstm_preds1 = list(map(lambda x:x.argmax(),pre_lstm_preds))  
pre_cnn_preds1 = list(map(lambda x:x.argmax(),pre_cnn_preds))  
mlp_preds1 =list(map(lambda x:x.argmax(),mlp_preds))  

In [5]:
lstm_preds.shape

(4385, 12)

In [7]:
lstm_preds[0]

array([0.00114512, 0.03445483, 0.00200022, 0.00404092, 0.00741396,
       0.01502563, 0.01248523, 0.00431627, 0.00325661, 0.00684551,
       0.06282775, 0.84618795], dtype=float32)

In [36]:
len(bys_preds)

4324

In [38]:
all_pred.drop([5,6],inplace=True)

In [45]:
prediction = all_pred.mode(axis=0).iloc[0]

TypeError: 'numpy.ndarray' object is not callable

In [40]:
all_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4375,4376,4377,4378,4379,4380,4381,4382,4383,4384
0,11,4,8,10,9,9,10,6,4,2,...,4.0,2.0,8.0,9.0,5.0,10.0,11.0,10.0,1.0,3.0
1,1,4,8,10,9,9,5,6,4,2,...,4.0,2.0,8.0,9.0,5.0,11.0,11.0,10.0,1.0,3.0
2,9,4,8,10,9,9,11,6,4,2,...,4.0,2.0,8.0,9.0,5.0,9.0,6.0,10.0,1.0,5.0
3,9,4,8,10,9,9,11,6,4,2,...,4.0,2.0,8.0,9.0,5.0,10.0,6.0,10.0,1.0,5.0
4,9,4,8,10,9,9,8,6,4,2,...,4.0,2.0,8.0,9.0,5.0,10.0,6.0,10.0,1.0,2.0


In [43]:
test_labels = open('test_labels.txt').read().split('\n')

In [50]:
len(test_labels)

4324

In [65]:
all_pred1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4375,4376,4377,4378,4379,4380,4381,4382,4383,4384
0,11,4,8,10,9,9,10,6,4,2,...,4,2,8,9,5,10,11,10,1,3
1,1,4,8,10,9,9,5,6,4,2,...,4,2,8,9,5,11,11,10,1,3
2,9,4,8,10,9,9,11,6,4,2,...,4,2,8,9,5,9,6,10,1,5
3,9,4,8,10,9,9,11,6,4,2,...,4,2,8,9,5,10,6,10,1,5
4,9,4,8,10,9,9,8,6,4,2,...,4,2,8,9,5,10,6,10,1,2
5,0,0,0,0,0,0,0,0,0,0,...,4,2,8,9,5,10,6,10,1,5
6,0,0,0,0,0,0,0,0,0,0,...,4,2,8,9,5,9,11,10,1,9


In [63]:
bys_preds1=([0]*61) + list(bys_preds)
svm_preds1=([0]*61) + list(svm_preds)

In [64]:
all_pred1 = pd.DataFrame([list(lstm_preds1),list(cnn_preds1),list(pre_lstm_preds1),list(pre_cnn_preds1),list(mlp_preds1),list(bys_preds1),list(svm_preds1)])

In [60]:
svm_preds1

In [62]:
([0]*61) + list(bys_preds)

4385

In [70]:
prediction = all_pred1.iloc[:,61:].mode().iloc[0,:].values

  warn("Unable to sort modes: %s" % e)
  warn("Unable to sort modes: %s" % e)


In [71]:
len(prediction)

4324

In [73]:
prediction

array([2.0, 6.0, 2.0, ..., 10.0, 1.0, 3.0], dtype=object)

In [79]:
labels = np.array(list(map(lambda x: int(x),test_labels)))

In [80]:
labels

array([ 2,  6,  1, ..., 10,  1,  2])

In [89]:
((labels - prediction)==0).sum()

3747

In [85]:
prediction = prediction.astype('int')

In [90]:
3747/4324

0.866558741905643