In [40]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano' # Why theano why not
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

plt.switch_backend('agg')
%matplotlib inline

In [4]:
# !pip install theano

Collecting theano
[?25l  Downloading https://files.pythonhosted.org/packages/7d/c4/6341148ad458b6cd8361b774d7ee6895c38eab88f05331f22304c484ed5d/Theano-1.0.4.tar.gz (2.8MB)
[K     |████████████████████████████████| 2.8MB 14.5MB/s eta 0:00:01
Building wheels for collected packages: theano
  Building wheel for theano (setup.py) ... [?25ldone
[?25h  Created wheel for theano: filename=Theano-1.0.4-cp36-none-any.whl size=2667179 sha256=a840331a2e4da34a7a466826df6083d8d9024e747e2627b34c51862f3faa5c51
  Stored in directory: /root/.cache/pip/wheels/88/fb/be/483910ff7e9f703f30a10605ad7605f3316493875c86637014
Successfully built theano
Installing collected packages: theano
Successfully installed theano-1.0.4
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [29]:
!ls

lost+found  test.tsv  train.tsv


In [27]:
cd storage/

[Errno 2] No such file or directory: 'storage/'
/storage


In [6]:
def clean_str(string):
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

In [17]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [9]:
df = pd.read_csv('train.tsv', sep='\t', header=0)

In [10]:
df1 = pd.read_csv('test.tsv', sep='\t', header=0)

In [11]:
macronum=sorted(set(df['label']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))

def fun(i):
    return macro_to_id[i]

df['label']=df['label'].apply(fun)

In [15]:
texts = []
labels = []


for idx in range(df['text'].shape[0]):
    text = BeautifulSoup(df['text'][idx])
    texts.append(clean_str(str(text.get_text().encode())))

for idx in df['label']:
    labels.append(idx)

In [18]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 169462


In [19]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:20000]
y_train = labels[:20000]
x_val = data[:20000]
y_val = labels[:20000]

Shape of Data Tensor: (416768, 1000)
Shape of Label Tensor: (416768, 2)


In [45]:
x_train, x_test, y_train, y_test = train_test_split(data[:40000], labels[:40000], test_size=0.2, random_state=7)

In [49]:
y_train

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]], dtype=float32)

Total 400000 word vectors in Glove 6B 100d.


In [31]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [51]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(5)(l_cov2)
# l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
# l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
# l_flat = Flatten()(l_pool3)
l_flat = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(macronum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         16946300  
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 39, 128)           0         
_________________________________________________________________
flatten_5 (Flatten)

In [52]:
history=model.fit(x_train, y_train, validation_split=0.2,epochs=15, batch_size=30,callbacks=[cp])

Train on 25600 samples, validate on 6400 samples
Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.66859, saving model to model_cnn.hdf5
Epoch 2/15

Epoch 00002: val_acc did not improve from 0.66859
Epoch 3/15

Epoch 00003: val_acc improved from 0.66859 to 0.67172, saving model to model_cnn.hdf5
Epoch 4/15

Epoch 00004: val_acc improved from 0.67172 to 0.68187, saving model to model_cnn.hdf5
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.68187
Epoch 6/15
  690/25600 [..............................] - ETA: 12:13 - loss: 0.2401 - acc: 0.8942

KeyboardInterrupt: 

In [53]:
fig1 = plt.figure()
plt.plot(history.history['loss'],'r',linewidth=3.0)
plt.plot(history.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves :CNN',fontsize=16)
fig1.savefig('loss_cnn.png')
plt.show()

NameError: name 'history' is not defined

<Figure size 432x288 with 0 Axes>

In [None]:
fig2=plt.figure()
plt.plot(history.history['acc'],'r',linewidth=3.0)
plt.plot(history.history['val_acc'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN',fontsize=16)
fig2.savefig('accuracy_cnn.png')
plt.show()