In [1]:
import pandas as pd
from os.path import join
from sklearn.utils import shuffle

# Emotion classes in label order. A nested list groups several source files
# under a single class label.
emotions = ["happy", "sad", ["disgust", "disgust2"], "angry", "fear", "surprise"]
dir_path = "gathering/ja_tweets_sentiment"
size = 60000  # number of rows sampled per class

# Collect one labelled frame per source file, then concatenate once.
frames = []
for label, names in enumerate(emotions):
    # Treat a single emotion name as a one-element group so that grouped and
    # ungrouped classes share exactly one code path.
    if not isinstance(names, list):
        names = [names]
    # Grouped classes split the per-class budget evenly across their files.
    rows_per_file = size // len(names)
    for name in names:
        tweets = shuffle(pd.read_json(join(dir_path, "{}.json".format(name))))
        # assign() returns a new frame, so no column is written into a slice
        # of the shuffled frame.
        frames.append(tweets.iloc[:rows_per_file].assign(label=label))

# NOTE(review): shuffle() is called without random_state, so the sampled rows
# differ between runs.
df = pd.concat(frames)
df.shape

(360000, 11)

In [2]:
# Randomly permute the rows of the concatenated frame (sklearn.utils.shuffle).
# NOTE(review): no random_state is passed, so the row order is not reproducible.
df = shuffle(df)

In [3]:
# Separate model inputs from targets.
X = df['text']  # the 'text' column; presumably the raw tweet strings - TODO confirm
y = df['label']  # integer class id: position of the emotion (group) in `emotions`

In [4]:
# Install the SentencePiece package that is imported in the following cell.
# NOTE(review): the version is not pinned; the captured run installed sentencepiece 0.1.4.
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/b0/65/98eb38dfa92c4a1414570db03a4b1eb6cf79f35d0d86da8fae117d56d4e3/sentencepiece-0.1.4-cp36-cp36m-manylinux1_x86_64.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 3.9MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.4


In [5]:
import sentencepiece as spm

In [6]:
# Create a SentencePiece processor and load its model file from a path
# relative to the working directory.
sp = spm.SentencePieceProcessor()
sp.Load("twitterstream2word2vec/model/sp/sp.model")

True

In [8]:
import re
# Patterns whose matches are deleted from a text before it is tokenized.
# Raw string literals are used so that \( \) and \w reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
regexs = [
    # URLs starting with http:// or https://
    re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
    # "@" followed by 1 to 15 word characters (user mentions)
    re.compile(r'@(\w){1,15}'),
]

def tokenize(data, regexs, sp=sp):
    """Clean each text and split it into space-separated SentencePiece pieces.

    Args:
        data: iterable of texts.
        regexs: iterable of patterns; every match is removed from the text
            before tokenization.
        sp: object providing ``EncodeAsPieces``. Defaults to the module-level
            processor that exists when this function is defined.

    Returns:
        list of str, one entry per input item in input order. An item that
        cannot be processed (for example a non-string value) yields "".
    """
    results = []
    for text in data:
        try:
            for regex in regexs:
                text = re.sub(regex, "", text)
            pieces = sp.EncodeAsPieces(text)
            # Strip the "▁" marker and any '#' characters from every piece.
            text = ' '.join(piece.replace("▁", "").replace("#", "") for piece in pieces)
        except Exception:
            # Best effort: keep one output per input even when an item fails.
            # Unlike a bare `except:`, this does not swallow KeyboardInterrupt
            # or SystemExit, so a long run can still be interrupted.
            text = ""
        results.append(text)
    return results

In [10]:
# Replace the text Series with a list of cleaned, space-joined SentencePiece pieces.
# NOTE(review): X is rebound in place, so re-running this cell tokenizes the
# already-tokenized strings again.
X = tokenize(X, regexs, sp)

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

max_features=32000  # passed to the Keras Tokenizer as num_words (vocabulary cap)
maxlen = 280  # NOTE(review): the preprocess() calls in this cell do not pass it; they rely on the function's own default of 280

# Integer class ids -> one-hot rows. NOTE(review): y is rebound, so this cell
# is not safe to run twice on the same kernel state.
y = to_categorical(y)
# Default train_test_split settings; no random_state, so the split is not reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# filters="" and lower=False: no character filtering and no lower-casing are
# requested, so the pre-tokenized text is used as produced by tokenize().
tokenizer = Tokenizer(num_words=max_features, filters="", lower=False)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(list(X_train))

def preprocess(data, tokenizer, maxlen=280):
    """Encode texts as integer id sequences and pad them to a common length.

    Args:
        data: texts accepted by ``tokenizer.texts_to_sequences``.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: sequence length forwarded to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts.
    """
    id_sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(id_sequences, maxlen=maxlen)
    return padded

# Convert both splits to padded id sequences with the tokenizer fitted on the
# training texts; maxlen is left at preprocess()'s default.
X_train = preprocess(X_train, tokenizer)
X_test = preprocess(X_test, tokenizer)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [21]:
# Carve a validation set out of the training split (train_test_split defaults).
# NOTE(review): X_train/y_train are rebound, so every re-run shrinks the training
# set further; the execution count (21) also shows this cell was run out of
# document order, after the model-building cells.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [12]:
# Word -> integer id mapping from the fitted tokenizer; iterated later to fill
# the embedding matrix.
word_index = tokenizer.word_index

In [13]:
from gensim.models import word2vec

In [14]:
# Load a saved gensim Word2Vec model from a path relative to the working directory.
word_vectors = word2vec.Word2Vec.load("twitterstream2word2vec/model/w2v_gensim/word2vec_tweet.model")

In [16]:
import numpy as np

EMBEDDING_DIM = 200
# One row per word id, capped at max_features. The +1 leaves a row for id 0,
# which is never filled below and stays all-zero (presumably the padding id - TODO confirm).
vocabulary_size = min(len(word_index)+1, max_features)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for token, token_id in word_index.items():
    # Ids at or beyond the cap have no row in the matrix.
    if token_id < max_features:
        try:
            vector = word_vectors[token]
        except KeyError:
            # Token missing from the word2vec model: draw a random vector
            # (mean 0, standard deviation sqrt(0.25)).
            vector = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
        embedding_matrix[token_id] = vector

# The word2vec model is no longer referenced after this point.
del word_vectors

from keras.layers import Embedding
# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

  # This is added back by InteractiveShellApp.init_path()


In [17]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

sequence_length = X_train.shape[1]
filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5
# One output unit per column of the one-hot targets, instead of a hard-coded 6.
num_classes = y_train.shape[1]

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# Add a trailing channel axis so Conv2D can consume the embedded sequence as a
# (sequence_length, EMBEDDING_DIM, 1) tensor.
reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

# One convolution + pooling branch per filter size, all fed by the same tensor.
pooled_outputs = []
for filter_size in filter_sizes:
    # The kernel spans the full embedding width, so it only slides along the
    # sequence axis.
    conv = Conv2D(num_filters, (filter_size, EMBEDDING_DIM), activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(reshape)
    # The pool window covers every position the convolution produced, leaving
    # a single maximum per filter.
    pooled = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(conv)
    pooled_outputs.append(pooled)

merged_tensor = concatenate(pooled_outputs, axis=1)
flatten = Flatten()(merged_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

# Build the model that maps the padded id sequences to class probabilities.
model = Model(inputs, output)

In [18]:
# Adam optimizer with a learning rate of 1e-3.
adam = Adam(lr=1e-3)

# Categorical cross-entropy matches the one-hot targets and softmax output;
# accuracy is tracked as the reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])

In [19]:
# Early stopping keyed on validation loss; every other setting is left at the
# EarlyStopping default.
callbacks = [EarlyStopping(monitor='val_loss')]

In [22]:
# Train for at most 10 epochs in batches of 1000, validating on (X_val, y_val).
# The early-stopping callback may end training sooner (the captured log stops at epoch 5).
model.fit(X_train, y_train, batch_size=1000, epochs=10, verbose=1, validation_data=(X_val, y_val), callbacks=callbacks)

Train on 202500 samples, validate on 67500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x7f4f8b2a6e10>

In [23]:
import numpy as np
# Reduce predicted probability rows and one-hot target rows to integer class ids.
y_preds = model.predict(X_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

In [24]:
# One display name per class: a grouped entry is represented by its first member.
emolabels = [entry[0] if isinstance(entry, list) else entry for entry in emotions]

In [25]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split, labelled with the emotion names.
print(classification_report(y_true, y_preds, target_names=emolabels))

             precision    recall  f1-score   support

      happy       0.56      0.56      0.56     15007
        sad       0.60      0.54      0.57     15012
    disgust       0.43      0.27      0.34     15214
      angry       0.47      0.56      0.51     14893
       fear       0.38      0.40      0.39     14888
   surprise       0.42      0.51      0.46     14986

avg / total       0.48      0.47      0.47     90000



In [28]:
# Hand-written Japanese sentences scored by the trained model below as a
# qualitative check.
# NOTE(review): the first six look like one sentence per emotion class and the
# rest like neutral statements - confirm the intent with the author.
examples = [
    "まじきもい、あいつ",
    "今日は楽しい一日だったよ",
    "ペットが死んだ、実に悲しい",
    "ふざけるな、死ね",
    "ストーカー怖い",
    "すごい！ほんとに！？",
    "葉は植物の構成要素です。",
    "ホームレスと囚人を集めて革命を起こしたい",
    "数学は科学に用いられます。",
    "りんごは赤い。",
    "とうもろこしは食べ物です。"
]

# Push the example sentences through the same cleaning, encoding and padding
# steps as the training data.
example_tokens = tokenize(examples, regexs, sp)
targets = preprocess(example_tokens, tokenizer, maxlen=maxlen)

# Header row of class names, then one tab-separated row of rounded
# percentages per example.
print('\t'.join(emolabels))
for class_probs in model.predict(targets):
    percentages = [str(round(100.0*p)) for p in class_probs]
    print('\t'.join(percentages))

happy	sad	disgust	angry	fear	surprise
1.0	3.0	45.0	29.0	18.0	4.0
63.0	5.0	8.0	4.0	5.0	15.0
2.0	44.0	16.0	8.0	27.0	3.0
1.0	1.0	9.0	86.0	3.0	1.0
0.0	5.0	20.0	7.0	62.0	6.0
3.0	5.0	3.0	3.0	11.0	76.0
48.0	7.0	9.0	14.0	7.0	15.0
13.0	6.0	17.0	35.0	13.0	16.0
18.0	8.0	18.0	23.0	19.0	13.0
21.0	9.0	14.0	16.0	16.0	23.0
19.0	7.0	23.0	30.0	12.0	9.0


In [29]:
# Persist the trained model to an HDF5 file in the working directory.
# NOTE(review): the file name contains ':', which is not a valid file-name
# character on Windows.
model.save("model_2018-08-29-14:12.h5")

import pickle

# Pickle the fitted tokenizer so the same word -> id mapping can be reloaded
# together with the saved model.
with open("tokenizer_cnn_ja.pkl", "wb") as f:
    pickle.dump(tokenizer, f)