In [7]:
import pandas as pd
from os.path import join
from sklearn.utils import shuffle

emotions = ["happy", "sad", ["disgust", "disgust2"], "angry", "fear", "surprise"]
dir_path = "gathering/ja_tweets_sentiment"
size = 60000
df = []
for i, es in enumerate(emotions):
    if isinstance(es, list):
        for e in es:
            data = shuffle(pd.read_json(join(dir_path, "{}.json".format(e)))).iloc[:int(size/len(es))]
            data['label'] = i
            df.append(data)
    else:
        data = shuffle(pd.read_json(join(dir_path, "{}.json".format(es)))).iloc[:int(size)]
        data['label'] = i
        df.append(data)

df = pd.concat(df)
df.shape

(360000, 11)

In [13]:
df = shuffle(df)

In [14]:
X = df['text']
y = df['label']

In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

max_features=10000
maxlen = 280

y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y)

tokenizer = Tokenizer(num_words=max_features, filters="", char_level=True)
tokenizer.fit_on_texts(list(X_train))

X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [18]:
X_train.shape, X_val.shape, X_test.shape

((202500, 280), (67500, 280), (90000, 280))

In [19]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [20]:
model = Sequential()
model.add(Embedding(max_features, 150, input_length=maxlen))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(6, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [21]:
epochs = 5
batch_size = 1000

In [23]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 202500 samples, validate on 67500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3052643e80>

In [25]:
import numpy as np
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)
y_true = np.argmax(y_test, axis=1)

In [26]:
emolabels = []
for e in emotions:
    if isinstance(e, list):
        emolabels.append(e[0])
    else:
        emolabels.append(e)

In [28]:
from sklearn.metrics import classification_report

In [30]:
print(classification_report(y_true, y_preds, target_names=emolabels))

             precision    recall  f1-score   support

      happy       0.54      0.55      0.54     15019
        sad       0.61      0.50      0.55     14984
    disgust       0.39      0.48      0.43     14978
      angry       0.41      0.62      0.49     15017
       fear       0.58      0.42      0.49     15106
   surprise       0.51      0.34      0.41     14896

avg / total       0.50      0.48      0.48     90000



In [31]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 202500 samples, validate on 67500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3052790ef0>

In [32]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)

In [33]:
print(classification_report(y_true, y_preds, target_names=emolabels))

             precision    recall  f1-score   support

      happy       0.59      0.48      0.53     15019
        sad       0.50      0.59      0.54     14984
    disgust       0.40      0.48      0.44     14978
      angry       0.43      0.62      0.51     15017
       fear       0.56      0.45      0.50     15106
   surprise       0.56      0.32      0.41     14896

avg / total       0.51      0.49      0.49     90000



In [34]:
model.save("models/model_2018-08-28-9:33.h5")

In [35]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 202500 samples, validate on 67500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3052790e48>

In [36]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)

In [37]:
print(classification_report(y_true, y_preds, target_names=emolabels))

             precision    recall  f1-score   support

      happy       0.52      0.60      0.56     15019
        sad       0.56      0.54      0.55     14984
    disgust       0.40      0.49      0.44     14978
      angry       0.45      0.60      0.52     15017
       fear       0.66      0.38      0.48     15106
   surprise       0.51      0.38      0.44     14896

avg / total       0.52      0.50      0.50     90000



In [38]:
model.save("models/model_2018-08-28-9:34.h5")

In [39]:
def preprocess(data, tokenizer, maxlen=280):
    return(pad_sequences(tokenizer.texts_to_sequences(data), maxlen=maxlen))

In [54]:
examples = [
    "まじきもい、あいつ",
    "今日は楽しい一日だったよ",
    "ペットが死んだ、実に悲しい",
    "ふざけるな、死ね",
    "ストーカー怖い",
    "すごい！ほんとに！？"
]

targets = preprocess(examples, tokenizer, maxlen=maxlen)
print('\t'.join(emolabels))
for i, ds in enumerate(model.predict(targets)):
    print('\t'.join([str(round(100.0*d)) for d in ds]))



happy	sad	disgust	angry	fear	surprise
2.0	3.0	7.0	4.0	3.0	2.0
10.0	1.0	3.0	1.0	1.0	1.0
0.0	23.0	5.0	0.0	1.0	0.0
1.0	2.0	9.0	13.0	3.0	1.0
0.0	2.0	12.0	2.0	12.0	1.0
1.0	2.0	2.0	2.0	2.0	7.0
