In [1]:
import pandas as pd
df = pd.read_csv("../data/sentiment140.csv", header=None, error_bad_lines=False, encoding='latin1')

df = df[[0, 5]]
df.columns = ["label", "text"]
df['label'] = df.label.factorize()[0]

In [3]:
from sklearn.utils import shuffle
df = shuffle(df, random_state=42)
df_sample = df.iloc[:]
df_sample['text'] = df_sample['text'].apply(lambda x: x.lower())

In [4]:
import numpy as np
def encoding(texts, max_len=None):
    tx2chs = [list(text) for text in texts]
    if max_len is None:
        max_len = max([len(chs) for chs in tx2chs])
    vectors = [list(map(ord, chs))+[0 for _ in range(max_len-len(chs))] for chs in tx2chs]
    max_features = max([max(x) for x in vectors])
    return np.array(vectors), max_len, max_features

X, max_len, max_features = encoding(df_sample['text'])

In [12]:
max_len, max_features

(374, 251)

In [5]:
from keras.utils import to_categorical
y = to_categorical(df_sample['label'])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [8]:
model = Sequential()
model.add(Embedding(max_features+1, 150, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [9]:
epochs = 30
batch_size = 2000
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 900000 samples, validate on 300000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f3c3e1944e0>

In [14]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)
y_true = np.argmax(y_test, axis=1)

In [15]:
from sklearn.metrics import roc_auc_score, accuracy_score
roc_auc_score(y_true, y_preds), accuracy_score(y_true, y_preds)

(0.7569090255293192, 0.7568025)

In [16]:
model.save("model_2018-08-27-10:22.h5")

In [17]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 900000 samples, validate on 300000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa441230908>

In [18]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)

In [19]:
roc_auc_score(y_true, y_preds), accuracy_score(y_true, y_preds)

(0.7799538228919617, 0.7799725)

In [20]:
model.save("model_2018-08-27-10:25.h5")

In [7]:
from keras.models import load_model
model = load_model("model_2018-08-27-10:25.h5")

In [10]:
epochs = 10
batch_size = 2000
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 900000 samples, validate on 300000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f37919cf320>

In [12]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)

In [13]:
from sklearn.metrics import roc_auc_score, accuracy_score
y_true = np.argmax(y_test, axis=1)
roc_auc_score(y_true, y_preds), accuracy_score(y_true, y_preds)

(0.7893832450051664, 0.7893125)

In [14]:
model.save("model_2018-08-27-10:46.h5")

In [15]:
epochs = 10
batch_size = 2000
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 900000 samples, validate on 300000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f37901f2cf8>

In [16]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)

In [17]:
roc_auc_score(y_true, y_preds), accuracy_score(y_true, y_preds)

(0.796649954004895, 0.7966525)

In [18]:
model.save("model_2018-08-27-10:55.h5")

In [19]:
epochs = 10
batch_size = 2000
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)

Train on 900000 samples, validate on 300000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f37901f2e48>

In [20]:
y_preds = model.predict(X_test)
y_preds = np.argmax(y_preds, axis=1)
roc_auc_score(y_true, y_preds), accuracy_score(y_true, y_preds)

(0.7973590883686956, 0.7974025)

In [21]:
model.save("model_2018-08-27-11:15.h5")

In [26]:
examples = ["Fuck you asshole.", "I am happy", "very scared...", "why do people hate Trump?"]
example_enc,_,_ = encoding(examples, max_len)
model.predict(example_enc)

array([[0.7620187 , 0.23798132],
       [0.07904999, 0.92095006],
       [0.9092788 , 0.09072125],
       [0.9782016 , 0.02179843]], dtype=float32)

(340700, 10)