In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Vocabulary size: passed to Tokenizer(num_words=...) and used as the width of
# the multi-hot feature matrix built further down.
NB_WORDS = 10000
# Seed for the row shuffle below, so that Restart & Run All yields the same row
# order (and therefore the same train/test split) on every run.
SHUFFLE_SEED = 37

df = pd.read_csv('Tweets.csv')
# Shuffle the rows with a dedicated seeded generator instead of the unseeded
# global NumPy RNG, which made every run produce a different split.
df = df.reindex(np.random.RandomState(SHUFFLE_SEED).permutation(df.index))
# Only the tweet text and its sentiment label are used from here on.
df = df[['text', 'airline_sentiment']]

# The stopword corpus must be available locally before stopwords.words() is called.
import nltk
nltk.download('stopwords')

def remove_stopwords(input_text):
    """Drop English stopwords and one-character tokens from a text.

    The text is split on whitespace. A token is kept when it is longer than
    one character and is either not in the NLTK English stopword list or is
    one of the whitelisted negations ("n't", "not", "no"), which may carry
    sentiment. The comparison is exact (case-sensitive), as tokens are not
    normalised before the lookup.

    Args:
        input_text: The string to clean.

    Returns:
        The kept tokens joined by single spaces.
    """
    # Build the set of removable words only once: loading the corpus list for
    # every tweet and scanning it linearly is needlessly slow when this
    # function is applied to a whole DataFrame column.
    removable = getattr(remove_stopwords, "_removable", None)
    if removable is None:
        whitelist = {"n't", "not", "no"}
        removable = frozenset(stopwords.words('english')) - whitelist
        remove_stopwords._removable = removable
    return " ".join(
        word for word in input_text.split()
        if word not in removable and len(word) > 1
    )
    
def remove_mentions(input_text):
    """Return input_text with every '@' plus the word characters after it deleted.

    Surrounding whitespace is left untouched, so removing a mention can leave
    leading or doubled spaces behind.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)
       
# Clean every tweet: stopwords are dropped first, then @mentions are stripped.
df.text = df.text.apply(lambda tweet: remove_mentions(remove_stopwords(tweet)))

# Hold out 10% of the tweets as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    df.text, df.airline_sentiment, test_size=0.1, random_state=37
)
# Sanity check: texts and labels must stay aligned after the split.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# The tokenizer is fitted on the training tweets only; the test tweets are
# converted with the vocabulary learned from the training set.
tk = Tokenizer(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk.fit_on_texts(X_train)

# Turn each tweet into a list of integer word ids.
X_train_seq, X_test_seq = (tk.texts_to_sequences(texts) for texts in (X_train, X_test))

# Map the sentiment strings to integer class ids, again fitting on the training labels only.
le = LabelEncoder()
le.fit(y_train)
y_train_le, y_test_le = le.transform(y_train), le.transform(y_test)

from keras.preprocessing.sequence import pad_sequences
# Bring every training sequence to exactly 26 word ids. The same length must be
# used when the test sequences are padded.
data = pad_sequences(X_train_seq, maxlen=26)
labels = np.asarray(y_train_le)

# Intended sizes of the training and validation parts: 2928 / (10248 + 2928)
# equals the test_size=2/9 used by the later train_test_split.
# NOTE(review): neither value is referenced in the visible code - confirm they
# are still needed.
training_samples = 10248
validation_samples = 2928

# Shuffle features and labels with the same permutation so they stay aligned.
# A dedicated seeded generator is used (instead of the unseeded global RNG) so
# that the later training/validation split is reproducible across runs.
indices = np.random.RandomState(37).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Multi-hot encode the padded sequences: onehot[i, w] is 1 when word id w occurs
# in sample i. Id 0 is the padding value and is skipped, so column 0 stays 0.
# Allocated directly as float32 (the dtype the model input is cast to anyway),
# which halves the memory of this large dense matrix.
onehot = np.zeros((data.shape[0], NB_WORDS), dtype=np.float32)
# Vectorised replacement for the nested Python loop: locate all non-padding
# positions at once and set the matching (sample, word id) cells.
sample_idx, position_idx = np.nonzero(data)
onehot[sample_idx, data[sample_idx, position_idx]] = 1

            
from sklearn.model_selection import train_test_split
# accuracy_score is imported from the public sklearn.metrics namespace; the
# private sklearn.metrics.classification module is deprecated and is absent
# from newer scikit-learn releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from models import SupervisedDBNClassification


# Features (multi-hot matrix) and integer class labels built above
X, Y = onehot, labels
# Cast to float32 for the model. This is only a dtype conversion, not a
# rescaling; copy=False avoids duplicating the matrix when it already is float32.
X = X.astype(np.float32, copy=False)

# Hold out 2/9 of the samples as a validation part.
# NOTE: this rebinds X_train / X_test, which previously held the raw tweet
# texts; from here on they are numeric feature matrices.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=2/9, random_state=0)
# Configure the deep belief network: two hidden layers of 256 units each, with
# separate learning rates and epoch counts for RBM pre-training and backprop.
classifier = SupervisedDBNClassification(hidden_layers_structure=[256, 256],
                                         learning_rate_rbm=0.05,
                                         learning_rate=0.1,
                                         n_epochs_rbm=10,
                                         n_iter_backprop=10,
                                         batch_size=128,
                                         activation_function='relu',
                                         dropout_p=0.05)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liyuantan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
%%time
# Train the classifier on the training part of the 2/9 split; the held-out
# X_test / Y_test part is not passed to fit().
classifier.fit(X_train, Y_train)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 9.692118
>> Epoch 2 finished 	RBM Reconstruction error 9.684284
>> Epoch 3 finished 	RBM Reconstruction error 9.649483
>> Epoch 4 finished 	RBM Reconstruction error 9.505414
>> Epoch 5 finished 	RBM Reconstruction error 9.295899
>> Epoch 6 finished 	RBM Reconstruction error 9.254849
>> Epoch 7 finished 	RBM Reconstruction error 9.237298
>> Epoch 8 finished 	RBM Reconstruction error 9.210901
>> Epoch 9 finished 	RBM Reconstruction error 9.168770
>> Epoch 10 finished 	RBM Reconstruction error 9.103865
>> Epoch 1 finished 	RBM Reconstruction error 0.462211
>> Epoch 2 finished 	RBM Reconstruction error 0.333413
>> Epoch 3 finished 	RBM Reconstruction error 0.138130
>> Epoch 4 finished 	RBM Reconstruction error 0.093934
>> Epoch 5 finished 	RBM Reconstruction error 0.090670
>> Epoch 6 finished 	RBM Reconstruction error 0.088061
>> Epoch 7 finished 	RBM Reconstruction error 0.087037
>> Epoch 8 finished 	RBM Reconstructi

SupervisedDBNClassification()

In [3]:
# Prepare the held-out test tweets exactly like the training data: pad to the
# same length (26) and take the integer labels encoded earlier.
x_test = pad_sequences(X_test_seq, maxlen=26)
y_test = np.asarray(y_test_le)

# Multi-hot encode: onehot_test[i, w] is 1 when word id w occurs in tweet i.
# Id 0 is padding and is skipped. The nested Python loop is replaced by a
# vectorised assignment, and the matrix is allocated as float32 directly.
onehot_test = np.zeros((x_test.shape[0], NB_WORDS), dtype=np.float32)
tweet_idx, position_idx = np.nonzero(x_test)
onehot_test[tweet_idx, x_test[tweet_idx, position_idx]] = 1

# x_test now refers to the float32 feature matrix fed to the classifier.
x_test = onehot_test.astype(np.float32, copy=False)

In [4]:
%%time
y_pred = classifier.predict(x_test)
print('Done.\nAccuracy: %f' % accuracy_score(y_test, y_pred))

Done.
Accuracy: 0.754781
CPU times: user 251 ms, sys: 43.4 ms, total: 295 ms
Wall time: 176 ms
