In [1]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import os
# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Vocabulary size: upper bound on the token ids kept by the Tokenizer and
# width of the multi-hot feature matrix built further down.
NB_WORDS = 10000

df = pd.read_csv('Tweets.csv')
# Shuffle the rows. The permutation is seeded: the later train_test_split
# calls use fixed random_state values, but those only fix *positions*, so an
# unseeded permutation here would still hand them different tweets each run.
df = df.reindex(np.random.RandomState(37).permutation(df.index))
# Keep only the tweet text and its sentiment label.
df = df[['text', 'airline_sentiment']]

import nltk
# Fetch the stop-word corpus read by remove_stopwords().
nltk.download('stopwords')

def remove_stopwords(input_text, stopwords_list=None, whitelist=("n't", "not", "no")):
    """Drop stop words and one-character tokens from a whitespace-separated text.

    Matching is case-insensitive, so capitalised stop words ("The", "And")
    are removed as well; the words that are kept retain their original casing.

    Args:
        input_text: Text to clean; it is split on whitespace.
        stopwords_list: Iterable of stop words. Defaults to NLTK's English
            list, which is loaded on every call -- pass a preloaded list when
            cleaning many texts outside of this notebook.
        whitelist: Words that are kept even if they are stop words, because
            they may carry sentiment (negations).

    Returns:
        The surviving words joined by single spaces.
    """
    if stopwords_list is None:
        stopwords_list = stopwords.words('english')

    # NOTE(review): the "n't" entry only protects a token that is exactly
    # "n't"; contractions such as "don't" are still removed whenever the
    # stop-word list contains them -- confirm whether that is intended.
    protected = {word.lower() for word in whitelist}
    blocked = {word.lower() for word in stopwords_list} - protected

    clean_words = [
        word for word in input_text.split()
        if len(word) > 1 and word.lower() not in blocked
    ]
    return " ".join(clean_words)
    
def remove_mentions(input_text):
    """Return the text with every ``@handle`` mention deleted.

    A mention is an ``@`` followed by one or more word characters; the
    surrounding whitespace is left untouched.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)
       
# Clean every tweet: stop words are dropped first, then @mentions are stripped.
df.text = df.text.apply(lambda tweet: remove_mentions(remove_stopwords(tweet)))

# Hold out 10% of the tweets as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    df.text,
    df.airline_sentiment,
    test_size=0.1,
    random_state=37,
)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# The vocabulary is learned from the training tweets only and then reused
# to convert both partitions into sequences of token ids.
tk = Tokenizer(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

# Map the sentiment strings to integer class ids (classes taken from y_train).
le = LabelEncoder().fit(y_train)
y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)

from keras.preprocessing.sequence import pad_sequences

# Bring every training sequence to exactly 26 token ids; id 0 marks padding
# and is skipped when the multi-hot matrix is filled below.
data = pad_sequences(X_train_seq, maxlen=26)
labels = np.asarray(y_train_le)

# Sizes of the training / validation partitions; they match the shapes
# printed after the 2/9 split performed later in this cell.
training_samples = 10248
validation_samples = 2928

# Shuffle samples and labels with one shared permutation. The rows were
# already permuted when the CSV was loaded, so this is only an extra mix;
# it is seeded so the fixed-random_state split that follows sees the same
# row order on every run.
indices = np.arange(data.shape[0])
np.random.RandomState(0).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Multi-hot (bag-of-words) encoding: onehot[i, w] == 1 iff token id w occurs
# anywhere in sample i. Vectorised replacement for the per-element loop.
onehot = np.zeros((data.shape[0], NB_WORDS))
rows, cols = np.nonzero(data)            # positions of all non-padding tokens
onehot[rows, data[rows, cols]] = 1
            
from sklearn.model_selection import train_test_split
# accuracy_score is imported from the public sklearn.metrics namespace; the
# private sklearn.metrics.classification module is not importable on
# current scikit-learn releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from models import SupervisedDBNClassification


# Features are the multi-hot rows, targets the integer-encoded labels.
X, Y = onehot, labels
# Cast the features to float32 (a dtype change only; values stay 0/1).
X = X.astype(np.float32)

# Hold out 2/9 of the samples (presumably the validation partition -- see
# validation_samples). NOTE: this rebinds X_train / X_test, which previously
# held the raw tweet text, to the encoded feature matrices.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=2/9, random_state=0)
print (X_train.shape)
print (X_test.shape)
print (Y_train.shape)
print (Y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liyuantan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
(10248, 10000)
(2928, 10000)
(10248,)
(2928,)


In [2]:
# Encode the held-out 10% test tweets exactly like the training data:
# pad to 26 token ids, then build the multi-hot matrix.
x_test = pad_sequences(X_test_seq, maxlen=26)
y_test = np.asarray(y_test_le)

# onehot_test[i, w] == 1 iff token id w occurs in test sample i (id 0 is
# padding and is skipped). Vectorised replacement for the per-element loop.
onehot_test = np.zeros((x_test.shape[0], NB_WORDS))
rows, cols = np.nonzero(x_test)
onehot_test[rows, x_test[rows, cols]] = 1

# From here on x_test is the float32 feature matrix, not the padded ids.
x_test = onehot_test.astype(np.float32)

In [3]:
def toNpBin(var, varName, outDir='npbin_test'):
    """Dump a NumPy array as raw bytes and encode its layout in the file name.

    ``ndarray.tofile`` stores neither shape nor dtype, so both are written
    into the name: ``<varName>_<dim0>-<dim1>-..._<dtype>_.npbin_test``.
    A 1-D array of length n is labelled ``n-1`` and ``float64`` is spelled
    ``double``.

    Args:
        var: NumPy array to write.
        varName: Prefix used for the file name and the progress message.
        outDir: Directory that receives the file; created when missing.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(outDir, exist_ok=True)

    typeStr = str(var.dtype)
    if typeStr == 'float64':
        typeStr = 'double'

    shapeArray = list(var.shape)
    if len(shapeArray) == 1:
        # Label vectors as a single column: (n,) -> "n-1".
        shapeArray.append(1)

    # "[2, 3]" -> "2-3"
    shapeStr = str(shapeArray)[1:-1].replace(', ', '-')
    filename = os.path.join(
        outDir, varName + '_' + shapeStr + '_' + typeStr + '_' + '.npbin_test')
    var.tofile(filename)

    print ('write ' + varName +' done!')

In [4]:
# Write the training partition (X_train / Y_train) and the held-out test
# set (x_test / y_test -- lower-case names) to disk, in this order.
for array, tag in (
    (X_train, 'traindata'),
    (x_test, 'testdata'),
    (Y_train, 'trainlabel'),
    (y_test, 'testlabel'),
):
    toNpBin(array, tag)

write traindata done!
write testdata done!
write trainlabel done!
write testlabel done!
