In [None]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import os
# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

max_words = 10000  # Vocabulary size: number of words kept in the tokenizer dictionary

# Load the airline tweets and shuffle the rows.
# The permutation comes from a locally seeded generator: with an unseeded
# shuffle the row order changes on every run, so the train/test split below
# would differ between runs even though it passes a fixed random_state.
df = pd.read_csv('input/airline/Tweets.csv')
df = df.reindex(np.random.RandomState(37).permutation(df.index))
# Only the tweet text and its sentiment label are used from here on.
df = df[['text', 'airline_sentiment']]

# Fetch the NLTK stopword corpus so stopwords.words('english') can be read.
import nltk
nltk.download('stopwords')

def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(input_text):
    """Drop English stopwords and one-character tokens from a text.

    The text is split on whitespace. A token is kept when it is longer than
    one character and is either not a stopword or is one of the negation
    words in the whitelist. Kept tokens are returned in their original
    casing, joined by single spaces.

    Matching is case-insensitive: the NLTK English stopwords are lowercase,
    so a case-sensitive comparison would let capitalised stopwords such as
    "The" or "This" slip through.

    Parameters
    ----------
    input_text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    stopword_set = _english_stopwords()
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}

    clean_words = []
    for word in input_text.split():
        if len(word) <= 1:
            continue
        lowered = word.lower()
        if lowered in whitelist or lowered not in stopword_set:
            clean_words.append(word)
    return " ".join(clean_words)
    
def remove_mentions(input_text):
    """Return ``input_text`` with every ``@`` + word-characters run deleted.

    Only the mention itself is removed; surrounding whitespace and
    punctuation are left in place.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)
       
# Clean every tweet: stopwords are stripped first, then @mentions.
df['text'] = df['text'].apply(remove_stopwords).apply(remove_mentions)

# Hold out 10% of the tweets as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['airline_sentiment'],
    test_size=0.1,
    random_state=37,
)
# Sanity check: each feature split has as many rows as its label split.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# Build the word index from the training tweets only; the test tweets are
# encoded with that same fitted tokenizer.
tk = Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk.fit_on_texts(X_train)

# Convert each tweet into its sequence of integer word indices.
X_train_seq, X_test_seq = (tk.texts_to_sequences(texts) for texts in (X_train, X_test))

# Encode the sentiment labels as integers, fitting on the training labels.
le = LabelEncoder()
le.fit(y_train)
y_train_le, y_test_le = le.transform(y_train), le.transform(y_test)

from keras.preprocessing.sequence import pad_sequences

# Bring every tokenized training tweet to a fixed length of 26 word indices.
data = pad_sequences(X_train_seq, maxlen=26)
labels = np.asarray(y_train_le)

training_samples = 10248  # We will be training on 10248 samples
validation_samples = 2928  # Size of the validation set

# Shuffle samples and labels with the same permutation so they stay aligned.
# The permutation is drawn from a locally seeded generator: an unseeded
# shuffle would reorder the rows differently on every run and make the
# fixed random_state of the later train/validation split ineffective.
indices = np.random.RandomState(0).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Multi-hot encoding of the padded sequences: row i gets a 1 in column k-1 for
# every non-zero word index k that occurs in sample i. Entries equal to 0 are
# skipped, so they never set a column.
onehot = np.zeros((data.shape[0], max_words))
# Vectorised over the whole matrix, so the sequence length is taken from
# `data` itself instead of being repeated here as a hard-coded constant.
row_idx, col_idx = np.nonzero(data)
onehot[row_idx, data[row_idx, col_idx] - 1] = 1
            
from sklearn.model_selection import train_test_split
# accuracy_score is imported from the public sklearn.metrics package; the
# private sklearn.metrics.classification module was deprecated and later
# removed from scikit-learn, so importing from it fails on current releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Features are the multi-hot matrix, targets the integer-encoded labels
X, Y = onehot, labels
# Cast the features to 32-bit floats (a dtype conversion, not a rescaling)
X = (X).astype(np.float32)

# Hold out 2/9 of the samples; the split itself uses a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=2/9, random_state=0)

In [None]:
def toNpBin(var, varName, out_dir='./npbin_airline'):
    """Dump a NumPy array to a raw binary ``.npbin`` file.

    The file name carries the metadata needed to interpret the raw bytes:
    ``<varName>_<dim0>-<dim1>-..._<dtype>_.npbin``. A one-dimensional array
    of length N is labelled ``N-1`` and the dtype ``float64`` is spelled
    ``double``; every other dtype uses its NumPy name.

    Parameters
    ----------
    var : numpy.ndarray
        Array to write; its contents are written with ``ndarray.tofile``.
    varName : str
        Name used as the file-name prefix and in the progress message.
    out_dir : str, optional
        Directory that receives the file. It is created if missing.
        Defaults to ``./npbin_airline``.

    Returns
    -------
    None
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    type_str = str(var.dtype)
    if type_str == 'float64':
        type_str = 'double'

    dims = list(var.shape)
    if len(dims) == 1:
        # A 1-D array is described as a single column in the file name.
        dims.append(1)
    shape_tag = '-'.join(str(dim) for dim in dims)

    filename = os.path.join(
        out_dir, varName + '_' + shape_tag + '_' + type_str + '_' + '.npbin')
    var.tofile(filename)

    print ('write ' + varName +' done!')

In [None]:
# Write the four split arrays to disk as raw binaries, features first, then labels.
toNpBin(var=X_train, varName='traindata')
toNpBin(var=X_test, varName='testdata')
toNpBin(var=Y_train, varName='trainlabel')
toNpBin(var=Y_test, varName='testlabel')