In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import os
from keras.models import Sequential , load_model
from keras.layers import LSTM , Dense , Embedding , Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
%matplotlib inline

In [2]:
dataset = pd.read_csv(r'Tweets.csv')
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569167658698067970,negative,1.0,Flight Attendant Complaints,0.7004,US Airways,,smash_tag,,0,"@USAirways ""Owen F"" at DCA Gate 42: #rude.","[38.85580899, -77.04174148]",2015-02-21 08:11:59 -0800,"Buffalo, NY",Eastern Time (US & Canada)
1,569920618843521024,negative,1.0,Customer Service Issue,1.0,American,,ctj823,,0,@AmericanAir How clueless is AA. Been waiting ...,,2015-02-23 10:03:58 -0800,,
2,570053736950685696,positive,0.6594,,,Southwest,,Q2wo,,0,@SouthwestAir thanks!,,2015-02-23 18:52:56 -0800,,Pacific Time (US & Canada)
3,569711632991047680,negative,1.0,Late Flight,0.6772,United,,scottychadwick,,0,@united yea they been booked on 10 next avalib...,,2015-02-22 20:13:32 -0800,,Eastern Time (US & Canada)
4,569681184898162689,negative,1.0,Can't Tell,0.6742,Southwest,,DobarNik,,0,"@SouthwestAir no you are not, you just care ab...",,2015-02-22 18:12:33 -0800,,


In [3]:
dataset = dataset[['airline_sentiment','text']]
Newdataset = dataset
dataset.head()

Unnamed: 0,airline_sentiment,text
0,negative,"@USAirways ""Owen F"" at DCA Gate 42: #rude."
1,negative,@AmericanAir How clueless is AA. Been waiting ...
2,positive,@SouthwestAir thanks!
3,negative,@united yea they been booked on 10 next avalib...
4,negative,"@SouthwestAir no you are not, you just care ab..."


In [4]:
dataset['text'].apply(lambda x: x.lower())

dataset['text'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]',"",x))
dataset['text'].tail()

14635    @AmericanAir would you like any additional det...
14636    Just got off the phone @AmericanAir customer s...
14637    @United flight 4465 almost half an hour at the...
14638    .@SouthwestAir received an error online and ha...
14639    @united Change made in just over 3 hours. For ...
Name: text, dtype: object

In [5]:
tokenizer = Tokenizer(num_words=5000, split=" ")
tokenizer.fit_on_texts(dataset['text'].values)

X =tokenizer.texts_to_sequences(dataset['text'].values)
X = pad_sequences(X)
X[:7]
# for word in tokenizer.word_index:
 #   print (word)

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   12,  876,   28,  316,   82, 1759,
         284],
       [   0,    0,    0,    0,    0,    0,   13,   65, 4706,   14,  190,
          50,  110,    1,  566,    7,   48,  139,  457,   84,    4,  268,
          35,    4,   40,   85,    8,   59,   50,    9,   66,   42,    7,
         960],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   16,
          39],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           5, 1656,   57,   50,  226,    9,  211,  179,   64,  230,  567,
        1573,   10,   63,   51, 1850,   26,   67,  638,  121,   18,  175,
         287],
       [   0,    0,    0,    0,    0,    0,    0,   

In [6]:
X.shape

(14640, 34)

In [7]:
model = Sequential()
model.add(Embedding(5000, 256 ,input_length = X.shape[1]))
model.add(Dropout(0.3))
model.add(LSTM(256 ,  return_sequences=True , dropout = 0.3 , recurrent_dropout = 0.2))
model.add(LSTM(256 , dropout = 0.3 , recurrent_dropout = 0.2))
model.add(Dense(3 , activation = 'softmax'))  # 3 because we have 3 output Negetive, Positive, Natural

In [8]:
model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam', metrics = 'accuracy')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 34, 256)           1280000   
                                                                 
 dropout (Dropout)           (None, 34, 256)           0         
                                                                 
 lstm (LSTM)                 (None, 34, 256)           525312    
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 3)                 771       
                                                                 
Total params: 2331395 (8.89 MB)
Trainable params: 2331395 (8.89 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
y = pd.get_dummies(dataset['airline_sentiment']).values
[print(dataset['airline_sentiment'][i], y[i]) for i in range(0,7)]

negative [1 0 0]
negative [1 0 0]
positive [0 0 1]
negative [1 0 0]
negative [1 0 0]
negative [1 0 0]
positive [0 0 1]


[None, None, None, None, None, None, None]

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size = 0.2 , random_state =0)

In [11]:
batch_size = 32
epochs = 10
if (not os.path.exists(r'./Model_Sentiment.h5')) :
    res = model.fit(X_train,y_train , epochs = epochs , batch_size = batch_size , verbose = 2)  #verbose if we want to print output of result
    model.save(r'./Model_Sentiment.h5')

Epoch 1/10
366/366 - 106s - loss: 0.6320 - accuracy: 0.7400 - 106s/epoch - 290ms/step
Epoch 2/10
366/366 - 105s - loss: 0.4204 - accuracy: 0.8415 - 105s/epoch - 288ms/step
Epoch 3/10
366/366 - 109s - loss: 0.3359 - accuracy: 0.8747 - 109s/epoch - 298ms/step
Epoch 4/10
366/366 - 108s - loss: 0.2746 - accuracy: 0.8966 - 108s/epoch - 296ms/step
Epoch 5/10
366/366 - 110s - loss: 0.2212 - accuracy: 0.9169 - 110s/epoch - 302ms/step
Epoch 6/10
366/366 - 106s - loss: 0.1869 - accuracy: 0.9311 - 106s/epoch - 288ms/step
Epoch 7/10
366/366 - 109s - loss: 0.1528 - accuracy: 0.9442 - 109s/epoch - 299ms/step
Epoch 8/10
366/366 - 106s - loss: 0.1309 - accuracy: 0.9548 - 106s/epoch - 291ms/step
Epoch 9/10
366/366 - 107s - loss: 0.1086 - accuracy: 0.9612 - 107s/epoch - 292ms/step
Epoch 10/10
366/366 - 108s - loss: 0.1044 - accuracy: 0.9631 - 108s/epoch - 296ms/step


  saving_api.save_model(


In [12]:
model_res = load_model(r'.//Model_Sentiment.h5')

In [13]:
loss, acc = model_res.evaluate(X, y, verbose=0)
print( 'loss: %f, acc: %f' % (loss, acc*100))

loss: 0.238294, acc: 93.811476


In [14]:
predictions = model.predict(X_test)

[print(dataset['text'][i], predictions[i] , y_test[i]) for i in range(0,9)]

@USAirways "Owen F" at DCA Gate 42: #rude. [0.7681248  0.22873592 0.00313932] [1 0 0]
@AmericanAir How clueless is AA. Been waiting to hear for 2.5 weeks about a refund from a Cancelled Flightled flight &amp; been on hold now for 1hr 49min [0.59498936 0.39127475 0.01373592] [1 0 0]
@SouthwestAir thanks! [0.13154447 0.36999533 0.49846023] [0 1 0]
@united yea they been booked on 10 next avalible flights since sat 7am. And when time comes no plane 2nd day of work missed #hotelliving [9.9962258e-01 7.1517577e-05 3.0585178e-04] [1 0 0]
@SouthwestAir no you are not, you just care about mighty dollar. [0.9441622  0.0524125  0.00342533] [0 1 0]
@united not even mentioning how rude the customer service was to us. As a business owner, I'd be mortified if my employees acted as yours [9.9977666e-01 1.8758902e-04 3.5803016e-05] [1 0 0]
@SouthwestAir Ahah😃💕🎵 That is why
I love SW✈❗(^^)❤ [9.9995363e-01 3.3524160e-05 1.2903728e-05] [1 0 0]
@united flight 3870 to Newark, stuck in the runway. About to m

[None, None, None, None, None, None, None, None, None]

In [15]:
predictions.shape

(2928, 3)

In [16]:
# A few random samples
samples_to_predict = []
newsamples = ["@Thanks Mo, I am glad that I used your services"]


In [17]:
sample = pd.Series(data=newsamples)
sample

0    @Thanks Mo, I am glad that I used your services
dtype: object

In [18]:
#sample = pd.Series(newsamples)
pd = Newdataset['text'].append(sample , ignore_index=True)
pd

  pd = Newdataset['text'].append(sample , ignore_index=True)


0               @USAirways "Owen F" at DCA Gate 42: #rude.
1        @AmericanAir How clueless is AA. Been waiting ...
2                                    @SouthwestAir thanks!
3        @united yea they been booked on 10 next avalib...
4        @SouthwestAir no you are not, you just care ab...
                               ...                        
14636    Just got off the phone @AmericanAir customer s...
14637    @United flight 4465 almost half an hour at the...
14638    .@SouthwestAir received an error online and ha...
14639    @united Change made in just over 3 hours. For ...
14640      @Thanks Mo, I am glad that I used your services
Length: 14641, dtype: object

In [19]:
pd.apply(lambda x: x.lower())
pd.apply(lambda x: re.sub('[^a-zA-Z0-9\s]'," ",x))
Own= pd.values
Own

array(['@USAirways "Owen F" at DCA Gate 42: #rude.',
       '@AmericanAir How clueless is AA. Been waiting to hear for 2.5 weeks about a refund from a Cancelled Flightled flight &amp; been on hold now for 1hr 49min',
       '@SouthwestAir thanks!', ...,
       '.@SouthwestAir received an error online and have been on hold for over 1.5 hours. Completely unacceptable.',
       '@united Change made in just over 3 hours. For something that should have taken seconds online, I am not thrilled. Loved the agent, though.',
       '@Thanks Mo, I am glad that I used your services'], dtype=object)

In [20]:
tokenizer = Tokenizer(num_words=5000, split=" ")
tokenizer.fit_on_texts(Own)

X =tokenizer.texts_to_sequences(Own)
X = pad_sequences(X)
#NewData = X[-1]
X

array([[   0,    0,    0, ...,   82, 1759,  284],
       [   0,    0,    0, ...,   42,    7,  960],
       [   0,    0,    0, ...,    0,   16,   39],
       ...,
       [   0,    0,    0, ...,   60,  783,  409],
       [   0,    0,    0, ...,    2,  157,  381],
       [   0,    0,    0, ...,  527,   21,  850]], dtype=int32)

In [21]:
NewData = X[-1].reshape(1,34)

In [22]:
NewData

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  39,
          3,  96, 611,  27,   3, 527,  21, 850]], dtype=int32)

In [23]:
prediction = model.predict(NewData)
print(prediction) 

[[0.03900107 0.05238442 0.9086145 ]]


In [24]:
prediction.shape

(1, 3)