In [2]:
import tensorflow

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [4]:
df = pd.read_csv('tweets.csv')

In [5]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
X = df['text']
y = df['airline_sentiment']

In [6]:
X

0                      @VirginAmerica What @dhepburn said.
1        @VirginAmerica plus you've added commercials t...
2        @VirginAmerica I didn't today... Must mean I n...
3        @VirginAmerica it's really aggressive to blast...
4        @VirginAmerica and it's a really big bad thing...
                               ...                        
14635    @AmericanAir thank you we got on a different f...
14636    @AmericanAir leaving over 20 minutes Late Flig...
14637    @AmericanAir Please bring American Airlines to...
14638    @AmericanAir you have my money, you change my ...
14639    @AmericanAir we have 8 ppl so we need 2 know h...
Name: text, Length: 14640, dtype: object

In [7]:

y

0         neutral
1        positive
2         neutral
3        negative
4        negative
           ...   
14635    positive
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14640, dtype: object

In [30]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [31]:
y

array([1, 2, 1, ..., 1, 0, 1])

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [11]:
tokenizer.word_index

{'to': 1,
 'the': 2,
 'i': 3,
 'a': 4,
 'united': 5,
 'you': 6,
 'for': 7,
 'flight': 8,
 'on': 9,
 'and': 10,
 'my': 11,
 'usairways': 12,
 'americanair': 13,
 'is': 14,
 'in': 15,
 'southwestair': 16,
 'jetblue': 17,
 'of': 18,
 'me': 19,
 'it': 20,
 'your': 21,
 'have': 22,
 'was': 23,
 'not': 24,
 'with': 25,
 'no': 26,
 'that': 27,
 'at': 28,
 'this': 29,
 'get': 30,
 'but': 31,
 't': 32,
 'co': 33,
 'be': 34,
 'from': 35,
 'http': 36,
 'can': 37,
 'are': 38,
 'thanks': 39,
 'cancelled': 40,
 'we': 41,
 'now': 42,
 'an': 43,
 'just': 44,
 'service': 45,
 'do': 46,
 'so': 47,
 '2': 48,
 'help': 49,
 'been': 50,
 'time': 51,
 'will': 52,
 'customer': 53,
 'up': 54,
 'out': 55,
 'our': 56,
 'they': 57,
 "i'm": 58,
 'amp': 59,
 'hours': 60,
 'us': 61,
 'what': 62,
 'when': 63,
 'flights': 64,
 'how': 65,
 'hold': 66,
 'plane': 67,
 'if': 68,
 'all': 69,
 'why': 70,
 'thank': 71,
 'still': 72,
 'there': 73,
 'one': 74,
 'please': 75,
 'need': 76,
 'would': 77,
 'delayed': 78,
 "can't":

In [12]:
len(tokenizer.word_index)

15768

In [13]:
input_sequence = tokenizer.texts_to_sequences(X)

In [14]:
input_sequence

[[81, 62, 6686, 226],
 [81, 558, 590, 1159, 2536, 1, 2, 201, 6687],
 [81, 3, 207, 102, 805, 591, 3, 76, 1, 156, 150, 193],
 [81,
  89,
  136,
  3792,
  1,
  4706,
  4707,
  1009,
  15,
  21,
  6688,
  3793,
  59,
  57,
  22,
  503,
  2798],
 [81, 10, 89, 4, 136, 476, 214, 487, 84, 20],
 [81,
  439,
  77,
  287,
  221,
  4,
  8,
  7,
  199,
  27,
  207,
  22,
  29,
  2114,
  89,
  136,
  2,
  118,
  214,
  487,
  84,
  130,
  1850],
 [81, 171, 1483, 315, 51, 3, 108, 2294, 29, 6689, 6690, 2295, 133, 462],
 [81,
  136,
  288,
  4,
  3197,
  1759,
  7,
  3794,
  338,
  3795,
  6691,
  73,
  576,
  32,
  33,
  6692],
 [81, 236, 3, 6693, 42, 3, 46, 1123],
 [81, 20, 23, 364, 10, 577, 43, 91, 358, 238, 175, 140, 1, 19],
 [81,
  122,
  6,
  112,
  27,
  2537,
  14,
  2,
  618,
  2115,
  785,
  18,
  1484,
  1760,
  2538,
  211,
  425],
 [81, 3, 743, 101, 698, 6694, 47, 180, 220, 166, 6695, 6696, 1123],
 [81,
  29,
  14,
  578,
  4,
  119,
  656,
  222,
  1485,
  84,
  11,
  638,
  193,
  1,
  1

In [15]:
max_len = max([len(x) for x in input_sequence])
max_len

36

In [None]:

padded_input_sequence = pad_sequences(input_sequence, maxlen = max_len, padding='pre')

In [17]:
padded_input_sequence.shape

(14640, 36)

In [32]:
y.shape

(14640,)

numpy.ndarray

In [33]:
np.unique(y)

array([0, 1, 2])

In [40]:
y.dtype

dtype('int64')

In [41]:
X.dtype

dtype('O')

In [52]:
from tensorflow.keras.layers import Embedding, LSTM, Dense
model = Sequential()
model.add(Embedding(15769, 100,input_length=max_len))
model.add(LSTM(150,return_sequences=True))
model.add(LSTM(120))
model.add(Dense(3,activation='softmax'))

In [53]:
model.compile(loss = 'sparse_categorical_crossentropy',optimizer = 'adam',metrics =  ['accuracy'])

In [55]:
history = model.fit(padded_input_sequence,y, validation_split=0.20, epochs=20)

Epoch 1/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 105ms/step - accuracy: 0.9959 - loss: 0.0125 - val_accuracy: 0.7906 - val_loss: 1.4571
Epoch 2/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 102ms/step - accuracy: 0.9955 - loss: 0.0122 - val_accuracy: 0.7742 - val_loss: 1.6422
Epoch 3/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 102ms/step - accuracy: 0.9955 - loss: 0.0131 - val_accuracy: 0.7824 - val_loss: 1.3331
Epoch 4/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 100ms/step - accuracy: 0.9923 - loss: 0.0218 - val_accuracy: 0.7794 - val_loss: 1.2710
Epoch 5/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 110ms/step - accuracy: 0.9962 - loss: 0.0116 - val_accuracy: 0.7753 - val_loss: 1.4500
Epoch 6/20
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 108ms/step - accuracy: 0.9958 - loss: 0.0111 - val_accuracy: 0.7637 - val_loss: 1.4432
Epoch 7/20

In [56]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example new tweet
new_tweet = ["I absolutely love this product!"]

# Convert text to sequence using the same tokenizer
new_seq = tokenizer.texts_to_sequences(new_tweet)

# Pad the sequence
padded_seq = pad_sequences(new_seq, maxlen=36, padding='pre')


In [57]:
pred_prob = model.predict(padded_seq)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [59]:
pred_prob

array([[9.4066199e-06, 1.3722008e-05, 9.9997687e-01]], dtype=float32)

In [64]:
import numpy as np

predicted_class = int(np.argmax(pred_prob, axis=1))  #[0]  # returns 0, 1, or 2


  predicted_class = int(np.argmax(pred_prob, axis=1))  #[0]  # returns 0, 1, or 2


In [65]:
predicted_class

2

In [66]:
label_map = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

print("Predicted Sentiment:", label_map[predicted_class])


Predicted Sentiment: positive
