In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D
from keras.layers import Flatten, Conv1D, MaxPooling1D, Dropout, SpatialDropout1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn import metrics
from google.colab import files

In [1]:
# Upload the CSV file
from google.colab import files
uploaded = files.upload()

In [None]:
data = pd.read_csv("Tweets.csv")

In [None]:
data = data[['text', 'airline_sentiment']]
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [None]:
#excluding neutral tweets to be (tow-class) task
data = data[data['airline_sentiment'] != 'neutral']

# Tweets Pre-processing 

In [None]:
def clean_train_data(x):
    text = x
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # remove square brackets
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    text = re.sub(r'\s+', ' ', text, flags=re.I) # Substituting multiple spaces with single space
    text = re.sub('\n', '', text)
    return text

In [None]:
data['text'] = data.text.apply(lambda x : clean_train_data(x))
data.head()

Unnamed: 0,text,airline_sentiment
0,virginamerica what dhepburn said,neutral
1,virginamerica plus youve added commercials to ...,positive
2,virginamerica i didnt today must mean i need t...,neutral
3,virginamerica its really aggressive to blast o...,negative
4,virginamerica and its a really big bad thing a...,negative


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Remove stopwords 
stop = set (stopwords.words ("english"))
def remove_stopwords (text): 
  text = [word.lower () for word in text.split() if word.lower() not in stop]
  return " ".join(text)

In [None]:
data["text"] = data["text"].map(remove_stopwords)

In [None]:
data["text"]

1        virginamerica plus youve added commercials exp...
3        virginamerica really aggressive blast obnoxiou...
4                       virginamerica really big bad thing
5        virginamerica seriously would pay flight seats...
6        virginamerica yes nearly every time fly vx ear...
                               ...                        
14633    americanair flight cancelled flightled leaving...
14634                         americanair right cue delays
14635       americanair thank got different flight chicago
14636    americanair leaving minutes late flight warnin...
14638    americanair money change flight dont answer ph...
Name: text, Length: 11541, dtype: object

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt') # one time execution

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
dataY = pd.get_dummies(data['airline_sentiment']).values
dX_train, dX_test, dY_train, dY_test = train_test_split(data['text'],dataY, test_size = 0.3, random_state = 42)

In [None]:
num_words=None
tokenizer = Tokenizer(num_words, split=' ') 
tokenizer.fit_on_texts(data['text'].values) 
#vocab_length = len(tokenizer.word_index) + 1 
X_train = tokenizer.texts_to_sequences(dX_train.values) 
X_train = pad_sequences(X_train)
X_test = tokenizer.texts_to_sequences(dX_test.values) 
X_test = pad_sequences(X_test)

In [None]:
#X_train

In [None]:
#to show how each word assigns to value after tokenizer 
word_index = tokenizer.word_index

In [None]:
# download pretrained GloVe word embeddings
! wget http://nlp.stanford.edu/data/glove.6B.zip

--2022-08-09 17:46:05--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-08-09 17:46:06--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-08-09 17:46:06--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
! unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
embeddings_dictionary = dict()
embedding_dim = 300
glove_file = open('glove.6B.300d.txt')
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [None]:
vocab_len = len(word_index) + 1 
embedding_matrix = np.zeros((vocab_len, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
from keras.metrics import Precision, Recall
from keras.callbacks import Callback,ModelCheckpoint
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K

In [None]:
def f1 (y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

# Building CNN model

In [None]:
#createmodel 
max_len= 32
model = Sequential()
model.add(Embedding(vocab_len, 300, weights= [embedding_matrix],input_length=None, trainable=False))
model.add(Dropout(0.5))

model.add(Conv1D(128,kernel_size=3,padding='same',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(2,activation='softmax'))


model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', Precision(), Recall(), f1])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 300)         3393000   
                                                                 
 dropout_4 (Dropout)         (None, None, 300)         0         
                                                                 
 conv1d_2 (Conv1D)           (None, None, 128)         115328    
                                                                 
 global_max_pooling1d_2 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                      

In [None]:
#model training 
batch_size = 32
model.fit(X_train, dY_train, epochs = 10, batch_size=batch_size, verbose = 2)

In [None]:
#model testing
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = dY_test[-validation_size:]
X_test = X_test[:-validation_size]
dY_test = dY_test[:-validation_size]
score,acc, precision, recall, f1 = model.evaluate(X_test, dY_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))
print("precision: %.2f" % (precision))
print("recall: %.2f" % (recall))
print("f1: %.2f" % (f1))

15/15 - 0s - loss: 0.2020 - accuracy: 0.9071 - precision_2: 0.9071 - recall_2: 0.9071 - f1: 0.9057 - 56ms/epoch - 4ms/step
score: 0.20
acc: 0.91
precision: 0.91
recall: 0.91
f1: 0.91


In [None]:
# Predictions
predictions = model.predict(X_test)
predictions = np.round(predictions).astype(int)
predictions=np.argmax(predictions, axis=1, out=None)



In [None]:
len(dX_test)

3809

In [None]:
df_test = pd.DataFrame(dX_test[:-validation_size]).reindex()
df_test["target"] = predictions

In [None]:
len(df_test)

2309

In [None]:
df_test.shape

(2309, 2)

In [None]:
# Creating submission file 
#submission = pd.DataFrame( data['text'])
df_test.to_csv('Tweets.csv', index=False)
df_test.head(20)

Unnamed: 0,text,target
11825,usairways charged flight cancelled flightled u...,0
8105,jetblue great flight great view,1
1279,united theyre actually gate agent rude standin...,0
14343,americanair worries called back hrs late fligh...,0
3829,united thank one months ago none weird dont cl...,0
2590,united brothers luggage lost copa airlines fli...,0
11912,americanair great thanks followed,1
11544,usairways work hard making sure things flow sm...,0
3864,united told lostyou guys dont know,0
7940,jetblue bein calling jetblue respond leave num...,0
