In [2]:
#import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import re
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D
from keras.layers import Flatten, Conv1D, MaxPooling1D, Dropout, Bidirectional, GRU, SpatialDropout1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn import metrics
from google.colab import files
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Upload the CSV file through the Colab upload widget (`files` is google.colab.files).
# The cell output shows the upload being saved as "Tweets.csv", the name the next cell reads.
uploaded = files.upload()

Saving Tweets.csv to Tweets.csv


In [14]:
# Load the uploaded tweets file into a DataFrame.
data = pd.read_csv("Tweets.csv")

In [15]:
# Narrow the frame to the three columns of interest.
data = data.loc[:, ['text', 'airline_sentiment', 'airline']]

In [23]:
# Exclude neutral tweets. This must run before the labels are one-hot encoded
# below, otherwise get_dummies yields a third column and no longer matches
# the 2-unit output layer of the classifier.
# .copy() makes the filtered result an independent frame, so the later
# column assignments (data['text'] = ...) do not raise SettingWithCopyWarning.
data = data[data['airline_sentiment'] != 'neutral'].copy()

# Tweets Pre-processing  

In [17]:
# Noise removal
def clean_train_data(x):
    """Normalise one raw tweet string before tokenization.

    Steps, in order: lowercase, drop ``[...]`` spans, drop every character
    that is neither a word character nor whitespace, drop tokens containing
    a digit, collapse each whitespace run (newlines included) to one space.

    Args:
        x: tweet text (str).

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    text = x.lower()
    # Raw strings: '\[', '\w' and '\d' are invalid escapes in a normal string
    # literal and emit a warning on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)   # remove square-bracketed spans
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove words containing numbers
    # \s matches '\n' too, so no separate newline removal is needed afterwards.
    text = re.sub(r'\s+', ' ', text)
    return text

In [18]:
# Replace every tweet with its cleaned version.
data['text'] = data['text'].apply(clean_train_data)

In [19]:
# Remove stopwords
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Return `text` lower-cased with every token found in `stop` dropped.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

In [20]:
# Strip stopwords from every cleaned tweet.
data["text"] = data["text"].apply(remove_stopwords)

In [21]:
# Input/output determination and dataset splitting.
# One-hot encode the sentiment label (one column per distinct label value).
dataY = pd.get_dummies(data['airline_sentiment']).to_numpy()
dX_train, dX_test, dY_train, dY_test = train_test_split(
    data['text'],
    dataY,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Tweet word tokenization
num_words = None  # no cap on the vocabulary size
tokenizer = Tokenizer(num_words, split=' ')
# The vocabulary is built from every tweet (train and test); the embedding
# layer is frozen, so this only decides which pretrained vectors are looked up.
tokenizer.fit_on_texts(data['text'].values)
X_train = tokenizer.texts_to_sequences(dX_train.values)
X_train = pad_sequences(X_train)
# Pad the test sequences to the *training* length. Padding each split on its
# own gives the two arrays different widths (each is padded to its own longest
# tweet), so the model would see a different amount of padding at inference.
# A test tweet longer than max_len is truncated (pad_sequences default).
max_len = X_train.shape[1]
X_test = tokenizer.texts_to_sequences(dX_test.values)
X_test = pad_sequences(X_test, maxlen=max_len)

In [28]:
# word -> integer index mapping built by the tokenizer; used below to size and fill the embedding matrix.
word_index = tokenizer.word_index 

**GloVe** 

In [30]:
# Download the pretrained GloVe word embeddings (about 822 MB).
# -nc (no-clobber) skips the download when glove.6B.zip is already present,
# so re-running the cell neither re-downloads nor creates glove.6B.zip.1.
! wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2023-02-12 22:32:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-02-12 22:32:08--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-02-12 22:32:08--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [31]:
# Extract the embedding files. -n never overwrites files that already exist,
# so re-running the cell does not stop at unzip's interactive overwrite prompt.
# The archive is named explicitly so a second matching .zip cannot be
# misread by unzip as a member filter.
! unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [32]:
# Load the 300-dimensional GloVe vectors into a {word: float32 vector} dictionary.
embeddings_dictionary = dict()
embedding_dim = 300
# `with` closes the file even if a line fails to parse; the encoding is given
# explicitly so the read does not depend on the platform's default encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line is: <word> <v1> <v2> ... separated by whitespace.
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [33]:
# Assign each word to its corresponding GloVe word vector:
# row i of the matrix holds the vector of the word whose tokenizer index is i;
# words without a GloVe entry keep an all-zero row (row 0 is never assigned).
vocab_len = len(word_index) + 1
embedding_matrix = np.zeros((vocab_len, embedding_dim))
for word, index in word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]

# Sentiment Classifier 

In [35]:
# Creating the CNN-LSTM classifier model
embed_dim = 300  # must match the width of embedding_matrix (300-d GloVe vectors)
lstm_out = 300   # units per LSTM direction (the bidirectional output is 2 * lstm_out)
model = Sequential()
# Frozen embedding layer initialised from the pretrained GloVe matrix.
model.add(Embedding(vocab_len, embed_dim, input_length=None, weights=[embedding_matrix], trainable=False))
model.add(SpatialDropout1D(0.5))
# 'same' padding keeps the sequence length unchanged by the convolution.
model.add(Conv1D(filters=16, kernel_size=2, padding='same', activation='relu', use_bias=True))
model.add(MaxPooling1D(pool_size=4, strides=1, data_format="channels_last"))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)))
# The labels are one-hot over two mutually exclusive classes, so the output
# is a softmax distribution trained with categorical cross-entropy. Two
# independent sigmoids with binary cross-entropy can score both classes
# above (or below) 0.5 for the same tweet.
model.add(Dense(2, activation='softmax'))
# Track accuracy so the training log reports more than the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 300)         3393000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, None, 300)        0         
 lDropout1D)                                                     
                                                                 
 conv1d_1 (Conv1D)           (None, None, 16)          9616      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 16)         0         
 1D)                                                             
                                                                 
 bidirectional_1 (Bidirectio  (None, 600)              760800    
 nal)                                                            
                                                      

In [36]:
# Training the model (10 epochs here; the number of epochs can be increased).
batch_size = 32
model.fit(x=X_train, y=dY_train, batch_size=batch_size, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2010426100>

In [37]:
# Hold out the last `validation_size` test rows as a validation set; the
# remaining rows stay in X_test / dY_test and are the ones predicted below.
validation_size = 1500
# The slices overwrite X_test / dY_test, so executing this cell twice would
# silently drop another `validation_size` rows and misalign the predictions
# with dX_test[:-validation_size]. dX_test is never truncated, so comparing
# lengths tells whether the split has already been applied.
if len(X_test) == len(dX_test):
    X_validate = X_test[-validation_size:]
    Y_validate = dY_test[-validation_size:]
    X_test = X_test[:-validation_size]
    dY_test = dY_test[:-validation_size]

In [None]:
# Per-class scores -> predicted class index (column position in the one-hot labels).
# argmax is taken on the raw scores: rounding first turns e.g. [0.40, 0.45]
# into [0, 0] and [0.70, 0.90] into [1, 1], and argmax then returns class 0
# for both although the second column scored higher.
predictions = model.predict(X_test)
predictions = np.argmax(predictions, axis=1)



In [None]:
# Pair the text of each predicted tweet (original row index kept) with its predicted class.
df_test = dX_test.iloc[:-validation_size].to_frame().assign(target=predictions)

In [None]:
# Creating submission file: write the text/target table to TweetsSub.csv
# (index=False leaves the row index out of the file), then preview the first rows.
df_test.to_csv('TweetsSub.csv', index=False)
df_test.head()

Unnamed: 0,text,target
4794,southwestair youre early frontrunner best airl...,1
10480,usairways flt ewr cancelled flightled yet flts...,0
8067,jetblue going bdl dca flights yesterday today ...,1
8880,jetblue depart washington dc,1
8292,jetblue probably find ticket,1


# Importing the LSA-based summarizer to summarize the positive and the negative tweets

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the default application credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is rebound here — from this line on it is the PyDrive
# GoogleDrive client, no longer the google.colab.drive module imported above.
drive = GoogleDrive(gauth)

In [None]:
# Handle to the Drive file with this id (the summarizer module fetched in the next cell).
my_module = drive.CreateFile({'id':'1ANyXteyGInSGy4lGbEMcAcbOaqC6YGU4'})

In [None]:
# Save the Drive file locally as sumlsa.py so that `from sumlsa import ...` below can find it.
my_module.GetContentFile('sumlsa.py')

# LSA-based Summarizer

In [None]:
#importing lsa summarizer that we built in other module
from sumlsa import Summarization

In [None]:
# Select the tweets the CNN-LSTM model classified as negative to feed into the
# LSA summarizer. Any other subset of the dataframe can be fed the same way by
# changing the mask.
is_negative = df_test['target'].eq(0)  # negative tweets = 0, positive tweets = 1
neg_delta = df_test.loc[is_negative]

In [None]:
# Tweet texts: the first column of the frame, as a NumPy array.
features = neg_delta.iloc[:, 0].to_numpy()
features

array(['usairways flt ewr cancelled flightled yet flts nyc usairways still flying',
       'united still waiting hear back wallet stolen one planes would appreciate resolution',
       'united yes flight rebooked im losing trust want get anywhere time',
       ...,
       'united son left tablet plane country call easily email contact',
       'americanair usairways us 728feb 21 unprofessional unprepared unsympathetic lacked communication lacked solutions',
       'united told united carry overcarry back refund'], dtype=object)

In [None]:
# LSA configuration and model fitting.
# The Summarization helpers live in sumlsa.py, which is not part of this
# notebook; what they do is inferred from their names and the cell output —
# confirm against that module.
number_of_topics=2  # number of LSA topics passed to the model builder
words=20  # passed to the model builder; each topic printed in the output lists 20 terms
document_list= features
clean_text= Summarization.preprocess_data(document_list)
dict1,doc_term_matrix= Summarization.prepare_corpus(clean_text)
# NOTE(review): this rebinds `model`, which until here held the Keras
# classifier; re-running the prediction cell after this one will fail.
model= Summarization.create_gensim_lsa_model(clean_text,number_of_topics,words)
corpus_lsi = model[doc_term_matrix] # apply LSA: per the output below, one list of (topic id, score) pairs per tweet

[(0, '0.755*"flight" + 0.300*"unit" + 0.216*"usairway" + 0.197*"cancel" + 0.168*"americanair" + 0.132*"get" + 0.128*"hour" + 0.126*"delay" + 0.115*"southwestair" + 0.087*"time" + 0.077*"jetblu" + 0.077*"help" + 0.077*"flightl" + 0.068*"late" + 0.068*"wait" + 0.066*"servic" + 0.064*"hold" + 0.063*"custom" + 0.060*"call" + 0.058*"book"'), (1, '0.863*"unit" + -0.344*"flight" + -0.153*"usairway" + -0.133*"cancel" + -0.092*"americanair" + 0.082*"bag" + -0.072*"southwestair" + 0.064*"get" + 0.062*"time" + 0.057*"servic" + 0.057*"delay" + -0.051*"flightl" + 0.051*"plane" + 0.049*"custom" + 0.047*"gate" + 0.041*"one" + 0.040*"airlin" + 0.040*"im" + 0.039*"fli" + 0.038*"wait"')]


In [None]:
# Show each tweet's per-topic LSA scores next to its text.
for topic_scores, tweet in zip(corpus_lsi, document_list):
    print(topic_scores, tweet)

[(0, 0.8359288933601552), (1, -0.39660750673316647)] usairways flt ewr cancelled flightled yet flts nyc usairways still flying
[(0, 0.5869940079869354), (1, 1.0954561637758786)] united still waiting hear back wallet stolen one planes would appreciate resolution
[(0, 1.3946836151836444), (1, 0.71521788355045)] united yes flight rebooked im losing trust want get anywhere time
[(0, 0.4858099710965537), (1, 1.0969680159688533)] united 100 sure however ticket included one checked bag therefore charge extra completely unanticipated
[(0, 0.5728122152026683), (1, -0.060257779923452964)] usairways ive hold change date ticket 3 hours someone please assist unacceptable
[(0, 0.3745815899726455), (1, -0.0662363639344741)] usairways oh well ill get cancun eventually
[(0, 1.1285316853727347), (1, -0.4038368942671172)] usairways flight 837 passengers stuck plane philly gangway cant hear pilot
[(0, 0.5517414824769731), (1, 0.06233491626521308)] americanair already waited hour wanted get home
[(0, 0.186

In [None]:
# Build, for every topic, a list of (tweet position, |score|) pairs and sort
# each list in descending order of Summarization.takenext (presumably the
# score element of the pair — confirm in sumlsa.py).
# One bucket per topic: sized from number_of_topics rather than a literal 2,
# so raising the topic count cannot cause an IndexError on vecsSort[topic_id].
vecsSort = [[] for _ in range(number_of_topics)]
for i, docv in enumerate(corpus_lsi):
    for topic_id, score in docv:
        # abs(): a strongly negative score is as relevant to the topic as a positive one
        vecsSort[topic_id].append((i, abs(score)))
vecsSort = [sorted(bucket, key=Summarization.takenext, reverse=True) for bucket in vecsSort]

In [None]:
# Pick the tweet positions that make up the summary.
# NOTE(review): the arguments look like (number of sentences to select,
# number of topics, per-topic rankings) — the list printed in the next cell has
# 10 entries; confirm against sumlsa.py. The literal 2 presumably has to stay
# in sync with number_of_topics.
topSentences = Summarization.selectTopSent (10, 2, vecsSort)

In [None]:
# Show the selected positions as returned, sort them in place (ascending), and show them again.
print(topSentences)
topSentences.sort()
print(topSentences)

[916, 1740, 361, 1888, 1557, 1178, 82, 1473, 1988, 1429]
[82, 361, 916, 1178, 1429, 1473, 1557, 1740, 1888, 1988]


In [None]:
# Assemble the full text and the extractive summary: the summary keeps, in
# document order, the tweets whose position was selected in topSentences.
doc = " ".join(document_list)
summary = " ".join(
    sentence
    for position, sentence in enumerate(document_list)
    if position in topSentences
)
print("\n")
print("Original:")
print(doc)
print("Summary:")
print(summary)



Original:
Summary:
united 2nd time row ive charged 100s plane ticket united shouldnt check cc everytime united site errored last step changing award cant even pull reservation 60 minute wait time thanks united united cant believe united cant find someone simply check seat back missing passport loved united debating choice united switched fly united delta past two trips disappointed ua4646 223 delayed 3 hours united thanks advice helpful cant believe guys actually charge people fly united united seems like one united help lots fingers pointed zero people stepping least gate oreos united yes amp theyre unsure would comethis never happened airline first time united disappointed united final destination booked united quantas united yet contacted auckland unaware case id number united 1591 wifi broken plane outage 1618 friday southwest get right every plane united cant united horrible attitude staff delay level service respect one expects united
