# Importing libraries

In [1]:
##importing libraries

##data manipulation
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import re
import string

##methods and stopwords text preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from emot.emo_unicode import UNICODE_EMOJI # For emojis
import pickle

##ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
##Creating stopwords set
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\manah\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Loading file

In [3]:
def load_dataset(filepath):
    """
    reads the CSV file to return a 
    dataframe with specified column names
    """
    df = pd.read_csv(filepath)
    return df

df = load_dataset("airline_sentiment_analysis.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [4]:
def del_unwanted_cols(df, cols):
    """
    Deletes unwanted columns from dataframe
    """
    for col in cols:
        df.drop(col, axis=1, inplace=True)
    return df

df = del_unwanted_cols(df, ['Unnamed: 0'])
df.head()

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [5]:
def convert_categorical(df, col):
    """
    Convert positive to 1 and negative to 0
    """
    dummy = pd.get_dummies(df[col])
    df2 = pd.concat((df,dummy), axis=1)
    df2.drop(col, axis=1, inplace=True)
    df2.drop("negative", axis=1, inplace=True)
    df2.rename({'positive':'sentiment'},axis=1, inplace=True)
    return df2

df = convert_categorical(df, 'airline_sentiment')
df.head()

Unnamed: 0,text,sentiment
0,@VirginAmerica plus you've added commercials t...,1
1,@VirginAmerica it's really aggressive to blast...,0
2,@VirginAmerica and it's a really big bad thing...,0
3,@VirginAmerica seriously would pay $30 a fligh...,0
4,"@VirginAmerica yes, nearly every time I fly VX...",1


# Preprocessing text

In [6]:
UNICODE_EMO = UNICODE_EMOJI
# Function for converting emojis into word
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return text
# Example
text1 = "Hilarious 😂. The feeling of making a sale 😎, The feeling of actually fulfilling orders 😒"
convert_emojis(text1)

'Hilarious face_with_tears_of_joy. The feeling of making a sale smiling_face_with_sunglasses, The feeling of actually fulfilling orders unamused_face'

In [7]:
def preprocess_message(message):
    """
    Runs a set of transformational steps to preprocess
    the text of the message
    """
    
    #Casing -> convert text to lowercase
    message = message.lower()
    
    #Denoising -> remove urls
    message = re.sub(r"http\S+|www\S+|https\S+", "", message, flags=re.MULTILINE)
    
    #Denoising -> remove emojis
    message = convert_emojis(message)
    
    #Denoising -> remove punctuations
    message = message.translate(str.maketrans("", "", string.punctuation))
    
    #Denoising -> remove @ and # from message
    message = re.sub(r"@[a-z0-9]+", "", message)
    message = re.sub(r"#", "", message)
    
    #Denoising -> remove RT
    message = re.sub(r"RT[\s]+", "", message)
    
    #Tokenization and stop words removal
    message_tokens = word_tokenize(message)
    filtered_words = [word for word in message_tokens if word not in stop_words]
    
    #Text normalization -> stemming
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    
    #Text normalization -> lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(lemma_words)

#preprocess_message("Hey there, how are you preparing for exams?")

In [8]:
df['text'] = df['text'].apply(preprocess_message)
df.head()

Unnamed: 0,text,sentiment
0,virginamerica plu youv ad commerci experi tacki,1
1,virginamerica realli aggress blast obnoxi ente...,0
2,virginamerica realli big bad thing,0
3,virginamerica serious would pay 30 flight seat...,0
4,virginamerica ye nearli everi time fli vx “ ea...,1


# Train test split

In [9]:
X=df['text'].values
Y=df['sentiment'].values
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.3)

# TF-IDF Vectorizer

In [33]:
# TF-IDF Vectorizer
vec = TfidfVectorizer()
vec.fit(X_train)
pickle.dump(vec, open("vectorizer.pickle", "wb"))
x_train=vec.transform(X_train)
x_test=vec.transform(X_test)

# Logistic Regression

In [34]:
# Logistic Regression

# fit the training dataset
lr = LogisticRegression()
lr.fit(x_train, Y_train)

# predict the sentiment on validation dataset
predictions_LR = lr.predict(x_test)

# Use accuracy_score function to get the accuracy
print("Logistic Regression Accuracy Score -> ",accuracy_score(predictions_LR, Y_test)*100)

Logistic Regression Accuracy Score ->  91.0771007796708


# Naive Bayes Algorithm 

In [35]:
#Naive Bayes Classifier Algorithm

# fit the training dataset on the NB classifier
Naive = MultinomialNB()
Naive.fit(x_train, Y_train)

# predict the sentiment on validation dataset
predictions_NB = Naive.predict(x_test)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Y_test)*100)

Naive Bayes Accuracy Score ->  84.31995379728559


# Support Vector Machine

In [36]:
# Classifier - Algorithm - SVM

# fit the training dataset on the classifier
SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_train, Y_train)

# predict the sentiment on validation dataset
predictions_SVM = SVM.predict(x_test)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Y_test)*100)

with open('model_pickle','wb') as f:
    pickle.dump(SVM,f)

SVM Accuracy Score ->  92.00115506786024


In [37]:
with open('model_pickle','rb') as f:
    mp = pickle.load(f)

In [41]:
msg = "Hey airline attendant, how are you? Nice to meet you! 😃"
msg = preprocess_message(msg)
msg = [msg]
vectorizer = pickle.load(open("vectorizer.pickle", "rb"))
msg = vectorizer.transform(msg)
#msg = msg[0]
mp.predict(msg)[0]

1

# Training own embedding

In [15]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

In [16]:
x_train = tokenizer.texts_to_sequences(X_train)
x_test = tokenizer.texts_to_sequences(X_test)

In [17]:
vocab = len(tokenizer.word_index) + 1
from keras_preprocessing.sequence import pad_sequences
maxlen = 100
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D, Flatten
from tensorflow.keras.optimizers import Adam
emb_dim=100
model= Sequential()
model.add(Embedding(input_dim=vocab, output_dim=emb_dim, input_length=maxlen))
model.add(MaxPool1D())
model.add(Dense(16,activation="relu"))
model.add(Dense(16,activation="relu"))
model.add(Dense(1, activation='sigmoid'))
model.add(Flatten())
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          885200    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 50, 100)          0         
 )                                                               
                                                                 
 dense (Dense)               (None, 50, 16)            1616      
                                                                 
 dense_1 (Dense)             (None, 50, 16)            272       
                                                                 
 dense_2 (Dense)             (None, 50, 1)             17        
                                                                 
 flatten (Flatten)           (None, 50)                0         
                                                        

In [20]:
history = model.fit(x_train, Y_train,epochs=35,verbose=True,batch_size=16)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [82]:
test_score=model.evaluate(x_test,Y_test)
test_score



[0.5537469387054443, 0.1582442969083786]

In [84]:
#It has only 15.82% accuracy