In [137]:
## Import necessary libraries.

import numpy as np ## Numpy library for creating and modifying arrays.
import pandas as pd ## Pandas library for reading '.csv' files as dataframes.
from nltk.tokenize import sent_tokenize, word_tokenize ## For sentence,word tokenizing.
import re ## For regular expressions.
import string ## For punctuations.
from nltk.corpus import stopwords ## For stop words
from nltk.stem.porter import PorterStemmer ## For getting root word.
from sklearn.model_selection import train_test_split ## For splitting data into train and validation.
import os ## For connecting to local machine to set path for reading files.
from sklearn.metrics import accuracy_score ## For getting accuracy value.
from sklearn.metrics import confusion_matrix,classification_report ## For confusion matrix and TNR,TPR,accuracy.
from keras.preprocessing.text import Tokenizer ## Tokenize the words(text to numeric vector). 
from keras.preprocessing.sequence import pad_sequences ## Adding zeros to vector(Padding here is necessary to make the sequence length same for all messages).
from keras.utils import to_categorical ## For converting the labels to one-hot vectors(Dummies).
from keras.layers import Dense ## For fully connected layer.
from keras.layers import SimpleRNN ## For RNN model.
from keras.layers import LSTM ## For lstm model.
from keras.layers import Embedding ##  For converting each unique token as a vector.
from keras.models import Sequential ## For executing mutiple layers sequentially.
from keras.optimizers import Adam ## For Optimizer(to reduce loss function).
from keras.layers import Bidirectional ## Bidirectional LSTMs are taking context from both forward and backward.

In [2]:
## Get current working directory
os.getcwd()

'D:\\Python\\Pratice\\Identify the Sentiments'

In [110]:
## Set working directory.
os.chdir("D:\DataScience\Pratice\Identify the Sentiments")
os.getcwd()

'D:\\DataScience\\Pratice\\Identify the Sentiments'

In [111]:
## Load train and test data.
train = pd.read_csv('train.csv',header='infer',sep=',',encoding='latin-1')
test = pd.read_csv('test.csv',header='infer',sep=',',encoding='latin-1')

In [112]:
## Check dimensions of train and test data.
print(train.shape)
print(test.shape)

(7920, 3)
(1953, 2)


In [113]:
## Remove URL from text.
def remove_url(text):
    url =  re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [114]:
## Remove html from text.
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

In [115]:
## Remove Emojis.
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [116]:
## Remove punctuatons.
def remove_punctuation(text):
    ## prepare a translation table to replace punctations with empty space.
    translator = str.maketrans('','',string.punctuation)
    ## replace punctations with empty space.
    return text.translate(translator)
    

In [117]:
## Define a list of punctuation marks.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

In [118]:
## Replace punctuation marks with whitespace. 
def remove_special_characters(text):
    text = str(text)
    for punct in puncts:
        if punct in text:
            text = text.replace(punct, '')
    return text

In [119]:
## Word tokenization.
def word_tokenization(text):
    return [w.lower() for w in word_tokenize(text)]

In [120]:
## Remove stop words. 
def remove_stopWords(text):
    sw = stopwords.words('english')
    ## get the words which are not there in stop words and convert them into lower case
    return [word.lower() for word in text if word.lower() not in sw]

In [121]:
## Stem/get root words for given text.
def stemProcess(text):
    ## instantiate PoterStemmer
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in text]

In [122]:
## Remove stop words,url,html,emoji,punctuation and do stemming on Train data.
train['tweet'] = train['tweet'].apply(remove_url)
train['tweet'] = train['tweet'].apply(remove_html)
train['tweet'] = train['tweet'].apply(remove_special_characters)
train['tweet'] = train['tweet'].apply(word_tokenize)
train['tweet'] = train['tweet'].apply(remove_stopWords)
train['tweet'] = train['tweet'].apply(stemProcess)

In [123]:
## Remove stop words,url,html,emoji,punctuation and do stemming on Test data.
test['tweet'] = test['tweet'].apply(remove_url)
test['tweet'] = test['tweet'].apply(remove_html)
test['tweet'] = test['tweet'].apply(remove_special_characters)
test['tweet'] = test['tweet'].apply(word_tokenize)
test['tweet'] = test['tweet'].apply(remove_stopWords)
test['tweet'] = test['tweet'].apply(stemProcess)

In [124]:
## Set Index to train and test data.
train.set_index('id',inplace=True)
test.set_index('id',inplace=True)

In [125]:
## Convert list into string.
def convertListToString(temp):
    temp1 =[]
    for i in temp:
        temp1.append(i)
    return " ".join(temp1)

In [126]:
## Convert train tweet column data from list to string.
train['tweet'] = train['tweet'].apply(convertListToString)

In [127]:
## Convert test tweet column data from list to string.
test['tweet'] = test['tweet'].apply(convertListToString)

In [128]:
## Check first record of train data.
train.head(1)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,fingerprint pregnanc test android app beauti c...


In [129]:
## Check first record of test data.
test.head(1)

Unnamed: 0_level_0,tweet
id,Unnamed: 1_level_1
7921,hate new iphon upgrad wont let download app ug...


In [130]:
## Split data into train and validation (80:20 format).

train_text,valid_text,\
train_target,valid_target= train_test_split(train.drop('label',axis=1),train.drop('tweet',axis=1),
                                          test_size=0.2,random_state=1234)

In [131]:
##  Display dimensions of train and validations.
print(train_text.shape)
print(train_target.shape)
print(valid_text.shape)
print(valid_target.shape)

(6336, 1)
(6336, 1)
(1584, 1)
(1584, 1)


In [132]:
max_num_words = 10000 ## Vocabulary size.
seq_len = 50 # Number of time steps (at each time step one word/word vector is given as input).
embedding_size = 100 # ( the words are embedded into 100 dimensional vector ).

In [133]:
tokenizer = Tokenizer(num_words=max_num_words) ## Tokenizer is used to tokenize text.
tokenizer.fit_on_texts(train_text.tweet) ## Fit this to our corpus.

x_train = tokenizer.texts_to_sequences(train_text.tweet) ## Text to sequences converts the text to a list of indices(Train data).
x_train = pad_sequences(x_train, maxlen=50) ## pad_sequences makes every sequence a fixed size list by padding with 0s. 
x_test = tokenizer.texts_to_sequences(valid_text.tweet)  ## Text to sequences converts the text to a list of indices(test data).
x_test = pad_sequences(x_test, maxlen=50) ## Padding on test data.

x_train.shape, x_test.shape # Check the dimensions of x_train and x_test  

((6336, 50), (1584, 50))

In [134]:
## Check first record of train data.
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  855, 3086,    5,    2,  140, 1477, 3087,  398,  768,
        255,  219,   24, 4688, 2433, 4689])

In [135]:
## Get unique label for train data.
unique_labels = list(train_target.label.unique())
## Display unique labels.
print(unique_labels)
y_train = np.array([unique_labels.index(i) for i in train_target.label]) ## Convert the word labels to indeces(train target).
y_train = to_categorical(y_train) ## Dummify the labels(train data).
y_test = np.array([unique_labels.index(i) for i in valid_target.label]) ## Convert the word labels to indeces(test target).
y_test = to_categorical(y_test) ## Dummify the labels(test data).

[1, 0]


In [136]:
## Call Sequential to initialize a network.
model = Sequential() 
## Add an embedding layer which represents each unique token as a vector.
model.add(Embedding(input_dim = max_num_words,    ## Size of the vocabulary.
                    input_length = seq_len,       ## Length of input sequences, when it is constant.This argument is required if you are going to connect`Flatten` then `Dense` layers upstream(without it, the shape of the dense outputs cannot be computed).
                    output_dim = embedding_size)) ## Dimension of the dense embedding.
## RNN model.
model.add(SimpleRNN(10)) 
## Add an ouput layer. Since classification, 2 nodes for 2 classes.
model.add(Dense(2, activation='softmax')) 

In [68]:
## Check model summary.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 10)                1110      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 22        
Total params: 1,001,132
Trainable params: 1,001,132
Non-trainable params: 0
_________________________________________________________________


In [69]:
## Instantiate optimizer.
adam = Adam(lr=0.001)
## Compile the model.
model.compile(optimizer=adam,                  ## 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', ## categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            ## These metrics are computed for evaluating and stored in history
## Fit a model.
model.fit(x_train,               ## Input data.
          y_train,               ## Target data.
          epochs=1,              ## Number of iterations to show complete train data to model.
          validation_split=0.20) ## train and validation split ratio(80:20).

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4752 samples, validate on 1584 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x209cf350e80>

In [73]:
## The next models are an extension of what we discuss above. What we have as cell are a bit complicated
model = Sequential() ## Call Sequential to initialize a network.
## Add an embedding layer which represents each unique token as a vector.
model.add(Embedding(input_dim = max_num_words,   ## Size of the vocabulary.
                    input_length = seq_len, ## Length of input sequences, when it is constant.This argument is required if you are going to connect`Flatten` then `Dense` layers upstream(without it, the shape of the dense outputs cannot be computed).
                    output_dim = embedding_size)) ## Dimension of the dense embedding.
model.add(LSTM(10)) ## LSTM model with 10 dimensionality of the output space. 
## Add an ouput layer. Since classification, 2 nodes for 3 classes.
model.add(Dense(2, activation='softmax'))

In [74]:
## Check model summary.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 10)                4440      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 22        
Total params: 1,004,462
Trainable params: 1,004,462
Non-trainable params: 0
_________________________________________________________________


In [76]:
model.compile(optimizer=adam,                  ## 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', ## categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            ## These metrics are computed for evaluating and stored in history

## Fit a model.
model.fit(x_train,               ## Input data.
          y_train,               ## Target data.
          epochs=5,              ## Number of iterations to show complete train data to model.
          validation_split=0.20) ## train and validation split ratio(80:20).

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4752 samples, validate on 1584 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x209ced23048>

In [78]:
## Let's stack RNNs
model = Sequential() ## Call Sequential to initialize a network.
## Add an embedding layer which represents each unique token as a vector.
model.add(Embedding(input_dim = max_num_words,    ## Size of the vocabulary.
                    input_length = seq_len,       ## Length of input sequences, when it is constant.This argument is required if you are going to connect`Flatten` then `Dense` layers upstream(without it, the shape of the dense outputs cannot be computed).
                    output_dim = embedding_size)) ## Dimension of the dense embedding.
model.add(LSTM(10, return_sequences=True)) ## LSTM model with 10 dimensionality of the output space.
model.add(LSTM(5)) ## LSTM model with 5 dimensionality of the output space.
## Add an ouput layer. Since classification, 2 nodes for 2 classes.
model.add(Dense(2, activation='softmax')) 

In [79]:
## Check model summary.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 10)            4440      
_________________________________________________________________
lstm_3 (LSTM)                (None, 5)                 320       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 12        
Total params: 1,004,772
Trainable params: 1,004,772
Non-trainable params: 0
_________________________________________________________________


In [100]:
## Bidirectional LSTMs are taking context from both forward and backward.
model = Sequential() ## Call Sequential to initialize a network.
## Add an embedding layer which represents each unique token as a vector.
model.add(Embedding(input_dim = max_num_words,    ## Size of the vocabulary.
                    input_length = seq_len,       ## Length of input sequences, when it is constant.This argument is required if you are going to connect`Flatten` then `Dense` layers upstream(without it, the shape of the dense outputs cannot be computed).
                    output_dim = embedding_size)) ## Dimension of the dense embedding.
model.add(Bidirectional(LSTM(10)))  ## LSTM model with 10 dimensionality of the output space.
model.add(Dense(2, activation='softmax')) ## Add an ouput layer. Since classification, 2 nodes for 2 classes.

In [101]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 100)           1000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 10)                4440      
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 22        
Total params: 1,004,462
Trainable params: 1,004,462
Non-trainable params: 0
_________________________________________________________________


In [102]:
model.compile(optimizer=adam,                  ## 'Adam' is a variant of gradient descent technique
              loss='categorical_crossentropy', ## categorical_crossentropy for multi-class classification
              metrics=['accuracy'])            ## These metrics are computed for evaluating and stored in history

## Fit a model.
model.fit(x_train,               ## Input data.
          y_train,               ## Target data.
          epochs=10,              ## Number of iterations to show complete train data to model.
          validation_split=0.20) ## train and validation split ratio(80:20).

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4752 samples, validate on 1584 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x209e58f1e80>

In [103]:
## Get predictions on train data.
preds=model.predict_classes(x_train)
## Get indices of the maximum value.
y_true=np.argmax(y_train,axis=1)
## Get confusion matrix.
confusion_matrix_train = confusion_matrix(y_true,preds)

In [104]:
## Display accuracy value for train data.
print("Train Accuracy :",accuracy_score(y_true,preds))

Train Accuracy : 0.961489898989899


In [105]:
## Get the predictions on validation data.
preds=model.predict_classes(x_test)
## Get indices of the maximum value.
y_true=np.argmax(y_test,axis=1)
## Get confusion matrix.
confusion_matrix_test = confusion_matrix(y_true,preds)

In [106]:
## Display  accuracy value for validation data.
print("Validation Accuracy :",accuracy_score(y_true,preds))

Validation Accuracy : 0.851010101010101
