In [1]:
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from string import punctuation
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
#importing the nltk libraries for preprocessing of the Reviews text field of the IMDB Dataset.
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from textblob import Word 
import csv
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


1. Preprocessing

In [5]:
# reading the sentiment dataset csv
data=pd.read_csv('/content/sample_data/Sentiment.csv')
# keeping only the neccessary columns 
data = data[['text','sentiment']] 

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       13871 non-null  object
 1   sentiment  13871 non-null  object
dtypes: object(2)
memory usage: 216.9+ KB


In [7]:
# converting text to lower case letters
data['text'] = data['text'].apply(lambda x: x.lower())
# removing punctuation that is not needed 
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) 

# removing rt string from data
for idx, row in data.iterrows(): 
  row[0] = row[0].replace('rt',' ') 

# setting the number of features
max_features = 2000 

# tokenizing the data
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values) 

# converting text to sequences
X = tokenizer.texts_to_sequences(data['text'].values) 
# padding the data setting them to the same size
X = pad_sequences(X)

2. Building the Keras Model

In [8]:
# embedding dimensions
embed_dim = 128
# out features of lstm
lstm_out = 196 
# creating the sequential model
model = Sequential()
# Embedding layer
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1])) 
# LSTM layer
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
# Dense layer
model.add(Dense(3,activation='softmax')) 
# compiling the model
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
# printing summary of the model
(model.summary()) 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 128)           256000    
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 3)                 591       
                                                                 
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


In [9]:
# performing label encoding 
Labelencoder = preprocessing.LabelEncoder()
# fit transforming sentiment data
integer_encoded = Labelencoder.fit_transform(data['sentiment'])
# converting class vector to binary class matrix.
Y = to_categorical(integer_encoded)

In [10]:
# creating test and train data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42) 
(X_train.shape,Y_train.shape) 
(X_test.shape,Y_test.shape) 

((4578, 28), (4578, 3))

In [11]:
# setting batch size
batch_size = 32
# fitting the model
model.fit(X_train, Y_train, epochs = 20, batch_size=batch_size, verbose = 2) 

Epoch 1/20
291/291 - 71s - loss: 0.8245 - accuracy: 0.6438 - 71s/epoch - 244ms/step
Epoch 2/20
291/291 - 87s - loss: 0.6789 - accuracy: 0.7120 - 87s/epoch - 298ms/step
Epoch 3/20
291/291 - 80s - loss: 0.6142 - accuracy: 0.7400 - 80s/epoch - 274ms/step
Epoch 4/20
291/291 - 63s - loss: 0.5666 - accuracy: 0.7657 - 63s/epoch - 216ms/step
Epoch 5/20
291/291 - 60s - loss: 0.5223 - accuracy: 0.7837 - 60s/epoch - 207ms/step
Epoch 6/20
291/291 - 62s - loss: 0.4824 - accuracy: 0.8021 - 62s/epoch - 212ms/step
Epoch 7/20
291/291 - 62s - loss: 0.4425 - accuracy: 0.8160 - 62s/epoch - 214ms/step
Epoch 8/20
291/291 - 62s - loss: 0.4089 - accuracy: 0.8345 - 62s/epoch - 212ms/step
Epoch 9/20
291/291 - 63s - loss: 0.3809 - accuracy: 0.8439 - 63s/epoch - 215ms/step
Epoch 10/20
291/291 - 63s - loss: 0.3505 - accuracy: 0.8645 - 63s/epoch - 216ms/step
Epoch 11/20
291/291 - 62s - loss: 0.3298 - accuracy: 0.8661 - 62s/epoch - 212ms/step
Epoch 12/20
291/291 - 62s - loss: 0.3097 - accuracy: 0.8750 - 62s/epoch - 

<keras.callbacks.History at 0x7fba8c5ca7d0>

In [12]:
# saving model
model.save('Sentimentmodel.h5')

In [13]:
# evaluation
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size) 
("score: %.2f" % (score)) 
("acc: %.2f" % (acc)) 

144/144 - 3s - loss: 1.9089 - accuracy: 0.6481 - 3s/epoch - 19ms/step


'acc: 0.65'

We got an accuracy of 0.65

3. Loading the model after saving and predicting

In [16]:
# loading previously created model
model = load_model('/content/Sentimentmodel.h5')



In [34]:
# using ICP example
text=[["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"]]
# creating dataframe
newdata=pd.DataFrame(text,index=range(0,1,1),columns=list('1'))

In [35]:
newdata

Unnamed: 0,1
0,A lot of good things are happening. We are res...


In [39]:
# converting text to lower case letters
newdata['1'] = newdata['1'].apply(lambda x: x.lower())
# removing punctuation that is not needed 
newdata['1'] = newdata['1'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) 

# setting the number of features
max_features = 2000 

# tokenizing the data
tokenizer = Tokenizer(num_words=max_features, split=' ') 
tokenizer.fit_on_texts(newdata['1'].values) 

# converting text to sequences
X = tokenizer.texts_to_sequences(newdata['1'].values) 
# padding the data setting them to the same size and shape
X=pad_sequences(X,maxlen=28)

In [42]:
result = model.predict(X)
print('result', result)
print(np.where(max(result[0])),':',max(result[0]))

result [[0.7649572  0.0177965  0.21724637]]
(array([0]),) : 0.7649572


Result shows positive sentiment

4. Using spam data

In [84]:
# reading spam data
data=pd.read_csv('/content/sample_data/spam.csv',encoding='latin-1')
# keeping only the neccessary columns 
data=data[['v2','v1']]

In [85]:
# converting text to lower case letters
data['v2'] = data['v2'].apply(lambda x: x.lower()) 
# removing punctuation that is not needed 
data['v2'] = data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) 

# setting the number of features
max_features = 2000

# tokenizing the data
tokenizer = Tokenizer(num_words=max_features, split=' ') 
tokenizer.fit_on_texts(data['v2'].values) 

# converting text to sequences
X = tokenizer.texts_to_sequences(data['v2'].values) 
# padding the data setting them to the same size and shape
X=pad_sequences(X)

In [86]:
# embedding dimensions
embed_dim=128
# out features of lstm
lstm_out=196

# creating the sequential model
modelnew = Sequential()
# Embedding layer
modelnew.add(Embedding(max_features, embed_dim,input_length = X.shape[1])) 
# LSTM layer
modelnew.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
# Dense layer
modelnew.add(Dense(2,activation='sigmoid'))
# compiling the model
modelnew.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# printing summary of the model
modelnew.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 152, 128)          256000    
                                                                 
 lstm_7 (LSTM)               (None, 196)               254800    
                                                                 
 dense_7 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [87]:
# performing label encoding 
labelencoder = preprocessing.LabelEncoder()
# fit transforming spam2 data
integer_encoded = labelencoder.fit_transform(data['v1'])
# converting class vector to binary class matrix.
Y= to_categorical(integer_encoded)

In [88]:
# creating test and train data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42) 
(X_train.shape,Y_train.shape) 
(X_test.shape,Y_test.shape) 

((1839, 152), (1839, 2))

In [89]:
# setting batch size
batch_size = 32 
# fitting the model
modelnew.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
117/117 - 166s - loss: 0.1966 - accuracy: 0.9352 - 166s/epoch - 1s/step
Epoch 2/7
117/117 - 125s - loss: 0.0476 - accuracy: 0.9858 - 125s/epoch - 1s/step
Epoch 3/7
117/117 - 126s - loss: 0.0206 - accuracy: 0.9938 - 126s/epoch - 1s/step
Epoch 4/7
117/117 - 126s - loss: 0.0205 - accuracy: 0.9933 - 126s/epoch - 1s/step
Epoch 5/7
117/117 - 126s - loss: 0.0098 - accuracy: 0.9971 - 126s/epoch - 1s/step
Epoch 6/7
117/117 - 128s - loss: 0.0047 - accuracy: 0.9987 - 128s/epoch - 1s/step
Epoch 7/7
117/117 - 127s - loss: 0.0027 - accuracy: 0.9989 - 127s/epoch - 1s/step


<keras.callbacks.History at 0x7fb9fdfdc150>

In [90]:
# evaluation
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size) 
("score: %.2f" % (score)) 
("acc: %.2f" % (acc)) 

58/58 - 4s - loss: 0.6929 - accuracy: 0.5291 - 4s/epoch - 72ms/step


'acc: 0.53'

Accuracy returned is 0.52