In [40]:
# loading libraries 
import sys
import nltk
import pandas as pd
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, Dropout, LSTM, SpatialDropout1D
import re
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.wrappers.scikit_learn import KerasClassifier
import numpy
import requests

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Load the provided CSV file “Sentiment.csv” and process this file as needed to handle text data.

In [41]:
# loading dataset
dataset = pd.read_csv('/content/Sentiment.csv')
dataset = dataset[['text','sentiment']]
dataset['text'] = dataset['text'].apply(lambda x: x.lower())
dataset['text'] = dataset['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [42]:
# iterating thru dataset and replacing data
for idx, row in dataset.iterrows():
    row[0] = row[0].replace('rt', ' ')
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(dataset['text'].values)
X = tokenizer.texts_to_sequences(dataset['text'].values)
X = pad_sequences(X, maxlen=28)
embed_dim = 128
lstm_out = 196
le = LabelEncoder()
# fitting and splitting dataset to train and test
fitted = le.fit_transform(dataset['sentiment'])
y = to_categorical(fitted)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=42)
batch_size = 128


## 2. Build the Keras model that you have in the PPT use case.

In [44]:
# creating sequential model and adding layers
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
# compiling model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tb = TensorBoard(log_dir="log/{}", histogram_freq=0, write_graph=True, write_images=True)
model.fit(X_train, Y_train, epochs=5, batch_size=batch_size, verbose=2, callbacks=[tb])

Epoch 1/5
82/82 - 31s - loss: 0.8824 - accuracy: 0.6190 - 31s/epoch - 373ms/step
Epoch 2/5
82/82 - 30s - loss: 0.7316 - accuracy: 0.6846 - 30s/epoch - 361ms/step
Epoch 3/5
82/82 - 28s - loss: 0.6598 - accuracy: 0.7178 - 28s/epoch - 339ms/step
Epoch 4/5
82/82 - 28s - loss: 0.6195 - accuracy: 0.7381 - 28s/epoch - 340ms/step
Epoch 5/5
82/82 - 28s - loss: 0.5932 - accuracy: 0.7500 - 28s/epoch - 339ms/step


<keras.callbacks.History at 0x7fabd7a70110>

## 3. Train and save the model and use the saved model to predict on new text data

In [45]:
model.save('/content/model.h5')
m = load_model('/content/model.h5')
example = [['A lot of good things are happening. We are respected again throughout the world, and thats a great '
         'thing.@realDonaldTrump']]
df = pd.DataFrame(example, index=range(0, 1, 1), columns=list('t'))
df['t'] = df['t'].apply(lambda x: x.lower())
df['t'] = df['t'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['t'].values)
X = tokenizer.texts_to_sequences(df['t'].values)
X = pad_sequences(X, maxlen=28)

output = m.predict(X)
print('Output:', output)
print(numpy.where(max(output[0])), ":", (max(output[0])))
print(numpy.argmax(output))
print(model.summary())

Output: [[0.78973055 0.09900098 0.11126847]]
(array([0]),) : 0.78973055
0
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d_4 (Spatia  (None, 28, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_5 (LSTM)               (None, 196)               254800    
                                                                 
 dense_5 (Dense)             (None, 3)                 591       
                                                                 
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


## 4. Apply the code on spam data set available in the source code (text classification on the spam.csv data set)


In [46]:
# loading spam.csv
spam_data = pd.read_csv('/content/spam.csv', encoding='latin-1')
spam_data = spam_data[['v2', 'v1']]
spam_data['v2'] = spam_data['v2'].apply(lambda x: x.lower())
spam_data['v2'] = spam_data['v2'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures)
tokenizer.fit_on_texts(spam_data['v2'].values)
X = tokenizer.texts_to_sequences(spam_data['v2'].values)
X = pad_sequences(X)
embed_dim = 128
lstm_out = 196
# creating model
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# fitting model
integer_encoded = le.fit_transform(spam_data['v1'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, batch_size=batch_size, verbose=2)
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(model.metrics_names[0], ":", score)
print(model.metrics_names[1], ":", acc)

117/117 - 100s - loss: 0.1603 - accuracy: 0.9445 - 100s/epoch - 853ms/step
58/58 - 6s - loss: 0.0869 - accuracy: 0.9777 - 6s/epoch - 109ms/step
loss : 0.0869411751627922
accuracy : 0.9777052998542786
