In [1]:
import pandas as pd 
import json 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf 
from keras.callbacks import EarlyStopping
import tensorflow_hub as hub 
import tensorflow_text as text
from tensorflow.keras import optimizers

In [2]:
df = pd.read_csv('IBC.csv', index_col=False)
df

Unnamed: 0,SENTENCE,LABEL
0,Forcing middle-class workers to bear a greater...,1
1,Because it would not be worthwhile to bring a ...,1
2,"Indeed , Lind argues that high profits and hig...",1
3,"In fairness , it should be noted that he devot...",1
4,Psychological tactics are social control techn...,1
...,...,...
4321,"As Doug Ogden , former director of the Energy ...",0
4322,No study is perfect ; each one is subject to c...,0
4323,"Of course , market forces , the balancing of ,...",0
4324,"In Words Like Loaded Pistols , he sets out to ...",0


In [3]:
stop = stopwords.words('english')
df['SENTENCE'] = df['SENTENCE'].str.replace(r'[^\w\s]+', '')
df['SENTENCE'] = df['SENTENCE'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))


  df['SENTENCE'] = df['SENTENCE'].str.replace(r'[^\w\s]+', '')


In [7]:
stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['SENTENCE'] = df['SENTENCE'].apply(stem_sentences)

In [10]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['SENTENCE'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 10477 unique tokens.


In [12]:
X = tokenizer.texts_to_sequences(df['SENTENCE'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (4326, 250)


In [13]:
Y = pd.get_dummies(df['LABEL']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (4326, 3)


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(3893, 250) (3893, 3)
(433, 250) (433, 3)


In [20]:

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 5,080,703
Trainable params: 5,080,703
Non-trainable params: 0
_________________________________________________________________
