In [3]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [4]:
from nltk.corpus import stopwords
import string

In [5]:
from string import punctuation
from os import listdir
from collections import Counter

In [6]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word for word in tokens if len(word) > 1]
	return tokens

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	# load doc
	doc = load_doc(filename)
	# clean doc
	tokens = clean_doc(doc)
	# update counts
	vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	# walk through all files in the folder
	for filename in listdir(directory):
		# create the full path of the file to open
		path = directory + '/' + filename
		# add doc to vocab
		add_doc_to_vocab(path, vocab)

# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('C:/Users/ninet/Downloads/IPC section 299/neg', vocab, True)
process_docs('C:/Users/ninet/Downloads/IPC section 299/pos', vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

27516
[('accused', 10325), ('The', 8594), ('Section', 7871), ('case', 6355), ('Court', 6225), ('IPC', 6094), ('Singh', 5249), ('State', 4782), ('also', 4473), ('deceased', 4365), ('offence', 3933), ('evidence', 3867), ('death', 3628), ('In', 3559), ('Indian', 3369), ('vs', 3185), ('appellant', 3097), ('It', 2982), ('He', 2959), ('would', 2946), ('prosecution', 2769), ('PW', 2575), ('Kanoon', 2574), ('No', 2537), ('said', 2337), ('person', 2318), ('learned', 2248), ('one', 2184), ('stated', 2139), ('injury', 1949), ('injuries', 1931), ('Of', 1927), ('may', 1915), ('persons', 1899), ('statement', 1888), ('made', 1860), ('house', 1829), ('order', 1726), ('trial', 1721), ('act', 1710), ('found', 1680), ('High', 1625), ('cause', 1582), ('sentence', 1579), ('police', 1572), ('Ram', 1569), ('circumstances', 1565), ('present', 1538), ('time', 1525), ('given', 1518)]


In [7]:
# keep tokens with a min occurrence
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

9568


In [8]:
# save list to file
def save_list(lines, filename):
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# open file
	file = open(filename, 'w')
	# write text
	file.write(data)
	# close file
	file.close()

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

In [9]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load the vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [10]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', punctuation)
	tokens = [w.translate(table) for w in tokens]
	# filter out tokens not in vocab
	tokens = [w for w in tokens if w in vocab]
	tokens = ' '.join(tokens)
	return tokens

In [11]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
	documents = list()
	# walk through all files in the folder
	for filename in listdir(directory):
		path = directory + '/' + filename
		# load the doc
		doc = load_doc(path)
		# clean doc
		tokens = clean_doc(doc, vocab)
		# add to list
		documents.append(tokens)
	return documents

# load all training reviews
positive_docs = process_docs('C:/Users/ninet/Downloads/IPC section 299/pos', vocab, True)
negative_docs = process_docs('C:/Users/ninet/Downloads/IPC section 299/neg', vocab, True)
train_docs = positive_docs

In [12]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

In [13]:
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

In [14]:
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [25]:
# define training labels
ytrain = array([0 for _ in range(55)] + [1 for _ in range(55)])

In [31]:
# load all test reviews
positive_docs = process_docs('C:/Users/ninet/Downloads/IPC section 299/pos', vocab, False)
negative_docs = process_docs('C:/Users/ninet/Downloads/IPC section 299/neg', vocab, False)
test_docs = positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(55)] + [1 for _ in range(55)])

In [27]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

In [28]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 14047, 100)        769800    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 14040, 32)         25632     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 7020, 32)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 224640)            0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                2246410   
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 3,041,853
Trainable params: 3,041,853
Non-trainable params: 0
____________________________________________

In [29]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
 - 7s - loss: 0.8877 - accuracy: 0.5545
Epoch 2/10
 - 6s - loss: 0.7224 - accuracy: 0.5091
Epoch 3/10
 - 6s - loss: 0.6747 - accuracy: 0.6818
Epoch 4/10
 - 6s - loss: 0.6466 - accuracy: 0.5727
Epoch 5/10
 - 6s - loss: 0.6189 - accuracy: 0.5364
Epoch 6/10
 - 6s - loss: 0.5699 - accuracy: 0.5818
Epoch 7/10
 - 6s - loss: 0.5155 - accuracy: 0.5727
Epoch 8/10
 - 6s - loss: 0.4605 - accuracy: 0.7000
Epoch 9/10
 - 6s - loss: 0.4075 - accuracy: 0.7636
Epoch 10/10
 - 6s - loss: 0.3754 - accuracy: 0.8909


<keras.callbacks.callbacks.History at 0x21e25eb17f0>

In [32]:
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 98.181820
