In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed in the next cell.
os.getcwd()

'C:\\Users\\USX28939\\PYTHON_CODE_BASE\\GitHub_Doc\\TensorFlow_Framework'

In [3]:
# Switch the working directory to the review dataset folder: later cells build
# the "pos"/"neg" directory paths from os.getcwd() and read/write 'vocab.txt'
# relative to it.
# NOTE(review): hard-coded absolute Windows path — has to be edited per machine.
os.chdir("C:\\Machine Learning\\Hackathons\\Sample Data\\NLP\\Sentiment-Analysis\\txt_sentoken\\")

In [4]:
from nltk.corpus import stopwords
import string
from collections import Counter

In [None]:
def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, decoded with the platform default encoding
        (no explicit ``encoding`` is passed to ``open``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A context manager guarantees the handle is closed even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
def clean_doc(doc):
    """Tokenise raw text into a filtered list of words.

    The text is split on whitespace and every ``string.punctuation`` character
    is stripped from each token. A token survives only if it is purely
    alphabetic, is not in NLTK's English stop-word list, and is longer than
    one character. No lowercasing is applied, so the stop-word comparison is
    case-sensitive.

    Args:
        doc: Raw document text.

    Returns:
        List of surviving tokens in their original order (duplicates kept).
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    stripped = [piece.translate(strip_punct) for piece in doc.split()]
    english_stops = set(stopwords.words('english'))

    kept = []
    for word in stripped:
        # drop tokens containing digits/symbols (and tokens emptied by stripping)
        if not word.isalpha():
            continue
        # drop stop words
        if word in english_stops:
            continue
        # drop single-character leftovers
        if len(word) <= 1:
            continue
        kept.append(word)
    return kept


In [None]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Read one document, tokenise it and add its tokens to ``vocab``.

    Args:
        filename: Path of the document to load via ``load_doc``.
        vocab: Object exposing ``update`` (a ``collections.Counter`` in this
            notebook); it is modified in place.

    Returns:
        None.
    """
    vocab.update(clean_doc(load_doc(filename)))

In [None]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    """Add every selected document in ``directory`` to ``vocab``.

    Files whose name starts with 'cv9' form the held-out split.

    Args:
        directory: Folder whose entries are listed with ``os.listdir``.
        vocab: Counter-like object updated in place by ``add_doc_to_vocab``.
        is_trian: Truthy to process only the non-'cv9' files (training split);
            falsy to process only the 'cv9' files (test split).

    Returns:
        None.
    """
    for entry in os.listdir(directory):
        held_out = entry.startswith('cv9')
        # training pass skips held-out files; test pass skips everything else
        if (is_trian and held_out) or (not is_trian and not held_out):
            continue
        add_doc_to_vocab(directory + '/' + entry, vocab)

In [None]:
# define vocab
vocab = Counter()
# add all training docs to vocab; os.path.join builds the class folders with
# the platform's separator instead of a hard-coded Windows backslash
process_docs(os.path.join(os.getcwd(), "neg"), vocab, True)
process_docs(os.path.join(os.getcwd(), "pos"), vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

In [None]:
# keep only the words counted at least `min_occurane` times in `vocab`
min_occurane = 2
tokens = [word for word, count in vocab.items() if min_occurane <= count]
# report how many words survive the cut
print(len(tokens))

In [None]:
# save list to file
def save_list(lines, filename):
    """Write a sequence of strings to a text file, one item per line.

    The items are joined with '\\n' (no trailing newline) and the target file
    is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file.

    Raises:
        TypeError: If an item of ``lines`` is not a string.
        OSError: If the file cannot be opened or written.
    """
    # Join first so a bad item fails before the target file is truncated.
    data = '\n'.join(lines)
    # A context manager guarantees the handle is flushed and closed even if
    # write() raises.
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# save tokens to a vocabulary file: one token per line, written to
# 'vocab.txt' relative to the current working directory
save_list(tokens, 'vocab.txt')

In [5]:
def load_doc(filename):
    """Return the entire contents of a text file as a single string.

    NOTE(review): this repeats the ``load_doc`` definition from an earlier
    cell of this notebook — presumably so the cells from here on can run
    without executing the vocabulary-building cells; confirm and consolidate.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, decoded with the platform default encoding
        (no explicit ``encoding`` is passed to ``open``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A context manager guarantees the handle is closed even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
 

In [6]:
# load the vocabulary
vocab_filename = 'vocab.txt'
# read the file, split it on whitespace and de-duplicate the words into a set
vocab = set(load_doc(vocab_filename).split())
# number of distinct words available to the model
print(len(vocab))

25767


## Learning the Embedding Layer During Training

In [7]:
# turn a doc into clean tokens
def clean_doc_emb(doc, vocab):
    """Reduce a document to its in-vocabulary words, joined by single spaces.

    The text is split on whitespace, every ``string.punctuation`` character is
    stripped from each token, and only tokens that are members of ``vocab``
    are kept, in their original order.

    Args:
        doc: Raw document text.
        vocab: Container supporting ``in`` (a set of words in this notebook).

    Returns:
        A single string of the kept tokens separated by one space; an empty
        string when nothing survives.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)

In [20]:
# load all docs in a directory
def process_docs_emb(directory, vocab, is_trian):
    """Load and clean every selected document in ``directory``.

    Files whose name starts with 'cv9' form the held-out split.

    Args:
        directory: Folder whose entries are listed with ``os.listdir``.
        vocab: Vocabulary passed through to ``clean_doc_emb``.
        is_trian: Truthy to return only the non-'cv9' files (training split);
            falsy to return only the 'cv9' files (test split).

    Returns:
        List with one cleaned, space-joined string per selected file, in the
        order ``os.listdir`` yields them.
    """
    documents = []
    for entry in os.listdir(directory):
        held_out = entry.startswith('cv9')
        # training pass skips held-out files; test pass skips everything else
        if (is_trian and held_out) or (not is_trian and not held_out):
            continue
        raw_text = load_doc(directory + '/' + entry)
        documents.append(clean_doc_emb(raw_text, vocab))
    return documents

In [21]:

# load all training reviews (is_trian=True selects the files that do not start
# with 'cv9')
positive_docs = process_docs_emb(os.getcwd()+"/pos", vocab, True)
negative_docs = process_docs_emb(os.getcwd()+"/neg", vocab, True)
# negatives are placed first; the label cell below relies on this order when it
# assigns 0 to the first part of the data and 1 to the rest
train_docs = negative_docs + positive_docs

In [22]:
# Preview the first cleaned training document (a negative review, since
# negatives were concatenated first).
train_docs[0]

'plot two teen couples go church party drink drive get accident one guys dies girlfriend continues see life nightmares whats deal watch movie sorta find critique mindfuck movie teen generation touches cool idea presents bad package makes review even harder one write since generally applaud films attempt break mold mess head lost highway memento good bad ways making types films folks didnt snag one correctly seem taken pretty neat concept executed terribly problems movie well main problem simply jumbled starts normal downshifts fantasy world audience member idea whats going dreams characters coming back dead others look like dead strange apparitions disappearances chase scenes tons weird things happen simply explained personally dont mind trying unravel film every give clue get kind fed films biggest problem obviously got big secret hide seems want hide completely final five minutes make things entertaining thrilling even engaging meantime really sad part arrow dig flicks like actually 

In [23]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array

In [24]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents (training documents only; the test
# documents are encoded later with this same fitted tokenizer)
tokenizer.fit_on_texts(train_docs)

In [25]:
# sequence encode: turn each training document into a list of integer word ids
encoded_docs = tokenizer.texts_to_sequences(train_docs)

In [26]:
# pad sequences
# length of the longest training document, counted in whitespace-separated tokens
max_length = max(len(doc.split()) for doc in train_docs)
# shorter documents are padded at the end ('post') up to max_length
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [27]:
# Sanity check: (number of training documents, max_length)
Xtrain.shape

(1800, 1317)

In [28]:
# define training labels: 0 = negative, 1 = positive, in the same order the
# documents were concatenated (negatives first). The counts come from the
# loaded lists, so the labels stay aligned with Xtrain if the number of files
# in either class changes.
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))

In [29]:
# Sanity check: one label per training document
ytrain.shape

(1800,)

In [30]:
# load all test reviews (is_trian=False selects only the 'cv9*' files)
positive_docs_test = process_docs_emb(os.getcwd()+"/pos", vocab, False)
negative_docs_test = process_docs_emb(os.getcwd()+"/neg", vocab, False)
# negatives first, matching the label order below
test_docs = negative_docs_test + positive_docs_test
# sequence encode with the tokenizer fitted on the training documents
encoded_docs_test = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the training length so the model input shape matches
Xtest = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
# define test labels: 0 = negative, 1 = positive. The counts come from the
# loaded lists, so the labels stay aligned with Xtest if the number of
# held-out files in either class changes.
ytest = array([0] * len(negative_docs_test) + [1] * len(positive_docs_test))

In [36]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [37]:
# define vocabulary size (largest integer value)
# +1 because, per the Keras Tokenizer docs, word indices start at 1 and index 0
# is reserved (it is also the value pad_sequences pads with by default), so the
# Embedding layer needs one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1

In [50]:
# define model: embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential()
# learn a 300-dimensional vector for each of the vocab_size word ids
model.add(Embedding(vocab_size, 300, input_length=max_length))
# 32 filters, each spanning 8 consecutive word positions
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
# halve the sequence length
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
# single sigmoid unit: output is the predicted probability of label 1 (positive)
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would emit a stray "None" line
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1317, 300)         7730400   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1310, 32)          76832     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 50)                1048050   
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 51        
Total params: 8,855,333
Trainable params: 8,855,333
Non-trainable params: 0
_________________________________________________________________


In [51]:
# compile network: binary cross-entropy matches the single sigmoid output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
# NOTE(review): no validation data is passed, so the per-epoch log reports
# training loss/accuracy only; the recorded run reaches acc 1.0000 by epoch 4,
# so compare against the test accuracy in the evaluation cell to judge overfitting.
model.fit(Xtrain, ytrain, epochs=6, verbose=2)

Epoch 1/6
 - 58s - loss: 0.6873 - acc: 0.5556
Epoch 2/6
 - 59s - loss: 0.3132 - acc: 0.8911
Epoch 3/6
 - 61s - loss: 0.0175 - acc: 0.9972
Epoch 4/6
 - 60s - loss: 0.0021 - acc: 1.0000
Epoch 5/6
 - 58s - loss: 0.0012 - acc: 1.0000
Epoch 6/6
 - 59s - loss: 8.4552e-04 - acc: 1.0000


<keras.callbacks.History at 0x1d783fab6d8>

In [49]:
# Evaluate on the held-out 'cv9*' reviews and print accuracy as a percentage.
# NOTE(review): this cell's recorded execution count (49) is lower than those
# of the model-definition (50) and training (51) cells above, so the saved
# output may belong to an earlier model instance — re-run after training.
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 84.500000


In [None]:
# Scratch experiment: three tiny pre-tokenised "documents" (lists of words)
# used below to inspect what a fitted Tokenizer exposes.
strr = [["message","Two"],["message","one","is"],["This","is","message","three"]]

In [None]:
# Fresh tokenizer for the experiment, separate from `tokenizer` used by the model.
t = Tokenizer()

In [None]:
# Fit on the toy documents; each inner list is passed as one document.
t.fit_on_texts(strr)

In [None]:
# Inspect word_index (per the Keras Tokenizer docs: word -> integer index).
t.word_index

In [None]:
# Inspect index_word (per the Keras Tokenizer docs: integer index -> word).
t.index_word

In [None]:
# Inspect index_docs (presumably word index -> number of documents containing
# it; confirm against the Keras Tokenizer docs).
t.index_docs

In [None]:
# Inspect word_docs (presumably word -> number of documents containing it;
# confirm against the Keras Tokenizer docs).
t.word_docs

In [None]:
# Print each word_docs entry as "<word>    <value>", one per line; list()
# takes a snapshot of the items before iterating.
for w,c in list(t.word_docs.items()):
    print(w, "   ", c)