In [1]:
import os

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'C:\\Users\\USX28939\\PYTHON_CODE_BASE\\GitHub_Doc\\TensorFlow_Framework'

In [3]:
# Switch the kernel into the dataset folder so that later cells can rely on
# relative names such as 'vocab.txt' and os.getcwd() + "/pos".
# The location can be overridden through the SENTIMENT_DATA_DIR environment
# variable; the original hard-coded path is kept as the default.
os.chdir(os.environ.get(
    "SENTIMENT_DATA_DIR",
    "C:\\Machine Learning\\Hackathons\\Sample Data\\NLP\\Sentiment-Analysis\\txt_sentoken\\",
))

In [4]:
from nltk.corpus import stopwords
import string
from collections import Counter

In [None]:
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, read in text mode with open()'s default encoding.
    """
    # The context manager closes the handle even when read() raises; the
    # previous explicit open()/close() pair leaked it on error.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
def clean_doc(doc):
    """Tokenise a document and keep only the informative words.

    The text is split on whitespace and punctuation characters are stripped
    from every token. A token is kept only if it is purely alphabetic, is not
    one of NLTK's English stop words, and is longer than one character.

    Args:
        doc: Raw document text.

    Returns:
        List of surviving tokens in their original order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    english_stops = set(stopwords.words('english'))
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # drop anything that is not purely alphabetic
        if not word.isalpha():
            continue
        # drop stop words
        if word in english_stops:
            continue
        # drop single-character leftovers
        if len(word) <= 1:
            continue
        kept.append(word)
    return kept


In [None]:
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Fold the cleaned tokens of one file into a running vocabulary.

    The file is read with load_doc, tokenised with clean_doc, and the tokens
    are handed to ``vocab.update`` (a Counter in this notebook), so ``vocab``
    is modified in place.
    """
    vocab.update(clean_doc(load_doc(filename)))

In [None]:
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    """Add every review of the requested split in ``directory`` to ``vocab``.

    Files whose names start with 'cv9' are treated as the held-out test
    reviews; all other files are training reviews.

    Args:
        directory: Folder containing one review per file.
        vocab: Vocabulary object updated in place via add_doc_to_vocab.
        is_trian: Truthy to process the training files, falsy to process the
            'cv9*' test files. (The misspelt name is kept so that existing
            callers passing it by keyword keep working.)
    """
    # os.listdir returns names in arbitrary order; sorting makes repeated
    # runs visit the files in the same sequence.
    for filename in sorted(os.listdir(directory)):
        is_test_file = filename.startswith('cv9')
        # A training run skips test files and a test run skips training files.
        if bool(is_trian) == is_test_file:
            continue
        # os.path.join inserts the separator appropriate for the platform.
        add_doc_to_vocab(os.path.join(directory, filename), vocab)

In [None]:
# Build the vocabulary from the training split of both sentiment classes.
vocab = Counter()
# os.path.join replaces the hard-coded "\\" separators, which only resolve
# to sub-directories on Windows.
process_docs(os.path.join(os.getcwd(), "neg"), vocab, True)
process_docs(os.path.join(os.getcwd(), "pos"), vocab, True)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))

In [None]:
# Keep only the words seen at least `min_occurane` times in the training corpus.
min_occurane = 2
tokens = [word for word, count in vocab.items() if count >= min_occurane]
print(len(tokens))

In [None]:
# save list to file
def save_list(lines, filename):
    """Write an iterable of strings to ``filename``, one item per line.

    Args:
        lines: Iterable of strings; items are joined with a newline
            character and no trailing newline is added.
        filename: Destination path; an existing file is overwritten.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # The context manager flushes and closes the file even when write()
    # raises; the previous open()/close() pair leaked the handle on error.
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# Persist the filtered vocabulary to 'vocab.txt' (relative to the current
# working directory), one token per line.
save_list(tokens, 'vocab.txt')

In [None]:
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    NOTE(review): this redefines the load_doc from an earlier cell with the
    same behaviour; one of the two definitions is redundant.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, read in text mode with open()'s default encoding.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# Read the saved vocabulary back and hold it as a set of unique tokens.
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())
print(len(vocab))

## Embedding Layer while Training

In [None]:
# turn a doc into clean tokens
def clean_doc_emb(doc, vocab):
    """Reduce a document to a space-joined string of in-vocabulary tokens.

    The text is split on whitespace, punctuation characters are removed from
    each token, and only tokens contained in ``vocab`` are kept.

    Args:
        doc: Raw document text.
        vocab: Container supporting ``in`` lookups for allowed tokens.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    stripped = (piece.translate(strip_punct) for piece in doc.split())
    return ' '.join(word for word in stripped if word in vocab)

In [None]:
# load all docs in a directory
def process_docs_emb(directory, vocab, is_trian):
    """Load and clean every review of the requested split in ``directory``.

    Files whose names start with 'cv9' are treated as the held-out test
    reviews; all other files are training reviews.

    Args:
        directory: Folder containing one review per file.
        vocab: Allowed tokens, forwarded to clean_doc_emb.
        is_trian: Truthy to return the training reviews, falsy to return the
            'cv9*' test reviews. (The misspelt name is kept for
            backward compatibility with keyword callers.)

    Returns:
        List with one cleaned, space-joined string per selected file.
    """
    documents = []
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order identical from run to run.
    for filename in sorted(os.listdir(directory)):
        # A training run skips test files and a test run skips training files.
        if bool(is_trian) == filename.startswith('cv9'):
            continue
        # os.path.join inserts the separator appropriate for the platform.
        doc = load_doc(os.path.join(directory, filename))
        documents.append(clean_doc_emb(doc, vocab))
    return documents

In [None]:

# Gather the cleaned training reviews of each class, then stack them with the
# negative reviews first.
negative_docs = process_docs_emb(os.getcwd() + "/neg", vocab, True)
positive_docs = process_docs_emb(os.getcwd() + "/pos", vocab, True)
train_docs = [*negative_docs, *positive_docs]

In [None]:
# Peek at the first cleaned training document.
train_docs[0]

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array

In [None]:
# create the tokenizer with its default settings
tokenizer = Tokenizer()
# fit the tokenizer on the training documents only, so the word index is
# built from train_docs
tokenizer.fit_on_texts(train_docs)

In [None]:
# sequence encode: turn each training document into a list of integer word
# indices using the fitted tokenizer
encoded_docs = tokenizer.texts_to_sequences(train_docs)

In [None]:
# Pad every encoded review (at the end) up to the token count of the longest
# training document.
max_length = max(len(doc.split()) for doc in train_docs)
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [None]:
# Inspect the shape of the padded training matrix.
Xtrain.shape

In [None]:
# define training labels: 0 = negative, 1 = positive.
# The counts are taken from the loaded document lists (negatives first, the
# same order as train_docs) instead of a hard-coded 900 per class, so the
# labels stay aligned with Xtrain if the number of files changes.
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))

In [None]:
# Inspect the shape of the training label vector.
ytrain.shape

In [None]:
# load all test reviews (the 'cv9*' files of each class)
positive_docs_test = process_docs_emb(os.getcwd()+"/pos", vocab, False)
negative_docs_test = process_docs_emb(os.getcwd()+"/neg", vocab, False)
# negatives first, so the labels below line up with the documents
test_docs = negative_docs_test + positive_docs_test
# sequence encode with the tokenizer fitted on the training data
encoded_docs_test = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the same length as the training matrix
Xtest = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
# define test labels: 0 = negative, 1 = positive. The counts come from the
# loaded lists rather than a hard-coded 100 per class.
ytest = array([0] * len(negative_docs_test) + [1] * len(positive_docs_test))

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
# define vocabulary size (largest integer value): one more than the number of
# words the tokenizer indexed.
# NOTE(review): the +1 presumably accounts for index 0, which Keras' Tokenizer
# does not assign to any word — confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# define model: embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential()
# one 300-dimensional vector per word index, for inputs of max_length tokens
model.add(Embedding(vocab_size, 300, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
# single sigmoid unit for the binary negative/positive decision
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# compile network: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network on the padded training reviews for 6 epochs
model.fit(Xtrain, ytrain, epochs=6, verbose=2)

In [None]:
# Score the trained network on the held-out test reviews and report the
# accuracy as a percentage.
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

## Train word2vec Embedding

In [5]:
# load doc into memory
def load_doc_learn_emb(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, read in text mode with open()'s default encoding.
    """
    # The context manager closes the handle even when read() raises; the
    # previous explicit open()/close() pair leaked it on error.
    with open(filename, 'r') as file:
        return file.read()
 
# Read the saved vocabulary and hold it as a set of unique tokens.
vocab_filename = 'vocab.txt'
vocab = set(load_doc_learn_emb(vocab_filename).split())

In [13]:
# turn a doc into clean tokens
def doc_to_clean_lines_learn_emb(doc, vocab):
    """Split a document into lines and reduce each line to in-vocabulary tokens.

    Every line is split on whitespace, punctuation characters are removed
    from each token, and only tokens contained in ``vocab`` are kept.

    Args:
        doc: Raw document text.
        vocab: Container supporting ``in`` lookups for allowed tokens.

    Returns:
        One list of tokens per line of ``doc``; a line with no surviving
        tokens contributes an empty list.
    """
    # Loop-invariant: build the punctuation-stripping table once instead of
    # once per line.
    table = str.maketrans('', '', string.punctuation)
    clean_lines = []
    for line in doc.splitlines():
        stripped = (w.translate(table) for w in line.split())
        # filter out tokens not in vocab
        clean_lines.append([w for w in stripped if w in vocab])
    return clean_lines

In [14]:
def process_docs_learn_emb(directory, vocab, is_trian):
    """Collect the cleaned, tokenised lines of every review in one split.

    Files whose names start with 'cv9' are treated as the held-out test
    reviews; all other files are training reviews.

    Args:
        directory: Folder containing one review per file.
        vocab: Allowed tokens, forwarded to doc_to_clean_lines_learn_emb.
        is_trian: Truthy to use the training reviews, falsy to use the
            'cv9*' test reviews. (The misspelt name is kept for
            backward compatibility with keyword callers.)

    Returns:
        A flat list of token lists, one per line of every selected file.
    """
    lines = []
    # os.listdir returns names in arbitrary order; sorting keeps the sentence
    # order identical from run to run.
    for filename in sorted(os.listdir(directory)):
        # A training run skips test files and a test run skips training files.
        if bool(is_trian) == filename.startswith('cv9'):
            continue
        # os.path.join inserts the separator appropriate for the platform.
        doc = load_doc_learn_emb(os.path.join(directory, filename))
        lines.extend(doc_to_clean_lines_learn_emb(doc, vocab))
    return lines


In [16]:
# Gather the tokenised training lines of each class and stack them with the
# negative reviews first.
negative_lines_learn_emb = process_docs_learn_emb(os.getcwd() + "/neg", vocab, True)
positive_lines_learn_emb = process_docs_learn_emb(os.getcwd() + "/pos", vocab, True)
sentences_learn_emb = [*negative_lines_learn_emb, *positive_lines_learn_emb]
print('Total training sentences: %d' % len(sentences_learn_emb))

Total training sentences: 58109


In [17]:
from gensim.models import Word2Vec



In [18]:
# train word2vec model on the tokenised training lines.
# NOTE(review): this rebinds `model`, which an earlier cell uses for the Keras
# network; after this cell `model` refers to the Word2Vec object.
# NOTE(review): `size=` and `wv.vocab` are gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` and `wv.key_to_index` — confirm the installed
# version before re-running.
model = Word2Vec(sentences_learn_emb, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 25767


In [28]:
# NOTE(review): no call parentheses here, so this cell only displays the bound
# method object rather than a word vector.
model.wv.word_vec

<bound method WordEmbeddingsKeyedVectors.word_vec of <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000020899C04F98>>

In [29]:
# save the learned word vectors in ASCII (word2vec) format; binary=False
# requests the plain-text variant. The file is written relative to the current
# working directory.
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [35]:
# Look up the learned embedding vector for the word "film".
model.wv.word_vec("film")

array([ 5.10827219e-03, -3.19574326e-01, -4.46447819e-01, -7.33830154e-01,
        4.60318118e-01,  4.38042060e-02,  7.80621991e-02, -6.76220596e-01,
       -2.59524703e-01,  6.21454008e-02, -5.39041400e-01,  7.53559843e-02,
        1.66830540e-01, -5.62897064e-02,  1.71294883e-01,  8.56710076e-01,
       -1.91785291e-01, -1.27226830e+00, -3.57031226e-01, -3.01564932e-01,
       -4.09420729e-01, -6.27254009e-01, -2.26020411e-01,  8.41810226e-01,
        2.70500779e-01,  3.74517471e-01, -5.20393133e-01,  7.20657587e-01,
       -6.46927580e-02, -1.14747705e-02,  1.20023322e+00,  1.37368536e+00,
       -3.27591360e-01,  6.51843846e-02, -7.49423325e-01, -8.27073827e-02,
       -1.28402483e+00,  4.96836275e-01,  6.08083010e-01,  2.96842664e-01,
        8.76195550e-01, -1.31210923e+00, -1.93732485e-01, -6.30657196e-01,
       -4.61279094e-01, -7.14728296e-01, -9.84488172e-04, -9.40896451e-01,
        1.25906444e+00,  4.58537519e-01,  2.30384395e-01, -3.00351739e-01,
        2.75928508e-02, -

In [57]:
# Similarity score between the learned vectors of "real" and "onto".
model.wv.similarity("real","onto")

  if np.issubdtype(vec.dtype, np.int):


0.8688705

In [55]:
# Index-style lookup of the learned embedding vector for the word "span".
model.wv["span"]

array([ 0.00116621, -0.05769543, -0.02252658, -0.06757847,  0.06238534,
        0.00583863,  0.00609647, -0.10794933,  0.04661379, -0.0425207 ,
       -0.0392459 ,  0.09735174,  0.03274011,  0.12021565,  0.06646618,
        0.13370278,  0.05293754, -0.19335492, -0.11360101,  0.01460008,
       -0.01227232, -0.04636946,  0.01337749,  0.05388087,  0.11673507,
        0.04270085, -0.08969457,  0.0494533 , -0.01152557, -0.01477933,
        0.10234531,  0.08129044, -0.04764746,  0.04954107, -0.14059196,
       -0.00333981, -0.18403947,  0.00128189,  0.06752554, -0.09766366,
        0.14796919, -0.23662058, -0.01227746, -0.12906088, -0.11872869,
       -0.08507252,  0.0703171 , -0.15944426,  0.04296328,  0.10875189,
        0.06732283, -0.02493776, -0.11622039, -0.04095735, -0.0436756 ,
       -0.05527456,  0.03296762, -0.0813996 , -0.00624743,  0.04268681,
       -0.08325578, -0.00210014,  0.03941156, -0.08560324,  0.16393957,
        0.02726694,  0.07660981,  0.15182626, -0.14603102,  0.02

In [None]:
# Toy corpus of three pre-tokenised documents, used below to explore the
# Tokenizer's attributes.
strr = [["message","Two"],["message","one","is"],["This","is","message","three"]]

In [None]:
# Fresh tokenizer for the toy corpus, separate from `tokenizer` used for the
# reviews.
t = Tokenizer()

In [None]:
# Fit the toy tokenizer on the three token lists.
t.fit_on_texts(strr)

In [None]:
# Mapping from each word to its integer index.
t.word_index

In [None]:
# Reverse mapping from integer index back to the word.
t.index_word

In [None]:
# Per-index document counts (presumably how many documents contain the word
# with that index — confirm against the Keras Tokenizer docs).
t.index_docs

In [None]:
# Per-word document counts (presumably how many documents contain each word —
# confirm against the Keras Tokenizer docs).
t.word_docs

In [None]:
# Print every entry of word_docs as "<word>    <count>"; list() takes a
# snapshot of the items before iterating.
for w,c in list(t.word_docs.items()):
    print(w, "   ", c)