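## Natural Language Processing: Sentiment Analysis on the IMDB Movie-Review Dataset (Loaded from Local Files)
- The model is built only from Embedding, Conv1D, MaxPooling1D, Flatten, Dropout and Dense layers
- Preprocessing is deliberately simple: whitespace tokenization, punctuation stripping, and removal of stop words, single-character tokens and non-alphabetic tokens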

In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, Flatten, Dropout
import numpy as np
import nltk
from os import listdir
import string
from collections import Counter

def opener(file_name):
    """Return the full text content of the file at ``file_name``.

    The file is opened in text mode with the platform default encoding.
    A context manager is used so the handle is closed even when reading
    raises (for example on a decoding error).

    Raises:
        OSError: if the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(file_name) as text:
        return text.read()

def preprocess(text: str, stop_words=None):
    """Split ``text`` into cleaned word tokens.

    Steps, in order:
      1. split on whitespace;
      2. strip every ``string.punctuation`` character from each token;
      3. keep a token only if it is longer than one character, purely
         alphabetic, and not a stop word.

    Punctuation is stripped *before* filtering. Filtering first would
    discard any word with punctuation attached (such as "movie,") because
    it fails ``isalpha()``, and the stripping step would then have nothing
    left to do.

    Args:
        text: raw document text. Case is left untouched.
        stop_words: optional iterable of words to drop. When omitted, the
            NLTK English stop-word list is used (the NLTK ``stopwords``
            corpus must be available).

    Returns:
        List of cleaned tokens in document order.
    """
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words("english")
    # A set makes each membership test O(1) instead of scanning a list.
    stop_words = frozenset(stop_words)
    table = str.maketrans("", "", string.punctuation)
    stripped = (w.translate(table) for w in text.split())
    return [
        w for w in stripped
        if len(w) > 1 and w.isalpha() and w not in stop_words
    ]

def update_vocab(file_name, vocab):
    """Read the document at ``file_name`` and feed its tokens into ``vocab``.

    The file is loaded with ``opener`` and tokenized with ``preprocess``;
    the resulting tokens are passed to ``vocab.update``, so ``vocab`` is
    modified in place and nothing is returned.
    """
    vocab.update(preprocess(opener(file_name)))
    
def put_all_together(directory):
    """Load and tokenize the training documents found in ``directory``.

    Every entry returned by ``listdir(directory)`` whose name does NOT start
    with "cv9" is read with ``opener`` and tokenized with ``preprocess``.

    Returns:
        A list with one token list per selected file, in ``listdir`` order.
    """
    training_names = (
        name for name in listdir(directory) if not name.startswith("cv9")
    )
    return [
        preprocess(opener(directory + "/" + name)) for name in training_names
    ]
def put_all_together_test(directory):
    """Load and tokenize the held-out documents found in ``directory``.

    Only entries returned by ``listdir(directory)`` whose name starts with
    "cv9" are read with ``opener`` and tokenized with ``preprocess``.

    Returns:
        A list with one token list per selected file, in ``listdir`` order.
    """
    held_out_names = (
        name for name in listdir(directory) if name.startswith("cv9")
    )
    return [
        preprocess(opener(directory + "/" + name)) for name in held_out_names
    ]
            
def save_tokens_to_file(directory, vocab, min_length=2):
    """Write the frequent tokens of ``vocab`` to a text file, one per line.

    Args:
        directory: path of the output *file* (despite the name, it is passed
            straight to ``open`` and is not a folder). Overwritten if present.
        vocab: mapping of token -> count (anything with ``.items()``).
        min_length: count threshold (despite the name, it is compared with
            the count, not the token length). A token is kept only when its
            count is strictly greater than this value.

    Tokens are written in sorted order, separated by newlines, with no
    trailing newline. The file is written inside a context manager so the
    handle is closed even if the write fails.
    """
    tokens = sorted(k for k, v in vocab.items() if v > min_length)
    with open(directory, "w") as f:
        f.write("\n".join(tokens))

# Training documents: every review whose file name does not start with "cv9".
pos = put_all_together("./txt_sentoken/pos")
neg = put_all_together("./txt_sentoken/neg")
train_set = pos + neg
# Labels are derived from the number of documents actually loaded (1 = positive,
# 0 = negative) so they always line up with train_set, whatever the folder holds.
y_train = [1] * len(pos) + [0] * len(neg)
# Held-out documents: the reviews whose file name starts with "cv9".
pos_test = put_all_together_test("./txt_sentoken/pos")
neg_test = put_all_together_test("./txt_sentoken/neg")
test_set = pos_test + neg_test
y_test = [1] * len(pos_test) + [0] * len(neg_test)

# Length (in tokens) of the longest training document; every sequence is padded
# to this length and the Embedding layer is declared with the same input length.
m_ = max(len(i) for i in train_set)
y_train = np.array(y_train)

# The word index is learned from the training documents only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set)
encoded_senteces = tokenizer.texts_to_sequences(train_set)
# maxlen is passed explicitly for the training data as well, so that train and
# test inputs are guaranteed to share the width the model is built for.
padded_sequences = pad_sequences(encoded_senteces, maxlen=m_, padding="post")
# NOTE: test_set is rebound here from token lists to integer sequences.
test_set = tokenizer.texts_to_sequences(test_set)
test_sequences = pad_sequences(test_set, maxlen=m_, padding="post")
y_test = np.array(y_test)
# Vocabulary size for the Embedding layer: one slot per indexed word plus one.
v_size = 1 + len(tokenizer.word_index)

# Embedding -> 1D convolution -> max pooling -> flatten -> dense classifier
# ending in a single sigmoid unit.
model = Sequential([
    Embedding(v_size, 100, input_length=m_),
    Conv1D(32, kernel_size=3, activation="relu", kernel_regularizer="l2"),
    MaxPooling1D(2),
    Flatten(),
    Dense(32, activation="relu", kernel_regularizer="l2"),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# The held-out "cv9" reviews are used as validation data during training.
res = model.fit(
    padded_sequences,
    y_train,
    epochs=10,
    validation_data=(test_sequences, y_test),
)

# Draw the four training-history curves on one figure.
plt.figure(figsize=(10, 10))
for history_key, legend_label in (
    ("accuracy", 'Acc'),
    ("val_accuracy", 'Validation Acc'),
    ("loss", 'Loss'),
    ("val_loss", 'Validation Loss'),
):
    plt.plot(res.history[history_key], label=legend_label)
plt.legend()

# Classify one hand-written, already-tokenized review.
sent = [["worst" , "moovie" , "ever", "seen"]]
t = tokenizer.texts_to_sequences(sent)
p = pad_sequences(t, maxlen=m_, padding="post")
# The model ends in a single sigmoid unit, so predict() returns one probability
# per sample. argmax over that length-1 axis would always be 0; threshold at
# 0.5 to obtain the class (1 = positive, 0 = negative).
(model.predict(p) > 0.5).astype("int32")

## Training Word2Vec word embeddings on the review corpus

In [None]:
from string import punctuation
from os import listdir
from gensim.models import Word2Vec

def load_doc(filename):
	"""Return the full text content of the file at ``filename``.

	The file is opened read-only in text mode with the platform default
	encoding. A context manager is used so the handle is closed even when
	reading raises.

	Raises:
		OSError: if the file cannot be opened (e.g. ``FileNotFoundError``).
	"""
	with open(filename, 'r') as file:
		return file.read()

def doc_to_clean_lines(doc, vocab):
	"""Turn a document into one list of in-vocabulary tokens per line.

	Each line of ``doc`` is split on whitespace, every punctuation character
	is removed from each token, and only tokens contained in ``vocab`` are
	kept. Lines that end up with no tokens still yield an empty list.

	Args:
		doc: document text.
		vocab: container of accepted tokens (tested with ``in``).

	Returns:
		A list of token lists, one per line of ``doc``.
	"""
	strip_punct = str.maketrans('', '', punctuation)
	return [
		[
			word
			for word in (raw.translate(strip_punct) for raw in row.split())
			if word in vocab
		]
		for row in doc.splitlines()
	]

def process_docs(directory, vocab, is_trian):
	"""Collect the cleaned lines of the selected documents in ``directory``.

	Args:
		directory: folder whose entries are listed with ``listdir``.
		vocab: accepted tokens, forwarded to ``doc_to_clean_lines``.
		is_trian: when truthy, files whose name starts with 'cv9' are
			skipped; when falsy, only those files are processed.

	Returns:
		A flat list of token lists (one per document line) across all
		selected files.
	"""
	lines = []
	for filename in listdir(directory):
		# Training runs skip the 'cv9' files; non-training runs skip all others.
		if bool(is_trian) == filename.startswith('cv9'):
			continue
		text = load_doc(directory + '/' + filename)
		lines.extend(doc_to_clean_lines(text, vocab))
	return lines

# Load the accepted vocabulary: whitespace-separated tokens, kept as a set.
vocab_filename = 'tokens.txt'
vocab = set(load_doc(vocab_filename).split())

# Cleaned lines of the training reviews (file names not starting with 'cv9').
positive_docs = process_docs('txt_sentoken/pos', vocab, True)
negative_docs = process_docs('txt_sentoken/neg', vocab, True)
sentences = negative_docs + positive_docs
print('Total training sentences: %d' % len(sentences))

# Train 100-dimensional word vectors on those lines.
# NOTE: this rebinds the name `model`, which the previous cell used for the
# Keras classifier.
model = Word2Vec(sentences, min_count=1, window=5, vector_size=100, workers=8)
word_vectors = model.wv
words = list(word_vectors.index_to_key)
print('Vocabulary size: %d' % len(words))

# Save the vectors in the plain-text word2vec format.
filename = 'embedding_word2vec.txt'
word_vectors.save_word2vec_format(filename, binary=False)

Total training sentences: 58109
Vocabulary size: 18112
