# Data processing for Sentiment Analysis
We create a lexicon $L$ of words. Let $w$ be the number of words in $L$. For example, let $L$ = [chair, table, spoon, television], and suppose we have a new sentence *"I pulled the chair up to the table."* We represent this sentence using a count (bag-of-words) array $C$ of size $w = 4$ where
$$C[i] = \text{count of } L[i] \text{ in the sentence}$$

So, the new sentence is represented as $[1, 1, 0, 0]$.

Stemming -> Strips the morphological affixes from a word to reduce it to a root form (the stem), which need not itself be a real word.
Lemmatizing -> Maps the different inflected forms of a word to its dictionary form (the lemma), even when those forms do not share the same stem (e.g. *mice* -> *mouse*).

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random
import numpy as np
import pickle 
from collections import Counter

## Creating the dataset from text

In [6]:
# Lemmatizer instance shared by the lexicon builder and the sentence encoder.
lemmatizer = WordNetLemmatizer()
# Upper bound on the number of lines read from each input file.
n_lines = 10_000_000
# Paths of the two input text files.
pos = "../data/pos.txt"
neg = "../data/neg.txt"

In [75]:
def create_lexicon(pos, neg, min_count=50, max_count=1000):
    """Build the lexicon of mid-frequency lemmas found in two text files.

    Each line (at most the module-level ``n_lines`` per file) is tokenized
    with ``word_tokenize``; every token is lower-cased and lemmatized, and
    its occurrences are counted across both files.

    Args:
        pos: Path of the first text file.
        neg: Path of the second text file.
        min_count: Exclusive lower bound on a lemma's total count.
        max_count: Exclusive upper bound on a lemma's total count.

    Returns:
        A list of the lemmas whose total count lies strictly between
        ``min_count`` and ``max_count``, in order of first appearance.
    """
    word_counts = Counter()
    for file in [pos, neg]:
        with open(file, 'r') as f:
            for line in f.readlines()[:n_lines]:
                # Count line by line rather than first collecting every
                # token of both files into one large intermediate list.
                word_counts.update(
                    lemmatizer.lemmatize(token.lower())
                    for token in word_tokenize(line)
                )

    # Keep only the lemmas that are neither too rare nor too frequent.
    return [
        word
        for word, count in word_counts.items()
        if min_count < count < max_count
    ]

def represent_sentence(sentence, lexicon):
    """Encode a sentence as a vector of per-word counts over ``lexicon``.

    The sentence is lower-cased, tokenized with ``word_tokenize`` and each
    token is lemmatized; tokens that are not in ``lexicon`` are ignored.

    Args:
        sentence: The text to encode.
        lexicon: Sequence of words; position ``i`` of the result counts the
            occurrences of ``lexicon[i]``.

    Returns:
        A list of ``len(lexicon)`` NumPy floats holding the counts.
    """
    # Build the word -> position table once so that each token costs a hash
    # lookup instead of two linear scans of the lexicon. ``setdefault`` keeps
    # the first position of a repeated word, matching ``list.index``.
    position_of = {}
    for position, entry in enumerate(lexicon):
        position_of.setdefault(entry, position)

    result = np.zeros(len(lexicon))
    for token in word_tokenize(sentence.lower()):
        position = position_of.get(lemmatizer.lemmatize(token).lower())
        if position is not None:
            result[position] += 1

    return list(result)
                
def sample_handling(sample, lexicon, classification):
    """Encode every line of a text file and pair it with a fixed label.

    Args:
        sample: Path of the text file to read.
        lexicon: Word list handed to ``represent_sentence``.
        classification: Label attached to every line of the file.

    Returns:
        A list of ``[features, classification]`` pairs, one per line, for at
        most the first ``n_lines`` lines of the file.
    """
    with open(sample, 'r') as f:
        lines = f.readlines()[:n_lines]

    return [[represent_sentence(line, lexicon), classification] for line in lines]

def create_feature_sets_and_labels(pos, neg, lexicon, test_size_ratio=0.1):
    """Build shuffled train/test arrays from the two labelled text files.

    Lines of ``pos`` are labelled ``[1, 0]`` and lines of ``neg`` are
    labelled ``[0, 1]``. All samples are shuffled together and the last
    ``int(test_size_ratio * n_samples)`` of them are held out for testing.

    Args:
        pos: Path of the file whose lines get the label ``[1, 0]``.
        neg: Path of the file whose lines get the label ``[0, 1]``.
        lexicon: Word list used to encode each line.
        test_size_ratio: Fraction of the samples reserved for the test set.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of float64 NumPy
        arrays.
    """
    samples = []
    samples += sample_handling(pos, lexicon, [1, 0])
    samples += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(samples)

    testing_size = int(test_size_ratio * len(samples))
    testing_size = min(max(testing_size, 0), len(samples))

    # Split on an explicit index. Slicing with ``-testing_size`` is wrong when
    # testing_size is 0: ``[:-0]`` is empty and ``[-0:]`` is the whole list,
    # which would put every sample in the test set and none in training.
    split_at = len(samples) - testing_size
    train, test = samples[:split_at], samples[split_at:]

    # Unzip the pairs directly instead of going through a 2-D object array,
    # whose column indexing fails on empty input.
    train_x = np.array([features for features, _ in train], dtype=np.float64)
    train_y = np.array([label for _, label in train], dtype=np.float64)
    test_x = np.array([features for features, _ in test], dtype=np.float64)
    test_y = np.array([label for _, label in test], dtype=np.float64)

    return train_x, train_y, test_x, test_y
    

In [115]:
# if __name__ == '__main__':
#     train_x, train_y, test_x, test_y = create_feature_sets_and_labels(pos, neg, lexicon)
#     with open('../data/sentiment_set.pickle', 'wb') as f:
#         pickle.dump([train_x, train_y, test_x, test_y], f)

## Applying the neural network on the dataset

In [None]:
import tensorflow as tf

In [112]:
# Widths of the hidden layers.
n_nodes_hl1 = 50
n_nodes_hl2 = 20
n_nodes_hl3 = 10  # NOTE(review): not referenced by the model built in this file.

# Number of output units (labels are the 2-element vectors [1, 0] / [0, 1]).
n_classes = 2
# ``n_lines`` is only an upper cap on the lines read per file, so
# ``n_lines / 50`` (200000 with the current cap) can exceed the number of
# samples actually loaded; training then takes a single gradient step per
# epoch. Cap the mini-batch size, and never let it drop to 0.
batch_size = max(1, min(int(n_lines / 50.0), 128))

In [108]:
# Build the word list from the two input files `pos` and `neg`.
lexicon = create_lexicon(pos, neg)

In [109]:
# Encode both files against the lexicon and split them into train/test arrays
# (default test_size_ratio of 0.1).
train_x, train_y, test_x, test_y = create_feature_sets_and_labels(pos, neg, lexicon)

In [125]:
# Feed-forward classifier: one input per lexicon entry, two ReLU hidden
# layers, and a linear output layer that emits one logit per class.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(len(train_x[0]),)))
model.add(tf.keras.layers.Dense(n_nodes_hl1, activation='relu'))
model.add(tf.keras.layers.Dense(n_nodes_hl2, activation='relu'))
model.add(tf.keras.layers.Dense(n_classes))

# The output layer has no activation, so the loss is told to expect logits.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

# Hold out the last 20% of the training arrays for validation.
model.fit(
    train_x,
    train_y,
    batch_size=batch_size,
    epochs=6,
    validation_split=0.2,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x1ad9e23d4b0>

In [None]:
# Evaluate on the held-out test arrays; the result holds the loss followed by
# the compiled metric.
test_scores = model.evaluate(x=test_x, y=test_y, verbose=2)
for label, value in zip(("Test loss:", "Test accuracy:"), test_scores):
    print(label, value)

In [135]:
# Encode a new sentence as a count vector over the lexicon.
unknown = tf.convert_to_tensor(represent_sentence("This is wrong", lexicon))
# Raw logits for the first three test samples.
print(model(test_x[:3]))
# Raw logits for the new sentence. It was encoded above but never passed to
# the model; the model expects a leading batch axis, so feed it as a batch
# of one.
print(model(tf.expand_dims(unknown, axis=0)))

tf.Tensor(
[[ 0.02442591 -0.02480735]
 [-0.18129416  0.06998322]
 [-0.09880796  0.08143519]], shape=(3, 2), dtype=float32)


Since the dataset is very small, the accuracy is not good.