# Word Embedding Model

The architecture used for sentiment analysis is "Word Embeddings" whose guide can be viewed at the following link:
>https://www.tensorflow.org/text/guide/word_embeddings

## Creating Class

In [1]:
# class Sentiment():
#     Positive = 'Positive'
#     Negative = 'Negative'

class Tweet():
    def __init__(self, text, label):
        self.text = text
        self.label = label
#         self.sentiment = self.get_sentiment()
        
#     def get_sentiment(self):
#         if self.label == 0:
#             return Sentiment.Negative
#         else:
#             return Sentiment.Positive
        

class Utils():
    def __init__(self, tweets):
        self.tweets = tweets
        
    def get_text(self):
        return [x.text for x in self.tweets]
    
#     def get_sentiment(self):
#         return [x.sentiment for x in self.tweets]
    
    def get_label(self):
        return [x.label for x in self.tweets]

### Imports

In [2]:
import tensorflow as tf
import json
import numpy as np

from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

2022-02-17 17:40:44.699112: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-17 17:40:44.699152: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Process Data

### Read data from json.file

In [3]:
file_name = '../data/Data_processed.json'

tweets = []
with open(file_name) as f:
    for line in f:
        tweet = json.loads(line)
        tweets.append(Tweet(tweet['Text'], tweet['Target']))
    
# Taking a look at an example of our data
print(tweets[0].text)
print(tweets[0].label)
# print(tweets[0].sentiment)

   awww thats a bummer  you shoulda got david carr of third day to do it d
0


## Creating our Tensorflow model

### Setting Hyper-parameters

In [4]:
BATCH_SIZE = 1024
SEED = 123
DENSE_NODES = 16
OPTIMIZER = 'adam'
METRICS = ['accuracy']
EPOCHS = 5
VOCAB_SIZE = 10000
SEQUENCE_LEN = 50
EMBEDDING_DIM = 16

## Creating test/train splits

In [5]:
dataset_text = Utils(tweets).get_text()
dataset_labels = Utils(tweets).get_label()

ds_labels = tf.convert_to_tensor(dataset_labels)
ds_text = tf.convert_to_tensor(dataset_text)

2022-02-17 17:40:53.775940: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-02-17 17:40:53.775985: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-17 17:40:53.776004: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mik-HP-EliteBook-840-G2): /proc/driver/nvidia/version does not exist
2022-02-17 17:40:53.776272: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


For creating our TextVectorizer Vocab(Embedding vector)

In [6]:
p_text = tf.data.Dataset.from_tensors(ds_text)

## Text Vectorization


Use the text vectorization layer to normalize, split, and map strings to integers. Note that the layer uses the custom standardization defined above.Set maximum_sequence length as all samples are not of the same length.

In [7]:
vectorize_layer = TextVectorization(standardize='lower_and_strip_punctuation',
                                   max_tokens=VOCAB_SIZE,
                                   split='whitespace',
                                   output_mode='int',
                                   output_sequence_length=SEQUENCE_LEN)

vectorize_layer.adapt(p_text)

2022-02-17 17:40:56.779380: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 316989920 exceeds 10% of free system memory.
2022-02-17 17:40:56.779452: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 475484880 exceeds 10% of free system memory.


Calling adapt mathod to build vocabulary from training dataset while also transforming our test dataset for future.

# Model

## Create Model

In [8]:
model = Sequential([
    vectorize_layer,
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, name='embedding'),
    GlobalAveragePooling1D(),
    Dense(DENSE_NODES, activation='relu'),
    Dense(1, activation='sigmoid')
])


## Compile and train model

In [9]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs')

tf.keras.losses.BinaryCrossentropy

model.compile(optimizer=OPTIMIZER,
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
             metrics=METRICS)

model.fit(x=ds_text,
         y=ds_labels,
         batch_size=BATCH_SIZE,
         epochs=EPOCHS, 
         validation_split=0.1,
         callbacks=[tensorboard_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3e9030a160>

## Visualize model on tensorboard

In [10]:
%load_ext tensorboard
%tensorboard --logdir logs

ERROR: Could not find `tensorboard`. Please ensure that your PATH
contains an executable `tensorboard` program, or explicitly specify
the path to a TensorBoard binary by setting the `TENSORBOARD_BINARY`
environment variable.