<a href="https://colab.research.google.com/github/MrXisOnline/Models-with-ML-DL/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Loading Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis
!unzip /content/twitter-entity-sentiment-analysis.zip

Mounted at /content/drive
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 153MB/s]
Archive:  /content/twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


### Process Data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Load the training CSV, discard its first two columns, label the next two
# columns 'sentiment' and 'tweet', and drop rows containing missing values.
# NOTE(review): read_csv is used with its default header handling, so the
# first line of the file is consumed as column names — confirm the CSV really
# has a header row, otherwise one sample is silently lost.
train_df = pd.read_csv("/content/twitter_training.csv")
train_df = train_df.drop(columns=train_df.columns[:2])
train_df = train_df.rename(
    columns={train_df.columns[0]: 'sentiment', train_df.columns[1]: 'tweet'},
    errors='raise',
)
train_df = train_df.dropna()

In [None]:
# Keep only the Positive/Negative rows for the binary task. The explicit
# .copy() makes simple_train_df an independent frame, so assigning to its
# columns afterwards no longer triggers pandas' SettingWithCopyWarning.
simple_train_df = train_df[(train_df["sentiment"] == "Positive") | (train_df["sentiment"] == "Negative")].copy()

In [None]:
# Give every sentiment name an integer id, numbered in the order in which
# value_counts() lists them (most frequent label first).
sentiment_counts = simple_train_df["sentiment"].value_counts()
sentiment_map = {name: idx for idx, name in enumerate(sentiment_counts.index)}

In [None]:
# Replace each sentiment name by its integer id from sentiment_map.
encoded_sentiments = [sentiment_map[name] for name in simple_train_df["sentiment"]]
simple_train_df["sentiment"] = encoded_sentiments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
# Extract the tweet texts and their encoded sentiments as NumPy arrays.
tweets = simple_train_df["tweet"].to_numpy()
labels = simple_train_df["sentiment"].to_numpy()

In [None]:
tweets.shape, labels.shape

((43012,), (43012,))

In [None]:
def batch_tweets(data, labels, batch_size=32):
    """Prefix every tweet with "<sos> " and group tweets and labels into batches.

    Samples at the end that do not fill a complete batch are discarded.

    Args:
        data: iterable of tweet strings.
        labels: array-like of labels; its own length decides how many label
            batches are produced.
        batch_size: number of samples per batch (defaults to 32).

    Returns:
        A tuple ``(batched_data, batched_labels)``. ``batched_data`` is an
        object array of shape ``(len(data) // batch_size, batch_size)`` and
        ``batched_labels`` has shape ``(len(labels) // batch_size, batch_size)``.
    """
    # Build the array in one pass; np.append inside a loop re-copies the whole
    # array for every tweet.
    ndata = np.array(["<sos> " + tweet for tweet in data], dtype=object)
    labels = np.asarray(labels)

    data_batches = len(ndata) // batch_size
    label_batches = len(labels) // batch_size

    # Slice with an explicit element count. The previous negative-remainder
    # slice turned into [:-0] (an empty array) whenever the length was an exact
    # multiple of the batch size, which made the reshape fail.
    batched_data = ndata[:data_batches * batch_size].reshape(data_batches, batch_size)
    batched_labels = labels[:label_batches * batch_size].reshape(label_batches, batch_size)
    return batched_data, batched_labels

In [None]:
# Largest token width produced by preprocess_string so far. The function
# updates it as a side effect and the padding code reads it afterwards.
max_sentence_length = 0
def preprocess_string(string):
    """Tokenize a batch of tweets into a rectangular array of byte-string tokens.

    Steps: keep the first 300 units of each tweet (``tf.strings.substr``
    counts bytes unless a unit is given), replace every run of punctuation
    characters other than ``<`` and ``>`` with a single space, split on
    whitespace, and right-pad shorter tweets with ``b"<pad>"`` so all rows
    have the same width.

    Side effect: if this batch is wider (in tokens) than anything seen before,
    the module-level ``max_sentence_length`` is raised to that width.

    Args:
        string: batch of tweet strings accepted by ``tf.strings`` ops
            (the notebook passes one row of the batched tweet array).

    Returns:
        The padded tokens as returned by ``Tensor.numpy()``, one row per tweet.
    """
    global max_sentence_length
    clipped = tf.strings.substr(string, 0, 300)
    # Raw bytes literal: "\P" is not a valid Python escape sequence, so the
    # non-raw form only worked because Python leaves unknown escapes intact
    # (with a deprecation warning). The pattern bytes are unchanged.
    cleaned = tf.strings.regex_replace(clipped, rb"[^\P{P}<>]+", b" ")
    tokens = tf.strings.split(cleaned)
    # NOTE(review): tokens coming out of the whitespace split should not
    # contain spaces, so this replacement looks like a no-op; kept as-is.
    tokens = tf.strings.regex_replace(tokens, b" ", b"<pad>")
    padded = tokens.to_tensor(default_value=b"<pad>")
    batch_width = padded.shape[-1]
    if batch_width > max_sentence_length:
        max_sentence_length = batch_width
    return padded.numpy()

In [None]:
def tweet_shaper(tweet, length):
    """Right-pad a 1-D token array with b"<pad>" until it holds `length` tokens.

    Args:
        tweet: 1-D array of tokens.
        length: desired number of tokens in the result.

    Returns:
        A new array made of `tweet` followed by `length - len(tweet)` copies
        of b"<pad>".
    """
    pad_count = length - len(tweet)
    padding = np.full(pad_count, b"<pad>")
    return np.concatenate([tweet, padding])

In [None]:
tweets[0]

'I am coming to the borders and I will kill you all,'

In [None]:
batched_tweets, batched_labels = batch_tweets(tweets, labels)

In [None]:
# Tokenize every batch first: preprocess_string raises max_sentence_length as
# a side effect, so padding may only start once all batches have been seen.
token_batches = [preprocess_string(batch) for batch in batched_tweets]
# Pad each tweet of each batch to the common width.
processed_tweets = np.array([
    [tweet_shaper(tokens, max_sentence_length) for tokens in batch]
    for batch in token_batches
])

In [None]:
from collections import Counter

# Count how often each token occurs across all padded tweets.
vocab = Counter(
    token
    for batch in processed_tweets
    for tweet in batch
    for token in tweet
)

In [None]:
vocab.most_common()[:10], len(vocab)

([(b'<pad>', 4176995),
  (b'<sos>', 43008),
  (b'the', 22706),
  (b'I', 20868),
  (b'to', 16232),
  (b'and', 14530),
  (b'a', 12570),
  (b'is', 10703),
  (b'of', 10475),
  (b'it', 9266)],
 25603)

In [None]:
# Number of distinct tokens that receive their own id.
vocab_size = 5000
# The vocab_size most frequent tokens, most frequent first.
truncated_vocab = [token for token, _count in vocab.most_common()[:vocab_size]]

In [None]:
truncated_vocab[:10]

[b'<pad>', b'<sos>', b'the', b'I', b'to', b'and', b'a', b'is', b'of', b'it']

In [None]:
# Number of hash buckets shared by all tokens missing from truncated_vocab.
num_oov_buckets = 1000

# Token -> id lookup: the i-th entry of truncated_vocab gets id i; tokens not
# in the vocabulary are assigned to one of the out-of-vocabulary buckets.
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
table.lookup(tf.constant(b"Hello boy".split()))

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([1027, 1090])>

In [None]:
# Translate tokens to ids, then merge the batch axis (32 tweets per batch)
# into one sample axis for both the inputs and the labels.
token_ids = table.lookup(tf.constant(processed_tweets))
n_samples = len(processed_tweets) * 32
encoded_tweets = tf.reshape(token_ids, shape=(n_samples, max_sentence_length))
labels = tf.reshape(batched_labels, shape=(len(batched_labels) * 32,))

In [None]:
# one_hot_labels = tf.reshape(tf.one_hot(batched_labels, 4), shape=(len(batched_labels)*32, 4))

In [None]:
encoded_tweets.shape, labels.shape

(TensorShape([43008, 117]), TensorShape([43008]))

In [None]:
batched_labels.shape

(1344, 32)

## Binary

### Model Building

In [None]:
# Binary sentiment classifier: token ids -> embeddings -> three
# Conv/Conv/MaxPool stages -> Flatten -> Dense stack -> one sigmoid unit.
# Dimensionality of each token embedding.
embed_size = 128
model = Sequential([
                    # One embedding row per in-vocabulary id plus one per OOV bucket.
                    layers.Embedding(vocab_size + num_oov_buckets, 
                                     embed_size, input_shape=[max_sentence_length]), 
                    # kernel_size=1: every convolution looks at a single token
                    # position at a time.
                    layers.Conv1D(128, kernel_size=1, activation="relu"), 
                    layers.Conv1D(64, kernel_size=1, activation="relu"), 
                    layers.MaxPool1D(), 
                    layers.Conv1D(32, kernel_size=1, activation="relu"), 
                    layers.Conv1D(16, kernel_size=1, activation="relu"), 
                    layers.MaxPool1D(), 
                    layers.Conv1D(8, kernel_size=1, activation="relu"), 
                    layers.Conv1D(4, kernel_size=1, activation="relu"), 
                    layers.MaxPool1D(), 
                    layers.Flatten(), 
                    # Fully connected head; each Dense layer down to 16 units is
                    # followed by batch normalization.
                    layers.Dense(256, activation="relu"), 
                    layers.BatchNormalization(), 
                    layers.Dense(128, activation="relu"), 
                    layers.BatchNormalization(), 
                    layers.Dense(64, activation="relu"), 
                    layers.BatchNormalization(), 
                    # Disabled recurrent-layer experiments:
                    # layers.GRU(128, return_sequences=True), 
                    # layers.LSTM(64, return_sequences=True, activation="relu"), 
                    # layers.GRU(32), 
                    # layers.LSTM(16, return_sequences=True), 
                    # layers.LSTM(8), 
                    layers.Dense(32, activation="relu"), 
                    layers.BatchNormalization(), 
                    layers.Dense(16, activation="relu"), 
                    layers.BatchNormalization(), 
                    layers.Dense(4, activation="relu"),
                    # Single sigmoid output for the two-class problem.
                    layers.Dense(1, activation="sigmoid")])

In [None]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 117, 128)          768000    
                                                                 
 conv1d_6 (Conv1D)           (None, 117, 128)          16512     
                                                                 
 conv1d_7 (Conv1D)           (None, 117, 64)           8256      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 58, 64)           0         
 1D)                                                             
                                                                 
 conv1d_8 (Conv1D)           (None, 58, 32)            2080      
                                                                 
 conv1d_9 (Conv1D)           (None, 58, 16)            528       
                                                      

In [None]:
# Adam optimizer with binary cross-entropy, matching the single sigmoid output;
# accuracy is tracked as the only metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
model.fit(encoded_tweets, labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbfd7be0ad0>

### Testing

In [None]:
# Prepare the validation split for the binary task: drop the first two
# columns, name the next two 'sentiment' and 'tweet', and remove rows with
# missing values.
valid_df = pd.read_csv("/content/twitter_validation.csv")
valid_df.drop(valid_df.columns[:2], axis=1, inplace=True)
valid_df.rename(columns={valid_df.columns[0]: 'sentiment', valid_df.columns[1]: 'tweet'}, 
                inplace=True, errors='raise')
valid_df.dropna(inplace=True)
# Take an explicit copy of the Positive/Negative rows so the column assignment
# below works on an independent frame instead of a slice of valid_df (this is
# what raised SettingWithCopyWarning before).
simple_valid_df = valid_df[(valid_df["sentiment"] == "Positive") | (valid_df["sentiment"] == "Negative")].copy()
# Encode with the existing sentiment_map so validation labels use the same ids
# as the training labels.
simple_valid_df["sentiment"] = [sentiment_map[i] for i in simple_valid_df["sentiment"]]
valid_tweets = simple_valid_df["tweet"].to_numpy()
valid_labels = simple_valid_df["sentiment"].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [None]:
# Run the validation tweets through the same batching, tokenizing, padding and
# id-lookup steps as the training tweets.
# NOTE(review): preprocess_string can raise max_sentence_length if a validation
# batch is wider than anything seen so far; the encoded width would then differ
# from the width the model was built with — confirm this cannot happen.
batch_valid_data, batch_valid_labels = batch_tweets(valid_tweets, valid_labels)
valid_token_batches = [preprocess_string(batch) for batch in batch_valid_data]
processed_tweets = np.array([
    [tweet_shaper(tokens, max_sentence_length) for tokens in batch]
    for batch in valid_token_batches
])
valid_token_ids = table.lookup(tf.constant(processed_tweets))
encoded_valid_tweets = tf.reshape(
    valid_token_ids,
    shape=(len(processed_tweets) * 32, max_sentence_length),
)
valid_labels = tf.reshape(batch_valid_labels, shape=(len(batch_valid_labels) * 32,))

In [None]:
model.evaluate(encoded_valid_tweets, valid_labels)



[0.08412598818540573, 0.978515625]

In [None]:
model.predict(encoded_valid_tweets[50:51])

array([[0.98261917]], dtype=float32)

In [None]:
valid_labels[50]

<tf.Tensor: shape=(), dtype=int64, numpy=1>

## Multiclass

### Data

In [None]:
# Reload the full training set for the multiclass task (no Positive/Negative
# filter this time) and encode every sentiment name as an integer id.
train_df = pd.read_csv("/content/twitter_training.csv")
train_df = train_df.drop(columns=train_df.columns[:2])
train_df = train_df.rename(
    columns={train_df.columns[0]: 'sentiment', train_df.columns[1]: 'tweet'},
    errors='raise',
)
train_df = train_df.dropna()
# Ids follow the order in which value_counts() lists the labels.
label_names = train_df["sentiment"].value_counts().index
sentiment_map = {name: idx for idx, name in enumerate(label_names)}
train_df["sentiment"] = [sentiment_map[name] for name in train_df["sentiment"]]
train_tweets = train_df["tweet"].to_numpy()
train_labels = train_df["sentiment"].to_numpy()

In [None]:
train_tweets.shape, train_labels.shape

((73995,), (73995,))

In [None]:
# Batch, tokenize, pad and id-encode the multiclass training tweets, then
# one-hot encode the labels (4 classes) and flatten the batch axis away.
batch_train_data, batch_train_labels = batch_tweets(train_tweets, train_labels)
# All batches are tokenized before padding because preprocess_string may still
# raise max_sentence_length while it runs.
train_token_batches = [preprocess_string(batch) for batch in batch_train_data]
processed_tweets = np.array([
    [tweet_shaper(tokens, max_sentence_length) for tokens in batch]
    for batch in train_token_batches
])
train_token_ids = table.lookup(tf.constant(processed_tweets))
encoded_train_tweets = tf.reshape(
    train_token_ids,
    shape=(len(processed_tweets) * 32, max_sentence_length),
)
one_hot_train_labels = tf.one_hot(batch_train_labels, depth=4)
train_labels = tf.reshape(one_hot_train_labels, shape=(len(batch_train_labels) * 32, 4))

In [None]:
encoded_train_tweets.shape, train_labels.shape

(TensorShape([73984, 148]), TensorShape([73984, 4]))

### Validation Data

In [None]:
# Prepare the validation split for the multiclass task: drop the first two
# columns, name the next two 'sentiment' and 'tweet', and remove rows with
# missing values.
valid_df = pd.read_csv("/content/twitter_validation.csv")
valid_df.drop(valid_df.columns[:2], axis=1, inplace=True)
valid_df.rename(columns={valid_df.columns[0]: 'sentiment', valid_df.columns[1]: 'tweet'}, 
                inplace=True, errors='raise')
valid_df.dropna(inplace=True)
# Encode with the sentiment_map built from the training data. Rebuilding the
# map from validation label frequencies could number the classes differently
# from the training labels, which would make the validation metrics
# meaningless. A label missing from the training map raises KeyError here.
valid_df["sentiment"] = [sentiment_map[i] for i in valid_df["sentiment"]]
valid_tweets = valid_df["tweet"].to_numpy()
valid_labels = valid_df["sentiment"].to_numpy()

In [None]:
valid_tweets.shape, valid_labels.shape

((999,), (999,))

In [None]:
# Batch, tokenize, pad and id-encode the multiclass validation tweets, then
# one-hot encode the labels (4 classes) and flatten the batch axis away.
batch_valid_data, batch_valid_labels = batch_tweets(valid_tweets, valid_labels)
valid_token_batches = [preprocess_string(batch) for batch in batch_valid_data]
processed_tweets = np.array([
    [tweet_shaper(tokens, max_sentence_length) for tokens in batch]
    for batch in valid_token_batches
])
valid_token_ids = table.lookup(tf.constant(processed_tweets))
encoded_valid_tweets = tf.reshape(
    valid_token_ids,
    shape=(len(processed_tweets) * 32, max_sentence_length),
)
one_hot_valid_labels = tf.one_hot(batch_valid_labels, depth=4)
valid_labels = tf.reshape(one_hot_valid_labels, shape=(len(batch_valid_labels) * 32, 4))

In [None]:
encoded_valid_tweets.shape, valid_labels.shape

(TensorShape([992, 148]), TensorShape([992, 4]))

In [None]:
# model2.evaluate(encoded_valid_tweets, valid_labels)

### Model

In [None]:
# model.layers

In [None]:
# Multiclass sentiment classifier: token ids -> embeddings -> six
# Conv/Conv/MaxPool stages (filters shrink from 512 to 8) -> Flatten ->
# 4-way softmax. Commented-out layers below are disabled experiments.
# Dimensionality of each token embedding.
embed_size = 128
model2 = Sequential([
                # One embedding row per in-vocabulary id plus one per OOV bucket.
                layers.Embedding(vocab_size + num_oov_buckets, 
                                    embed_size, input_shape=[max_sentence_length]), 
                # layers.GRU(512, return_sequences=True, activation="relu"), 
                # kernel_size=1: every convolution looks at a single token
                # position at a time.
                layers.Conv1D(512, kernel_size=1, activation="relu"), 
                layers.Conv1D(256, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(),
                # layers.Dropout(0.5), 
                layers.Conv1D(256, kernel_size=1, activation="relu"), 
                layers.Conv1D(128, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(), 
                # layers.Dropout(0.5), 
                layers.Conv1D(128, kernel_size=1, activation="relu"), 
                layers.Conv1D(64, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(), 
                layers.Conv1D(64, kernel_size=1, activation="relu"), 
                layers.Conv1D(32, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(),
                layers.Conv1D(32, kernel_size=1, activation="relu"), 
                layers.Conv1D(16, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(),
                layers.Conv1D(16, kernel_size=1, activation="relu"), 
                layers.Conv1D(8, kernel_size=1, activation="relu"), 
                layers.MaxPool1D(),
                # layers.Conv1D(8, kernel_size=1, activation="relu"), 
                # layers.Conv1D(8, kernel_size=1, activation="relu"), 
                # layers.MaxPool1D(),
                # layers.Dropout(0.5), 
                # layers.Conv1D(16, kernel_size=1, activation="relu"), 
                # layers.Conv1D(16, kernel_size=1, activation="relu"), 
                # layers.MaxPool1D(), 
                # layers.Conv1D(8, kernel_size=1, activation="relu"), 
                # layers.Conv1D(8, kernel_size=1, activation="relu"), 
                # layers.MaxPool1D(), 
                layers.Flatten(), 
                # layers.Dense(256, activation="relu"), 
                # layers.BatchNormalization(), 
                # layers.Dense(128, activation="relu"), 
                # layers.BatchNormalization(), 
                # layers.Dropout(0.5), 
                # layers.Dense(64, activation="relu"), 
                # layers.BatchNormalization(), 
                # layers.Dropout(0.5), 
                # layers.Dense(32, activation="relu"), 
                # layers.BatchNormalization(), 
                # layers.Dropout(0.5), 
                # layers.Dense(16, activation="relu"), 
                # layers.BatchNormalization(), 
                # One probability per sentiment class.
                layers.Dense(4, activation="softmax")])
                # layers.Dense(1, activation="sigmoid")]
# categorical_crossentropy pairs with the softmax output; the labels fed to
# fit() are expected in one-hot form.
model2.compile(loss="categorical_crossentropy", optimizer="adam", 
               metrics=["accuracy"])

In [None]:
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 148, 128)          768000    
                                                                 
 conv1d_36 (Conv1D)          (None, 148, 512)          66048     
                                                                 
 conv1d_37 (Conv1D)          (None, 148, 256)          131328    
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 74, 256)          0         
 g1D)                                                            
                                                                 
 conv1d_38 (Conv1D)          (None, 74, 256)           65792     
                                                                 
 conv1d_39 (Conv1D)          (None, 74, 128)           32896     
                                                      

In [None]:
# Train for at most 25 epochs; stop once val_accuracy has failed to improve
# for 3 epochs in a row and restore the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=3,
    restore_best_weights=True,
)
model2.fit(
    encoded_train_tweets,
    train_labels,
    epochs=25,
    validation_data=(encoded_valid_tweets, valid_labels),
    callbacks=[early_stopping],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25


<keras.callbacks.History at 0x7f4b900d1110>

In [None]:
# model2.fit(encoded_train_tweets, train_labels, initial_epoch=10, epochs=100)

In [None]:
max_sentence_length