In [1]:
# File Loading and data preprocessing
import numpy as np
import math
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import drive

# Tensorflow
try:
  %tensorflow_version 2.x
except:
  pass
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Get rid of the warnings
import warnings
warnings.filterwarnings("ignore")

TensorFlow 2.x selected.


In [2]:
# Loading the test and the training files
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
dataset = pd.read_csv("/content/drive/My Drive/Twitter Sentimental Analysis/training_set.csv",
                           header = None,
                           names = cols,
                           engine = "python",
                           encoding = "latin1")


In [4]:
dataset.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [0]:
# Data preprocessing
data_trimmed = dataset.loc[:, ["sentiment", "text"]]


del dataset

In [0]:
def tweet_processing(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  # geting rid of all the @ mentions
  tweet = re.sub(r"@[a-zA-Z0-9]+", " ", tweet)
  # getting rid of the url links
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']", " ", tweet)
  tweet = re.sub(r" +", " ", tweet)
  return tweet

In [7]:
data_trimmed["sentiment"].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [0]:
# Create a list of twitter texts
data_cleaned = [tweet_processing(tweet) for tweet in data_trimmed["text"]]

In [0]:
# Fix the label for sentiments to 0 and 1 instead of 0 and 4
sentiment_labels = data_trimmed.sentiment.values
sentiment_labels[sentiment_labels == 4] = 1

In [0]:
# Tokenization
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_cleaned, target_vocab_size = 64000
)
data_inputs = [tokenizer.encode(tweet) for tweet in data_cleaned]

In [0]:
# Padding: add paddings to each sentence to ensure that they all have the same length (good for batch processing)
max_length = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs, value = 0, 
                                                            padding = "post", 
                                                            maxlen = max_length)


In [12]:
type(len(data_inputs))

int

In [0]:
# Splitting data to training set and test set
# Check how many tweets there are in total
num_tweets = len(data_inputs)

# I will use 1 percent of the total tweets as the test set and the remaining as training set
test_id_negative = np.random.randint(0, int(num_tweets/2), int(num_tweets/200)).astype(int)
test_id_positive = np.random.randint(int(num_tweets/2), num_tweets, int(num_tweets/200)).astype(int)
test_id_total = np.concatenate((test_id_negative, test_id_positive))

In [0]:
# Test set
test_inputs = data_inputs[test_id_total]
test_labels = sentiment_labels[test_id_total]

In [0]:
# Training set
training_inputs = np.delete(data_inputs, test_id_total, axis = 0)
training_labels = np.delete(sentiment_labels, test_id_total, axis = 0)

### Building the CNN Model for the Natural Language Processing

In [0]:
class Deep_CNN(tf.keras.Model):
  def __init__(self, 
               vocab_size,
               embedding_dim = 128,
               nb_filters = 50,
               Feed_Forward_N_units = 512,
               nb_classes = 2,
               dropout_rate = 0.1,
               training = False,
               name = "deep_cnn"):
    
    super(Deep_CNN, self).__init__(name = name)

    # defining the three different convolutional filter layers and pooling layers
    self.embedding = layers.Embedding(vocab_size,
                                      embedding_dim
                                      )
    self.bigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 2,
                                padding = "valid",
                                activation = "relu"
                                )
    self.pool_1 = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters = nb_filters,
                                 kernel_size = 3,
                                 padding = "valid",
                                 activation = "relu"
                                 )
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters = nb_filters,
                                 kernel_size = 4,
                                 padding = "valid",
                                 activation = "relu"
                                 )
    self.pool_3 = layers.GlobalMaxPool1D()

    # Define the first dense feed-forward layer
    self.dense_1 = layers.Dense(units = Feed_Forward_N_units,
                                activation = "relu"
                                )
    # Add a dropout layer
    self.dropout_1 = layers.Dropout(rate = dropout_rate)

    # Define the second/last dense feed-forward layer
    if nb_classes == 2:
      self.last_dense = layers.Dense(units = 1,
                                     activation = "sigmoid",
                                    )
    elif nb_classes > 2:
      self.last_dense = layers.Dense(units = nb_classes,
                                     activation = "softmax")
  def call(self, inputs, training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_2(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool_3(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis = -1)
    merged = self.dense_1(merged)
    merged = self.dropout_1(merged, training)
    output = self.last_dense(merged)

    return output

In [0]:
# Initialize global variables
vocab_size = tokenizer.vocab_size
embedding_dim = 200
nb_filters = 100
Feed_Forward_N_units = 256
nb_classes = len(set(training_labels))
dropout_rate = 0.2
batch_size = 32
epochs = 10


In [0]:
# Train the model
DCNN = Deep_CNN(vocab_size = vocab_size,
                embedding_dim = embedding_dim,
                nb_filters = nb_filters,
                Feed_Forward_N_units = Feed_Forward_N_units,
                nb_classes = nb_classes,
                dropout_rate = dropout_rate
                )

In [0]:
if nb_classes == 2:
  DCNN.compile(loss = "binary_crossentropy",
               optimizer = "adam",
               metrics = ["accuracy"])
  
elif nb_classes > 2:
  DCNN.compile(loss = "sparse_categorical_crossentropy",
               optimizer = "adam",
               metrics = ["sparse_categorical_accuracy"])

In [0]:
checkpoint_path = "/content/drive/My Drive/Twitter Sentimental Analysis/saves/"
ckpt = tf.train.Checkpoint(DCNN = DCNN)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("The Latest Checkpoint has been restored!")




In [0]:
# Fit the model:
DCNN.fit(training_inputs, 
         training_labels, 
         batch_size = batch_size,
         epochs = epochs
         )
ckpt_manager.save()

Train on 1584078 samples
Epoch 1/10

In [0]:
# Evaluation
accuracy_results = DCNN.evaluate(test_inputs, test_labels, batch_size = batch_size)
print(accuracy_results)

In [0]:
# Testing with custom sentences
# (This is a recent tweet from presidential candidate Bernie Sanders)
DCNN(np.array(tokenizer.encode("I'm proud to have opposed all of Trump's military budgets.")), training = False).numpy()