In [1]:
#importing dependencies 
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass

In [3]:
#Read in data 
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv("/content/drive/My Drive/NLP/Sentimental analysis-/Dataset/training.csv",
    engine="python",
    names=cols,
    encoding="latin1")

test_data = pd.read_csv("/content/drive/My Drive/NLP/Sentimental analysis-/Dataset/testdata.csv",
    engine="python",
    names=cols,
    encoding="latin1")

In [4]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
test_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [6]:
print(train_data.info())
print(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   id         1600000 non-null  int64 
 2   date       1600000 non-null  object
 3   query      1600000 non-null  object
 4   user       1600000 non-null  object
 5   text       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  498 non-null    int64 
 1   id         498 non-null    int64 
 2   date       498 non-null    object
 3   query      498 non-null    object
 4   user       498 non-null    object
 5   text       498 non-null    object
dtypes: int64(2), object(4)
memory usage: 23.5+ KB
None


In [7]:
data=train_data

In [8]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [9]:
data

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


*Preprocessing*

In [10]:
#Cleaning 
def clean_tweet(tweet):
    #lowercase letters
    tweet=tweet.lower()
    #removing HTML tags
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [11]:
#test function... 
for i in range (5):
  print("Original Tweet: ",data.text[i])
  print("Clean Tweet: ",clean_tweet(data.text[i]))

Original Tweet:  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
Clean Tweet:   awww that's a bummer. you shoulda got david carr of third day to do it. d
Original Tweet:  is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
Clean Tweet:  is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!
Original Tweet:  @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
Clean Tweet:   i dived many times for the ball. managed to save the rest go out of bounds
Original Tweet:  my whole body feels itchy and like its on fire 
Clean Tweet:  my whole body feels itchy and like its on fire 
Original Tweet:  @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
Clean Tweet:   no it's not behaving at all. i'm mad. why am i here? bec

In [12]:
#mapping function..
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [13]:
data_labels = data.sentiment.values
# print(data_labels)
data_labels[data_labels == 4] = 1
print(data_labels)

[0 0 0 ... 1 1 1]


*Tokenization*

In [14]:
#init tokenizer
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16)

In [15]:
#mapping tokenizer
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [16]:
data_inputs[1]

[10,
 990,
 23,
 111,
 67,
 66505,
 15,
 676,
 185,
 699,
 129,
 2544,
 30,
 29,
 8,
 296,
 757,
 83,
 6,
 3117,
 182,
 80,
 2536,
 2,
 2266,
 66499]

***Padding***

In [17]:
#maximum sentence length for padding 
MAX_LEN = max([len(sentence) for sentence in data_inputs])
print(MAX_LEN)

72


In [18]:
#sequence padding
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [19]:
#checking padded data inputs 
data_inputs[1]

array([   10,   990,    23,   111,    67, 66505,    15,   676,   185,
         699,   129,  2544,    30,    29,     8,   296,   757,    83,
           6,  3117,   182,    80,  2536,     2,  2266, 66499,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [20]:
#splitting dataset to train/test
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [21]:
#checking diminsions 
print("train data diminsion: ",train_inputs.shape)
print("train labels diminsion: ",train_labels.shape)
print("test data diminsion: ",test_inputs.shape)
print("test labels diminsion: ",test_labels.shape)

train data diminsion:  (1584068, 72)
train labels diminsion:  (1584068,)
test data diminsion:  (16000, 72)
test labels diminsion:  (16000,)


Model **Building**

In [22]:
#model
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        #three layers of CONV 1D
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        #Defining last layer activation depending on number of classes exist
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

In [28]:
#Defining Model Parameters
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
BATCH_SIZE = 128
NB_EPOCHS = 1

In [29]:
#Training 
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [30]:
#Defining losses for binary/multi class labels 
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [31]:
#Defining checkpoint for the model
checkpoint_path = "/content/drive/My Drive/NLP/Sentimental analysis-"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [32]:
#Model Training
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)
ckpt_manager.save()



'/content/drive/My Drive/NLP/Sentimental analysis-/ckpt-2'

Evaluation

In [35]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print("Model losses: ",results[0])
print("Model acc:  ",results[1])

Model losses:  0.22106042504310608
Model acc:   0.9100000262260437


In [58]:
def evaluate(sentence):
  SENT_TOKENIZE=[tokenizer.encode(sentence)]
  SENT_TOKENIZE=np.array(SENT_TOKENIZE)
  return Dcnn(SENT_TOKENIZE, training=False).numpy()[0][0]

In [61]:
print(evaluate("hello ! so happy to meet you"))
print(evaluate("piss off! i dont wanna see you again!"))

0.9979235
0.08198029
