## ***Configuration***

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tabulate import tabulate

## ***Drive Connection***

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## ***Load Dataset***

In [None]:
# Column names for the CSV; the file is assumed to ship WITHOUT a header row
# (the names are supplied here by hand) — TODO confirm against the raw file.
COLUMN_NAMES = ['target', 'id', 'date', 'query-flag', 'user', 'tweet_text']

# Read the CSV from Drive. header=None stops pandas from consuming the first
# data row as column names, which would silently drop one tweet.
df = pd.read_csv(
    '/content/drive/My Drive/training.csv',
    encoding='latin-1',
    header=None,
    names=COLUMN_NAMES,
)

# Binarize the label: raw value 4 becomes 1 (positive), everything else becomes 0 (negative).
df['target'] = np.where(df['target'] == 4, 1, 0)

In [None]:

# Inspect the binarized target column (pandas prints a truncated view of a long Series).
print(df['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: target, Length: 1599999, dtype: int64


In [None]:
# Number of rows loaded into the DataFrame.
print(df.shape[0])

1599999


## ***Dataset Splitting***

In [None]:
# Hold out 20% of the rows, then halve that hold-out into validation and test,
# giving an 80% / 10% / 10% train / validation / test split overall.
train_df, test_validation_df = train_test_split(df, test_size=0.2, random_state=123)
validation_df, test_df = train_test_split(test_validation_df, test_size=0.5, random_state=123)

# Number of (tweet, label) pairs per batch.
BATCH_SIZE = 64

# Wrap the training and testing frames as batched, prefetched tf.data pipelines
# yielding (tweet_text, target) pairs.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_df['tweet_text'].values, train_df['target'].values))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((test_df['tweet_text'].values, test_df['target'].values))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Show the element specs of both pipelines, one per line.
print(train_dataset, test_dataset, sep='\n')

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [None]:
# Report how many rows landed in each split (expected ratio is 80/10/10).
print(f"Training : {len(train_df)}")
print(f"Validation : {len(validation_df)}")
print(f"Testing : {len(test_df)}")

Training : 1279999
Validation : 160000
Testing : 160000


In [None]:
# Inspect the labels of the training split; the shuffled index shows rows were sampled at random.
print(train_df['target'])

1559756    1
51568      0
569210     0
71896      0
405089     0
          ..
1241052    1
1066306    1
28030      0
277869     0
773630     0
Name: target, Length: 1279999, dtype: int64


In [None]:
# Preview a handful of (tweet, label) pairs. Only a few rows are shown on
# purpose: walking every row of a frame this large with iterrows() is very
# slow and would flood the cell output.
PREVIEW_ROWS = 5

preview = df[['tweet_text', 'target']].head(PREVIEW_ROWS)
for text, label in preview.itertuples(index=False, name=None):
    print('text: ', text)
    print('label: ', label)

## ***Preprocess***

In [None]:
# Text-vectorization hyperparameters.
max_words = 10_000   # upper bound on the vocabulary size
max_length = 100     # number of token ids kept per tweet

In [None]:
# Text-to-integer vectorizer: vocabulary capped at max_words tokens, and each
# tweet emitted as a fixed-length sequence of max_length token ids.
encoder = TextVectorization(
    output_mode="int",
    max_tokens=max_words,
    output_sequence_length=max_length,
)

# Learn the vocabulary from the training tweets only.
training_tweets = train_df["tweet_text"].values
encoder.adapt(training_tweets)

In [None]:
# Map every vocabulary token to its integer id, i.e. its position in the
# encoder's vocabulary list.
vocabulary = encoder.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

### **RNN**

In [None]:
# RNN sentiment classifier:
# raw text -> token ids -> embeddings -> SimpleRNN -> dense head.
vocab_size = len(encoder.get_vocabulary())

model = tf.keras.Sequential([
    encoder,
    # mask_zero=True lets downstream layers ignore the padding id 0, so tweets
    # of different lengths are handled correctly.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
    tf.keras.layers.SimpleRNN(32),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single unit with no activation: the model outputs a raw logit.
    tf.keras.layers.Dense(1),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the RNN model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 100, 64)           640000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                3104      
                                                                 
 dense_2 (Dense)             (None, 64)                2112      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 645,281
Trainable params: 645,281
Non-trainable params: 0
________________________________________________

In [None]:
# The model emits raw logits, so the loss applies the sigmoid itself
# (from_logits=True). Adam runs with a learning rate of 1e-4.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Validate on the held-out validation split rather than the test split, so the
# test set stays untouched until the final evaluation.
validation_dataset = tf.data.Dataset.from_tensor_slices((
    validation_df['tweet_text'].values,
    validation_df['target'].values
)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Train the RNN model for 5 epochs.
history = model.fit(
    train_dataset,
    epochs=5,
    validation_data=validation_dataset,
    # Score only 30 validation batches per epoch to keep each epoch fast.
    validation_steps=30
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### **LSTM**

In [None]:
# LSTM sentiment classifier:
# raw text -> token ids -> embeddings -> LSTM -> dense head.
lstm_vocab_size = len(encoder.get_vocabulary())

model2 = tf.keras.Sequential([
    encoder,
    # mask_zero=True lets downstream layers ignore the padding id 0, so tweets
    # of different lengths are handled correctly.
    tf.keras.layers.Embedding(input_dim=lstm_vocab_size, output_dim=64, mask_zero=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single unit with no activation: the model outputs a raw logit.
    tf.keras.layers.Dense(1),
])

In [None]:
# The model emits raw logits, so the loss applies the sigmoid itself
# (from_logits=True). Adam runs with a learning rate of 1e-4.
model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Validate on the held-out validation split rather than the test split, so the
# test set stays untouched until the final evaluation.
validation_dataset = tf.data.Dataset.from_tensor_slices((
    validation_df['tweet_text'].values,
    validation_df['target'].values
)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Train the LSTM model for 5 epochs.
# NOTE: this rebinds `history`, replacing any earlier training history stored under that name.
history = model2.fit(
    train_dataset,
    epochs=5,
    validation_data=validation_dataset,
    # Score only 30 validation batches per epoch to keep each epoch fast.
    validation_steps=30
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## ***Prediction***

In [None]:
# Score the RNN model on the full test set and report its loss and accuracy.
test_loss1, test_acc1 = model.evaluate(test_dataset)
print("RNN model:")
print(f"Test Loss: {test_loss1}")
print(f"Test Accuracy: {test_acc1}")

# Same evaluation for the LSTM model.
test_loss2, test_acc2 = model2.evaluate(test_dataset)
print("LSTM model:")
print(f"Test Loss: {test_loss2}")
print(f"Test Accuracy: {test_acc2}")



RNN model:
Test Loss: 0.4155820310115814
Test Accuracy: 0.7963937520980835
LSTM model:
Test Loss: 0.3971339464187622
Test Accuracy: 0.8052999973297119


In [None]:
# Side-by-side comparison of the two models' test metrics.
headers = ["Model", "Test Loss", "Test Accuracy"]
table = [
    ["RNN Model", test_loss1, test_acc1],
    ["LSTM Model", test_loss2, test_acc2],
]

print(tabulate(table, headers=headers))

Model         Test Loss    Test Accuracy
----------  -----------  ---------------
RNN Model      0.415582         0.796394
LSTM Model     0.397134         0.8053
