In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 1.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import sys, os, time
import tensorflow as tf
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from transformers import BertTokenizerFast

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on a runtime without an accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data paths used by the
# following cells live underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: list the project folder on Drive to confirm the mount worked
# and the expected inputs are present.
!ls /content/drive/MyDrive/nns

lstm  saved_tensors  training.1600000.processed.noemoticon.csv


In [None]:
# Restore the sequence/label tensors for each split from the Drive folder.
# NOTE(review): torch.load unpickles the file, so only point this at files you trust.
# NOTE(review): later cells assign these same names again from the CSV pipeline —
# confirm which source is meant to feed training.
load_dir = '/content/drive/MyDrive/nns/saved_tensors/'


def _load_saved(filename):
    """Return the object stored in ``load_dir + filename`` via torch.load."""
    return torch.load(load_dir+filename)


train_seq, train_y = _load_saved('train_seq.pt'), _load_saved('train_y.pt')
val_seq, val_y = _load_saved('val_seq.pt'), _load_saved('val_y.pt')
test_seq, test_y = _load_saved('test_seq.pt'), _load_saved('test_y.pt')

In [None]:
# Seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 2022

# Read the tweet CSV and keep only the text and label columns. The .copy() detaches
# the two-column frame from the frame it was sliced from, so the label rewrite below
# is a plain assignment and does not raise SettingWithCopyWarning.
df = pd.read_csv("/content/drive/MyDrive/nns/training.1600000.processed.noemoticon.csv", encoding = "latin-1", low_memory=False)
df = df[['text', 'target']].copy()

# Collapse label value 4 to 1 so the target is suitable for a 0/1 classifier.
# NOTE(review): presumably 4 marks the positive class in this file — confirm.
df['target'] = df['target'].replace(4, 1)

# 70% train, then the remaining 30% is halved into validation and test (15% each);
# both splits are stratified on the label.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'], df['target'],
    random_state=SPLIT_SEED, test_size=0.3, stratify=df['target'])
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels,
    random_state=SPLIT_SEED, test_size=0.5, stratify=temp_labels)

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Whitespace-separated word count of every training text; the largest count is
# used as the padding/truncation length when tokenizing.
seq_len = [len(text.split()) for text in train_text]
max_seq_len = max(seq_len)


In [None]:
def encode_texts(texts, tokenizer, max_length):
    """Tokenize a collection of texts into fixed-length id sequences.

    Args:
        texts: object exposing ``.tolist()`` that yields the raw strings
            (the split Series produced by train_test_split).
        tokenizer: tokenizer providing ``batch_encode_plus``.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer's batch encoding; later cells read its ``'input_ids'``
        and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        # padding='max_length' is the supported spelling of the deprecated
        # pad_to_max_length=True and pads every sequence to max_length.
        padding='max_length',
        truncation=True,
    )


# tokenize and encode sequences in the training, validation and test sets
tokens_train = encode_texts(train_text, tokenizer, max_seq_len)
tokens_val = encode_texts(val_text, tokenizer, max_seq_len)
tokens_test = encode_texts(test_text, tokenizer, max_seq_len)



In [None]:
def _encoding_to_torch(encoding, labels):
    """Return (input ids, attention mask, labels) as torch tensors for one split."""
    ids = torch.tensor(encoding['input_ids'])
    mask = torch.tensor(encoding['attention_mask'])
    targets = torch.tensor(labels.tolist())
    return ids, mask, targets


# Turn the tokenizer output and label Series of each split into torch tensors.
train_seq, train_mask, train_y = _encoding_to_torch(tokens_train, train_labels)
val_seq, val_mask, val_y = _encoding_to_torch(tokens_val, val_labels)
test_seq, test_mask, test_y = _encoding_to_torch(tokens_test, test_labels)

In [None]:
def _as_tf_float32(values):
    """Build a torch tensor from ``values`` and cast it to a float32 TensorFlow tensor."""
    return tf.cast(torch.tensor(values), tf.float32)


# Rebind every split's ids, mask and labels as float32 TensorFlow tensors for the Keras model.
# NOTE(review): token ids are stored as floats here; confirm the model's first layer
# is meant to receive float ids rather than integers.
train_seq = _as_tf_float32(tokens_train['input_ids'])
train_mask = _as_tf_float32(tokens_train['attention_mask'])
train_y = _as_tf_float32(train_labels.tolist())

val_seq = _as_tf_float32(tokens_val['input_ids'])
val_mask = _as_tf_float32(tokens_val['attention_mask'])
val_y = _as_tf_float32(val_labels.tolist())

test_seq = _as_tf_float32(tokens_test['input_ids'])
test_mask = _as_tf_float32(tokens_test['attention_mask'])
test_y = _as_tf_float32(test_labels.tolist())

In [None]:
#NN Hyper-Parameters
# Tunable settings read by the model-construction and training cells below.
num_epochs: int = 10  # number of passes made by the training loop
batch_size: int = 32  # passed to LSTMClassifier, which uses it to size the initial LSTM state
learning_rate: float = 1e-3  # step size handed to the Adam optimizer
# NOTE(review): this rebinds max_seq_len, which an earlier cell computed from the
# training text and used as the tokenizer padding length — confirm 35 is intended.
max_seq_len: int = 35
embedding_dim: int = 512  # width of each embedding vector
lstm_units: int = 256  # size of the LSTM hidden/cell state

In [None]:
#Model create lstm
class LSTMClassifier(tf.keras.Model):
    """Embedding -> LSTM -> sigmoid head that scores each input sequence once.

    Args:
        lstm_units: size of the LSTM hidden and cell state.
        max_seq_len: stored on the instance for reference; not used by the layers.
        embedding_dim: width of each token embedding.
        batch_size: default batch dimension used by ``init_hidden_state``.
        vocab_size: number of distinct token ids the embedding table must cover.
            Defaults to 30522, the vocabulary size of ``bert-base-uncased``,
            the tokenizer used in this notebook.
    """

    def __init__(self, lstm_units, max_seq_len, embedding_dim, batch_size, vocab_size=30522):
        super().__init__()
        self.lstm_units = lstm_units
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        # The embedding table is indexed by token id, so its first dimension must be
        # the vocabulary size (not the sequence length), or most ids fall out of range.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Only the final state is needed for classification, so the per-step
        # sequence output is not requested.
        # NOTE(review): padded positions are not masked before the LSTM; consider
        # masking if id 0 is the padding token — confirm against the tokenizer.
        self.lstm = tf.keras.layers.LSTM(self.lstm_units, return_sequences=False, return_state=True)
        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, hidden=None):
        """Run a forward pass.

        Args:
            inputs: batch of token-id sequences (assumed shape (batch, seq_len)).
            hidden: optional ``(state_h, state_c)`` initial LSTM state; when None
                the LSTM starts from its default zero state for the actual batch size.

        Returns:
            ``(output, state_h, state_c)`` where ``output`` holds one sigmoid
            probability per sequence, shape (batch, 1), computed from the final
            hidden state so it lines up with one label per example.
        """
        x = self.embedding(inputs)
        _, state_h, state_c = self.lstm(x, initial_state=hidden)
        output = self.dense(state_h)
        return output, state_h, state_c

    def init_hidden_state(self, batch_size=None):
        """Return a zero ``(state_h, state_c)`` pair.

        Args:
            batch_size: rows in the state tensors; defaults to the batch size
                given to the constructor.
        """
        rows = self.batch_size if batch_size is None else batch_size
        return (tf.zeros((rows, self.lstm_units)), tf.zeros((rows, self.lstm_units)))

In [None]:
# Build the classifier, its optimizer and loss, and the running metrics
# that the train/validation step functions update.
model = LSTMClassifier(
    lstm_units=lstm_units,
    max_seq_len=max_seq_len,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_object = tf.keras.losses.BinaryCrossentropy()

# Training-side running averages.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

# Validation-side running averages.
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')


In [None]:
# One optimisation step on a single batch.
@tf.function
def train_step(model, inputs, labels, hidden):
    """Run forward and backward passes for one batch and update the training metrics.

    The loss from ``loss_object`` is differentiated with respect to the model's
    trainable variables, the gradients are applied through ``optimizer``, and
    ``train_loss`` / ``train_accuracy`` are updated. Nothing is returned.
    """
    with tf.GradientTape() as grad_tape:
        model_outputs = model(inputs, hidden)
        predictions = model_outputs[0]  # the returned states are not needed here
        loss = loss_object(labels, predictions)

    weights = model.trainable_variables
    grads = grad_tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_loss.update_state(loss)
    train_accuracy.update_state(labels, predictions)

# One evaluation step on a single batch (no gradient update).
@tf.function
def val_step(model, inputs, labels, hidden):
    """Score one batch and fold its loss and accuracy into the validation metrics.

    Only ``val_loss`` and ``val_accuracy`` are updated; the model weights are
    left untouched and nothing is returned.
    """
    model_outputs = model(inputs, hidden)
    predictions = model_outputs[0]  # the returned states are not needed here
    batch_loss = loss_object(labels, predictions)
    val_loss.update_state(batch_loss)
    val_accuracy.update_state(labels, predictions)

In [None]:
# train the model
LOG_EVERY = 1000  # batches between progress lines; the last batch of each phase is always reported

# The sequence and label tensors are paired and batched here so that each
# iteration yields an (inputs, labels) batch. drop_remainder=True keeps every
# batch at exactly batch_size rows, matching the fixed-size state returned by
# model.init_hidden_state().
train_ds = (
    tf.data.Dataset.from_tensor_slices((train_seq, train_y))
    .batch(batch_size, drop_remainder=True)
)
val_ds = (
    tf.data.Dataset.from_tensor_slices((val_seq, val_y))
    .batch(batch_size, drop_remainder=True)
)

template = 'Epoch {}, Batch {}, Loss: {}, Accuracy: {}'


def _report(epoch, batch, loss_metric, accuracy_metric):
    """Print one progress line from the running loss and accuracy metrics."""
    print(template.format(epoch+1,
                          batch,
                          float(loss_metric.result()),
                          float(accuracy_metric.result())*100))


for epoch in range(num_epochs):
    # Keras metrics keep accumulating until reset; clear them so each epoch
    # reports its own averages instead of a running mix of all epochs so far.
    for metric in (train_loss, train_accuracy, val_loss, val_accuracy):
        metric.reset_state()

    # Training pass: every batch starts from the same zero initial state.
    hidden = model.init_hidden_state()
    batch = -1
    for batch, (inputs, labels) in enumerate(train_ds):
        train_step(model, inputs, labels, hidden)
        if batch % LOG_EVERY == 0:
            _report(epoch, batch, train_loss, train_accuracy)
    if batch >= 0 and batch % LOG_EVERY != 0:
        _report(epoch, batch, train_loss, train_accuracy)

    # Validation pass over the held-out split.
    hidden = model.init_hidden_state()
    batch = -1
    for batch, (inputs, labels) in enumerate(val_ds):
        val_step(model, inputs, labels, hidden)
        if batch % LOG_EVERY == 0:
            _report(epoch, batch, val_loss, val_accuracy)
    if batch >= 0 and batch % LOG_EVERY != 0:
        _report(epoch, batch, val_loss, val_accuracy)