In [1]:
import numpy
import pandas
import re
import seaborn
import time
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
from matplotlib import pyplot
import nltk
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, Dataset
from typing import Any
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lolitav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
training_data_frame = pandas.read_csv('twitter_training.csv', names=['id','entity','label','tweet'])
testing_data_frame = pandas.read_csv('twitter_validation.csv', names=['id','entity','label','tweet'])

print('The shape of the training data frame: ' + str(training_data_frame.shape))
print('The shape of the testing data frame: ' + str(testing_data_frame.shape))

training_data_frame = training_data_frame[['label', 'tweet']]
testing_data_frame = testing_data_frame[['label', 'tweet']]

training_data_frame['tokens_count'] = training_data_frame['tweet'].apply(lambda text: len(str(text).split(' ')))
testing_data_frame['tokens_count'] = testing_data_frame['tweet'].apply(lambda text: len(str(text).split(' ')))

print('The description for the training data frame:')
print(training_data_frame.describe())
print('The description for the training data frame:')
print(testing_data_frame.describe())

The shape of the training data frame: (74682, 4)
The shape of the testing data frame: (1000, 4)
The description for the training data frame:
       tokens_count
count  74682.000000
mean      19.237340
std       14.502922
min        1.000000
25%        8.000000
50%       16.000000
75%       27.000000
max      198.000000
The description for the training data frame:
       tokens_count
count   1000.000000
mean      20.701000
std       13.354023
min        1.000000
25%       10.000000
50%       17.000000
75%       31.000000
max       57.000000


In [3]:
def add_padding_to_text(text, needed_length):
    result_text = numpy.zeros(needed_length, dtype=numpy.int64)

    if len(text) > needed_length:
        result_text[:] = text[:needed_length]
    else:
        result_text[:len(text)] = text

    return result_text

def clear_punctuation_signs(text):
    text = re.sub(r'[\'#$%&\'@_`~]', r'', text)
    text = re.sub(r'[!\\()*+,-./:;<=>?\[\]^{|}]', r' ', text)
    text = text.strip()
    text = text.replace('\n', ' ')

    return text

def filter_non_alphabetic_chars(text):
    alphabetic_text = ''

    for word in text.split():
        alphabetic_word = re.sub('[^a-z A-Z]+', '', word)
        alphabetic_text += alphabetic_word
        alphabetic_text += ' '

    alphabetic_text = alphabetic_text.strip()

    return alphabetic_text


def remove_repeated_chars(text):
    return re.sub(r'(\w)(\1{2,})', r'\1', text)

def remove_stopwords(text):
    stop_word = set(stopwords.words('english'))
    re_stop_words = re.compile(r'\b(' + '|'.join(stopwords.words('english')) + ')\\W', re.I)

    return re_stop_words.sub('', text)

def process_the_text(text):
    text = str(text).lower()
    text = clear_punctuation_signs(text)
    text = filter_non_alphabetic_chars(text)
    text = remove_repeated_chars(text)
    text = remove_stopwords(text)

    return text

In [4]:
training_data_frame = training_data_frame.loc[training_data_frame['tokens_count'] < 60]
testing_data_frame = testing_data_frame.loc[testing_data_frame['tokens_count'] < 60]

training_data_frame = training_data_frame.sample(n=74000)

print(training_data_frame.shape)
print(testing_data_frame.shape)

training_data_frame['tweet'] = training_data_frame['tweet'].apply(process_the_text)
testing_data_frame['tweet'] = testing_data_frame['tweet'].apply(process_the_text)

(74000, 3)
(1000, 3)


In [5]:
class Tweet_Vocabulary:
    
    def __init__(self, word_set):
        self.word2index = {}
        self.index2word = {}
        self.vocab = word_set
        self.create_index()
        
    def create_index(self):
        self.vocab = sorted(self.vocab)

        self.word2index['<padding>'] = 0
        self.index2word[0] = '<padding>'

        for index, word in enumerate(self.vocab):
            self.word2index[word] = index + 1
            self.index2word[index + 1] = word

In [6]:
all_words_set = set()

for row in training_data_frame['tweet']:
    all_words_set.update(str(row).split(' '))

for row in testing_data_frame['tweet']:
    all_words_set.update(str(row).split(' '))

tweet_vocab = Tweet_Vocabulary(all_words_set)
print('The number of unique words:')
print(len(tweet_vocab.vocab))

The number of unique words:
30338


In [7]:
training_data_tensor = [
    [
        tweet_vocab.word2index[word] for word in tweet.split(' ')
    ]
    for tweet in training_data_frame['tweet']
]

testing_data_tensor = [
    [
        tweet_vocab.word2index[word] for word in tweet.split(' ')
    ]
    for tweet in testing_data_frame['tweet']
]

max_length_training = max(len(x) for x in training_data_tensor)
max_length_testing = max(len(x) for x in testing_data_tensor)

training_data_tensor = [add_padding_to_text(text, max_length_training) for text in training_data_tensor]
testing_data_tensor = [add_padding_to_text(text, max_length_testing) for text in testing_data_tensor]

In [8]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

training_sentiment_array = numpy.array(training_data_frame['label'])
training_sentiment_array = training_sentiment_array.reshape(-1, 1)
training_target_tensor = one_hot_encoder.fit_transform(training_sentiment_array).toarray()

testing_sentiment_array = numpy.array(testing_data_frame['label'])
testing_sentiment_array = testing_sentiment_array.reshape(-1, 1)
testing_target_tensor = one_hot_encoder.fit_transform(testing_sentiment_array).toarray()

training_input = training_data_tensor
training_target = training_target_tensor
testing_input = testing_data_tensor
testing_target = testing_target_tensor

In [9]:
class Twitter_Dataset(Dataset):

    def __init__(self, data, target):
        self.data = data
        self.target = target
        self.length = [numpy.sum(1 - numpy.equal(i, 0)) for i in data]
        
    def __getitem__(self, index):
        data = self.data[index]
        target = self.target[index]
        data_length = self.length[index]
        
        return data, target, data_length
    
    def __len__(self):
        return len(self.data)


In [10]:
TRAINING_BUFFER_SIZE = len(training_input)
TESTING_BUFFER_SIZE = len(testing_input)
BATCH_SIZE = 64

TRAINING_BATCH_COUNT = TRAINING_BUFFER_SIZE // BATCH_SIZE
TESTING_BATCH_COUNT = TESTING_BUFFER_SIZE // BATCH_SIZE

training_dataset = Twitter_Dataset(training_input, training_target)
testing_dataset = Twitter_Dataset(testing_input, testing_target)

training_dataset = DataLoader(training_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)
testing_dataset = DataLoader(testing_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

embedding_dim = 256
units = 1024
vocabulary_size = len(tweet_vocab.word2index)
target_size = len(training_target_tensor[0])

In [11]:
target_size

4

## Test setup

In [14]:
def loss_function(y, prediction, criterion):
    target = torch.max(y, 1)[1]
    loss = criterion(prediction, target)

    return loss

def accuracy(target, prediction):
    target = torch.max(target, 1)[1]
    correct_count = (torch.max(prediction, 1)[1].data == target).sum()
    accuracy = 100. * correct_count / len(prediction)

    return accuracy

In [15]:
class GRU_Model(nn.Module):

    def __init__(self, param, vocab_size, embedding_dim, hidden_units, batch_size, output_size):
        super(GRU_Model, self).__init__()
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_units = param['n_unit']
        self.output_size = output_size

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.dropout = nn.Dropout(p=param['dropout'])
        self.gru = nn.GRU(self.embedding_dim, self.hidden_units)
        self.linear = nn.Linear(self.hidden_units, self.output_size)

    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_size, self.hidden_units)).to(device)

    def forward(self, input_text, device):
        input_text = self.embedding(input_text)
        self.hidden_state = self.initialize_hidden_state(device)
        output, self.hidden_state = self.gru(input_text, self.hidden_state)
        out = output[-1, :, :]
        out = self.dropout(out)
        out = self.linear(out)

        return out, self.hidden_state

    def _forward_unimplemented(self, *input_text: Any) -> None:
        pass
 
def train_and_evaluate(param, trial):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GRU_Model(param, vocabulary_size, param['embed_dim'], units, BATCH_SIZE, target_size)
    # model = GRU_Model(param, vocabulary_size, embedding_dim, units, BATCH_SIZE, target_size)

    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = getattr(optim, param['optimizer'])(model.parameters(), lr= param['learning_rate'])

    EPOCHS = param['n_epochs']

    for epoch in range(EPOCHS):

        start = time.time()
        total_loss, training_accuracy, validation_accuracy = 0, 0, 0

        for (batch, (batch_input, batch_target, batch_length)) in enumerate(training_dataset):
            loss = 0
            predictions, _ = model(batch_input.permute(1, 0).to(device), device)

            loss += loss_function(batch_target.to(device), predictions, criterion)
            batch_loss = (loss / int(batch_target.shape[1]))
            total_loss += batch_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_accuracy = accuracy(batch_target.to(device), predictions)
            training_accuracy += batch_accuracy

            # if batch % 100 == 0:
            #     print('Epoch {} | Batch {} | Training Loss {:.4f}'.format(epoch + 1, batch, batch_loss.cpu().detach().numpy()))

        # for (batch, (batch_input, batch_target, batch_length)) in enumerate(validation_dataset):
        #     predictions, _ = model(batch_input.permute(1, 0).to(device), device)
        #     batch_accuracy = accuracy(batch_target.to(device), predictions)
        #     validation_accuracy += batch_accuracy

        # print('Epoch {} | Loss {:.4f} | Training Acc. {:.4f} | Validation Acc. {:.4f}'.
        #     format(epoch + 1, 
        #             total_loss / TRAINING_BATCH_COUNT, 
        #             training_accuracy / TRAINING_BATCH_COUNT, 
        #             validation_accuracy / VALIDATION_BATCH_COUNT))

        # print('Time taken for the epoch #{} : {} sec\n'.format(epoch + 1, time.time() - start))
        y_raw = []
        all_predictions = []
        testing_accuracy = 0

        with torch.no_grad():
            for (batch, (batch_input, batch_target, batch_length)) in enumerate(testing_dataset):
                    
                predictions, _ = model(batch_input.permute(1, 0).to(device), device)
                batch_accuracy = accuracy(batch_target.to(device), predictions)
                testing_accuracy += batch_accuracy
                
                all_predictions = all_predictions + [i.item() for i in torch.max(predictions, 1)[1]]
                y_raw = y_raw + [y.item() for y in torch.max(batch_target, 1)[1]]
        acc = (testing_accuracy.cpu().detach().numpy() / TESTING_BATCH_COUNT)
        trial.report(acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    return acc
  
 # Define a set of hyperparameter values, build the model, train the model, and evaluate the accuracy 
def objective(trial):

     params = {
              'learning_rate': trial.suggest_float('learning_rate', 8e-4, 3e-3),
              'optimizer': trial.suggest_categorical("optimizer", ["Adam", "AdamW", "RMSprop"]),
              'embed_dim': trial.suggest_int("embed_dim", 128, 256),
              'n_unit': trial.suggest_int("n_unit", 512, 1024),
              'dropout': trial.suggest_float("dropout", 0.2, 0.6),
              'n_epochs': trial.suggest_int("n_epochs", 3, 8)
              }
    
     accuracy = train_and_evaluate(params, trial)     
     return accuracy

In [16]:
import pso_sampler
import harmony_sampler
import simulated_annealing
from optuna.samplers import RandomSampler
import imp
imp.reload(harmony_sampler)
imp.reload(pso_sampler)
from optuna.samplers import NSGAIISampler

  import imp


In [26]:
study = optuna.create_study(direction="maximize", sampler=harmony_sampler.HarmonySearchSampler(harmony_memory_size=8, max_iter_size=30), pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=30)

[32m[I 2022-11-27 15:02:30,619][0m A new study created in memory with name: no-name-6b6378c6-aa3d-4e6a-8bea-613953086b54[0m
[32m[I 2022-11-27 15:04:28,483][0m Trial 0 finished with value: 42.1875 and parameters: {'learning_rate': 0.002758367675107699, 'optimizer': 'Adam', 'embed_dim': 245, 'n_unit': 795, 'dropout': 0.22756855502173812, 'n_epochs': 7}. Best is trial 0 with value: 42.1875.[0m
[32m[I 2022-11-27 15:05:51,858][0m Trial 1 finished with value: 62.8125 and parameters: {'learning_rate': 0.0021491055752185877, 'optimizer': 'Adam', 'embed_dim': 133, 'n_unit': 613, 'dropout': 0.22636265383893325, 'n_epochs': 7}. Best is trial 1 with value: 62.8125.[0m
[32m[I 2022-11-27 15:08:20,733][0m Trial 2 finished with value: 41.354166666666664 and parameters: {'learning_rate': 0.002767452758784254, 'optimizer': 'Adam', 'embed_dim': 245, 'n_unit': 796, 'dropout': 0.3129503148407853, 'n_epochs': 8}. Best is trial 1 with value: 62.8125.[0m
[32m[I 2022-11-27 15:08:55,938][0m Trial 

In [27]:
print(study.best_trial.value) 

95.0


In [28]:
best_trial = study.best_trial

for key, value in best_trial.params.items():
    print("{}: {}".format(key, value))


# TPE  Best is trial 20 with value: 96.04166666666667
# learning_rate: 0.0015880201650297686
# optimizer: RMSprop
# embed_dim: 207
# n_unit: 946
# dropout: 0.29921070372441333
# n_epochs: 6

# NSGA Best is trial 12 one with value: 96.35416666666667
# learning_rate: 0.0014040990367376422
# optimizer: RMSprop
# embed_dim: 243
# n_unit: 865
# dropout: 0.23547689192876708
# n_epochs: 8


# Random Best is trial 6 with value: 95.52083333333333
# learning_rate: 0.0014895147959046798
# optimizer: RMSprop
# embed_dim: 169
# n_unit: 814
# dropout: 0.4548005138131388
# n_epochs: 8

# simulated annealing  Best is trial 1 with value: 95.0
# learning_rate: 0.0019648218967336026
# optimizer: RMSprop
# embed_dim: 244
# n_unit: 662
# dropout: 0.4011807127959925
# n_epochs: 5

# PSO 12 samples Best is trial 7 with value: 95.10416666666667.
# learning_rate: 0.0017161575292890664
# optimizer: RMSprop
# embed_dim: 172
# n_unit: 584
# dropout: 0.23473732724053883
# n_epochs: 3

# Harmony Best is trial 26 with value: 95.0
# learning_rate: 0.0017372727078855696
# optimizer: RMSprop
# embed_dim: 196
# n_unit: 533
# dropout: 0.33329331620615743
# n_epochs: 3

learning_rate: 0.0017372727078855696
optimizer: RMSprop
embed_dim: 196
n_unit: 533
dropout: 0.33329331620615743
n_epochs: 3


In [34]:
study.trials_dataframe().to_csv('gru_results/harmony_30_trials_11_25_w_pruning.csv', index=False)

## Tuning Visualization

In [30]:
# algo = 'tpe'
figure = optuna.visualization.plot_intermediate_values(study)
# figure.write_image(f'dnn_results/plots/{algo}_30_trials_w_pruning.png')
# tpe_30_trials_w_pruning_intermed_values
# nsga_30_trials_w_pruning_intermed_values
# pso_30_trials_w_pruning_intermed_values
# harmony_30_trials_w_pruning_intermed_values
# siman_30_trials_w_pruning_intermed_values
# rnd_30_trials_w_pruning_intermed_values

figure

In [31]:
optuna.visualization.plot_optimization_history(study)
# tpe_30_trials_w_pruning_opt_history
# nsga_30_trials_w_pruning_opt_history
# pso_30_trials_w_pruning_opt_history
# harmony_30_trials_w_pruning_opt_history
# siman_30_trials_w_pruning_opt_history
# rnd_30_trials_w_pruning_opt_history


In [32]:
optuna.visualization.plot_parallel_coordinate(study)
# tpe_30_trials_w_pruning_parall_coord
# nsga_30_trials_w_pruning_parall_coord
# pso_30_trials_w_pruning_parall_coord
# harmony_30_trials_w_pruning_parall_coord
# siman_30_trials_w_pruning_parall_coord
# rnd_30_trials_w_pruning_parall_coord


In [33]:
optuna.visualization.plot_param_importances(study)
# tpe_30_trials_w_pruning_param_imp
# nsga_30_trials_w_pruning_param_imp
# pso_30_trials_w_pruning_param_imp
# harmony_30_trials_w_pruning_param_imp
# siman_30_trials_w_pruning_param_imp
# rnd_30_trials_w_pruning_param_imp
