In [None]:
!pip install wandb -qU

In [None]:
import wandb

# Never commit an API key to a notebook. With no explicit key, wandb.login()
# picks the key up from the WANDB_API_KEY environment variable (or a previous
# `wandb login`) and otherwise prompts for it interactively.
# NOTE(review): a key was previously hardcoded here; it should be revoked.
wandb.login()

# Config testing

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import pandas as pd
import torchvision
from torchvision import datasets, models, transforms
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random
import pickle
import os
import copy
import sys
from tqdm import tqdm
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, precision_score, recall_score, roc_curve, auc
import wandb

In [None]:
# Hyper-parameters recorded with the run.
# NOTE(review): the key used here is "epoch", whereas run_model() reads
# "epoch_num" from the run config - confirm which name is intended.
config = dict(lr=1e-3, weight_decay=1e-4, epoch=100)
tags = ["new_data", "10percent", "BERT-Pretrained"]

# Start the W&B run that the following cells log into.
run = wandb.init(
    entity='sentimovie',
    project="Bert-finetune",
    config=config,
    tags=tags,
)

In [None]:
#######################################
###           wandb logging         ###
#######################################
##           Log Some data           ##
#######################################
# Two example reviews and their labels, logged to the active run.
sample_reviews = [
    "I think it was an interesting and thought-provoking movie.",
    "It was a silly and silly movie.",
]
sample_labels = ["Fresh", "Rotten"]
tmp = {"sample_data": sample_reviews, "sample_label": sample_labels}
wandb.log(tmp)
#######################################
#######################################

In [None]:
#######################################
###           wandb logging         ###
#######################################
##            Log data size          ##
#######################################
# dataset_sizes (as built by dataset_loader and consumed by train_model) maps
# each split name to its sample count, i.e. a plain int. The counts are
# therefore logged directly; wrapping them in len() raises TypeError.
tmp = {
    "train_size": dataset_sizes['train'],
    "val_size": dataset_sizes['val'],
    "test_size": dataset_sizes['test'],
    }
wandb.log(tmp)
#######################################
#######################################

In [None]:
# Read the learning rate back from the active run's config to check it was recorded.
wandb.config['lr']

0.001

In [None]:
def deflog(text):
    """Print `text` to stdout, prefixed with the "[DEF-LOG] " marker."""
    message = "[DEF-LOG] {}".format(text)
    print(message)

In [None]:
def train_model(model, criterion, optimizer, scheduler, dataloaders, 
                dataset_sizes, device, config, earlyStopper, num_epochs=25):
    """Train `model` with per-epoch validation and return it with its best weights.

    Each epoch runs a 'train' phase (gradient updates, then one
    `scheduler.step()`) followed by a 'val' phase (no gradients). Per-phase
    loss and accuracy are appended to the history and logged to wandb. The
    weights with the highest validation accuracy are restored before returning.

    Args:
        model: network called as `model(inputs)`; its output is passed to
            `criterion` and arg-maxed over dim 1 to obtain predictions.
        criterion: loss called as `criterion(outputs, labels)`.
        optimizer: optimizer stepping `model`'s parameters.
        scheduler: LR scheduler, stepped once per epoch after the train phase.
        dataloaders: mapping with 'train' and 'val' iterables yielding
            `(inputs, labels)` batches.
        dataset_sizes: mapping with the number of samples in 'train' and 'val'.
        device: device the batches are moved to.
        config: kept for interface compatibility; no longer read here because
            the logged learning rate is taken from the optimizer.
        earlyStopper: object whose `early_stop(val_loss)` returns True when
            training should stop.
        num_epochs: maximum number of epochs.

    Returns:
        (model, history) where history maps 'train_loss', 'val_loss',
        'train_acc' and 'val_acc' to lists of per-epoch Python floats.
    """
    since = time.time()
    history = {'train_loss': [], 'val_loss': [],
               'train_acc': [], 'val_acc': []}
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in tqdm(range(num_epochs)):
        # Read the learning rate before scheduler.step() so the logged value is
        # the one actually used during this epoch (first param group).
        current_lr = optimizer.param_groups[0]['lr']

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            # Plain Python int: keeps the accuracy off the device and makes an
            # empty loader yield 0 instead of failing on a tensor-only method.
            running_corrects = 0
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                if is_train:
                    # Gradients are only produced (and consumed) while training.
                    optimizer.zero_grad()
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean, so weight it by the batch size.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += int(torch.sum(preds == labels).item())
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            history[f'{phase}_loss'].append(epoch_loss)
            history[f'{phase}_acc'].append(epoch_acc)

            #######################################
            ###           wandb logging         ###
            #######################################
            ##        Log the training flow       ##
            #######################################
            tmp = {f'{phase}_loss': epoch_loss, 
                    f'{phase}_acc': epoch_acc,
                    'lr': current_lr,
                    'epoch': epoch}
            wandb.log(tmp)
            #######################################
            #######################################

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        
        if earlyStopper.early_stop(history['val_loss'][-1]):
            deflog("Early Stopping")
            break

    time_elapsed = time.time() - since
    deflog(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    deflog(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, history

In [None]:
def save_results(model, history, base_path="output"):
    """Persist the training history and the model weights under `base_path`.

    Writes `<base_path>/history.pkl` (pickled `history`) and
    `<base_path>/model.pt` (the model's `state_dict`). The directory is
    created first if it does not exist, so a fresh output location no longer
    fails with FileNotFoundError.

    Args:
        model: module whose `state_dict()` is saved with `torch.save`.
        history: picklable object holding the training history.
        base_path: directory that receives both files.
    """
    import os  # local import keeps this block self-contained

    if base_path:
        os.makedirs(base_path, exist_ok=True)

    history_file_path = f"{base_path}/history.pkl"
    with open(history_file_path, 'wb') as handle:
        pickle.dump(history, handle, protocol=pickle.HIGHEST_PROTOCOL)
    deflog(f'history has been saved in "{history_file_path}"')

    model_file_path = f"{base_path}/model.pt"
    torch.save(model.state_dict(), model_file_path)
    deflog(f'model has been saved in "{model_file_path}"')

In [None]:
def testing(model, dataloader, device):
    """Run `model` over `dataloader` in eval mode and collect predictions.

    Args:
        model: network called as `model(inputs)`; predictions are the arg-max
            over dim 1 of its output.
        dataloader: iterable yielding `(inputs, labels)` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        (y_pred, y_true): two flat lists with the predicted and the true
        label of every sample, in loader order.
    """
    predicted, expected = [], []
    model.eval()
    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(dataloader):
            batch_outputs = model(batch_inputs.to(device))
            batch_preds = torch.max(batch_outputs, 1)[1]
            expected.extend(batch_labels.cpu().numpy())
            predicted.extend(batch_preds.cpu().numpy())
    return predicted, expected

In [None]:
# Evaluate the trained model on the held-out test split.
y_pred, y_true = testing(model, dataloaders['test'], device)

# Classification metrics (macro-averaged where an average is required).
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')

# NOTE(review): roc_curve receives the hard class predictions returned by
# testing(), not probability scores, so the resulting AUC is based on a
# single operating point - confirm this is intended.
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
AUC = auc(fpr, tpr)
deflog("Metric calculated")

# NOTE(review): the class names below do not match the "Fresh"/"Rotten"
# sample labels logged earlier in this notebook - verify.
class_names = ['NORM', 'TUM']
wandb.sklearn.plot_confusion_matrix(y_true, y_pred, class_names)
deflog("Send confusion matrix")

tmp = dict(
    test_acc=acc,
    test_f1=f1,
    test_auc=AUC,
    test_precision=precision,
    test_recall=recall,
)
wandb.log(tmp)
deflog("Run finished")

# Hyper-parameter testing

In [None]:
import numpy as np
import pandas as pd
import pickle


class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    A call to `early_stop` that sees a loss strictly below the best one so far
    records it and resets the counter; any other call increments the counter.
    Stopping is requested once the counter reaches `patience`.

    NOTE(review): `min_delta` is stored but does not influence the decision -
    every loss that is not a strict improvement counts against `patience`,
    whether or not it lies within `min_delta` of the best loss. Confirm
    whether a tolerance was intended.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Update the tracker with `validation_loss`; return True to stop."""
        improved = validation_loss < self.min_validation_loss
        if improved:
            # New best loss: remember it and start counting afresh.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        self.counter += 1
        if self.counter >= self.patience:
            return True
        return False


class myFC(nn.Module):
    """Three-layer fully connected classifier head.

    `forward` returns raw, unnormalized class scores (logits). The training
    code in this file uses `nn.CrossEntropyLoss`, which applies log-softmax
    itself; feeding it already-softmaxed outputs squashes the loss and its
    gradients. Arg-max predictions are unaffected because softmax preserves
    the ordering of the scores. Use `predict_proba` when probabilities are
    needed.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: sizes of the two hidden layers (any 2-element sequence).
        class_num: number of output classes.
    """

    def __init__(self, input_dim=1024, hidden_dim=(2048, 2048), class_num=2):
        super(myFC, self).__init__()
        self.fc1 = torch.nn.Sequential(
            nn.Linear(input_dim, hidden_dim[0]),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim[0], hidden_dim[1]),
            nn.LeakyReLU(),
            torch.nn.Linear(hidden_dim[1], class_num)
        )
        # Parameter-free; kept as an attribute and used by predict_proba.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits for a batch `x` of input vectors."""
        return self.fc1(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return self.softmax(self.forward(x))


class MyDataset(Dataset):
    """Dataset pairing pre-computed input vectors with integer class ids.

    The raw labels are mapped to integer ids via `np.unique`, i.e. each label
    is replaced by its position among the sorted unique label values.
    """

    def __init__(self, X_data, labels):
        self.embeddings = X_data
        _, class_ids = np.unique(labels, return_inverse=True)
        self.labels = class_ids

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return `(input tensor, class-id tensor)` for sample `idx`."""
        features = torch.tensor(self.embeddings[idx])
        class_id = torch.tensor(self.labels[idx])
        return features, class_id



def read_data_movies(input_path):
    """Load the train/val/test movie-review CSV files from `input_path`.

    Each split is read from its own file (`train.csv`, `val.csv`,
    `test.csv`). Previously all three splits were loaded from `train.csv`.

    Args:
        input_path: directory containing the three CSV files.

    Returns:
        dict mapping 'train', 'val' and 'test' to a DataFrame restricted to
        the columns 'movie_title', 'review_type', 'review_content' and
        'my_score'.
    """
    columns = ['movie_title', 'review_type', 'review_content', 'my_score']

    def read_movie(path):
        # Keep only the columns used downstream.
        d = pd.read_csv(path)
        return d[columns]

    return {split: read_movie(f"{input_path}/{split}.csv")
            for split in ('train', 'val', 'test')}


def read_data_reviews(input_path, load_augmented_train_set):
    """Unpickle the train/val/test review data stored under `input_path`.

    Args:
        input_path: directory holding `train.pkl`, `train_aug.pkl`, `val.pkl`
            and `test.pkl`.
        load_augmented_train_set: when truthy, the train split is read from
            `train_aug.pkl` instead of `train.pkl`.

    Returns:
        dict mapping 'train', 'val' and 'test' to the unpickled objects.
    """
    train_file = "train_aug.pkl" if load_augmented_train_set else "train.pkl"
    split_files = (("train", train_file), ("val", "val.pkl"), ("test", "test.pkl"))

    reviews_raw_data = {}
    for split, file_name in split_files:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(f"{input_path}/{file_name}", 'rb') as f:
            reviews_raw_data[split] = pickle.load(f)
    return reviews_raw_data


def dataset_loader(data, config):
    """Build datasets, dataloaders and split sizes for train/val/test.

    The training loader is shuffled every epoch so mini-batches are not drawn
    in the fixed on-disk order; the validation and test loaders keep their
    order.

    Args:
        data: mapping with 'train', 'val' and 'test' entries, each providing
            the inputs under 'x' and the labels under 'y'.
        config: mapping providing the batch size under 'batch_size_l'.

    Returns:
        (dataloaders, dataset, dataset_sizes): three dicts keyed by split
        name holding the DataLoader, the MyDataset and the number of samples.
    """
    splits = ['train', 'val', 'test']

    dataset = {x: MyDataset(data[x]['x'], data[x]['y']) for x in splits}
    deflog("Datasets created")

    batch_size = config['batch_size_l']
    dataloaders = {x: DataLoader(dataset[x], batch_size=batch_size,
                                 shuffle=(x == 'train'),
                                 num_workers=2) for x in splits}
    deflog("Dataloaders created")
    dataset_sizes = {x: len(dataset[x]) for x in splits}
    deflog("Dataset_sizes created")
    return dataloaders, dataset, dataset_sizes



def run_model(config=None):
    """Run one training experiment inside a W&B run (used as the sweep target).

    Reads the data, builds the loaders, model, class-weighted loss, Adam
    optimizer and cosine LR schedule from the run's configuration, trains the
    model and saves the results.

    Args:
        config: optional configuration forwarded to `wandb.init`. When this
            function is invoked by `wandb.agent` it is called without
            arguments, so `config` is None; every setting is therefore read
            from `wandb.config`, which holds the resolved run configuration.
    """
    global ex_num, total_ex_num, proj_name, tags
    with wandb.init(entity='sentimovie', project=proj_name, config=config, tags=tags):
        # Resolved configuration of the active run (sweep values included).
        run_config = wandb.config

        print("="*10)
        ex_num += 1
        deflog(f"[Ex. {ex_num}/{total_ex_num}]")
        print("="*10)
        
        # read data
        data = read_data_reviews(run_config['input_path'], 
                                 run_config['load_augmented_train_set'])
        # dataset and dataloader
        # Pass the run config, not the `config` argument, which may be None.
        dataloaders, dataset, dataset_sizes = dataset_loader(data, run_config)

        # model
        model = myFC(class_num=2)
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.to(device)
        deflog(f"ML model defined (device: {device})")

        earlyStopper = EarlyStopper(patience=10, min_delta=1e-2)
        deflog("EarlyStopper defined")

        # Weight the loss inversely to class frequency in the training labels.
        class_weights = compute_class_weight('balanced', 
                                            classes=np.unique(data['train']['y']), 
                                            y=np.array(data['train']['y']))
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        deflog("Define loss class_weights")

        optimizer = optim.Adam(model.parameters(), lr=run_config['lr'], weight_decay=run_config['weight_decay'])
        deflog("Optimizer (Adam) defined")

        cosin_lr_scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 
                                                            T_max=run_config['epoch_num'], 
                                                            eta_min=5e-6)
        deflog("Scheduler (cosin_lr_scheduler) defined")

        model, history = train_model(model, criterion, optimizer, cosin_lr_scheduler, 
                                     dataloaders, dataset_sizes, device, run_config,
                                     earlyStopper, run_config['epoch_num'])
        
        # NOTE(review): every run writes to the same output_path, so later
        # runs of a sweep overwrite earlier results - confirm this is intended.
        save_results(model, history, base_path=f"{run_config['output_path']}")


def main(input_path, output_path, batch_size=64):
    """Create a random-search W&B sweep and run `total_ex_num` experiments.

    The sweep maximizes the logged 'val_acc' metric while sampling the
    learning rate and weight decay from {1e-2, ..., 1e-6}; the remaining
    settings are fixed.

    Args:
        input_path: directory with the pickled review data; stored in the
            run config for `run_model`.
        output_path: directory where results are saved; stored in the run
            config for `run_model`.
        batch_size: value published as 'batch_size_l', the key the data
            loaders read. It was missing from the sweep, which made every run
            fail with a KeyError. The default of 64 is a placeholder -
            TODO confirm the intended batch size.
    """
    global ex_num, total_ex_num, proj_name
    print("Main method running ...")
    
    sweep_config = {'method': 'random'}
    # sweep_config = {'method': 'grid'}
    # sweep_config = {'method': 'bayes'}

    metric = {'name': 'val_acc', 
            'goal': 'maximize'}

    sweep_config['metric'] = metric

    hyper_parameters = {
        'lr': { 'values': [float(f"1e-{i}") for i in range(2,7)]},
        'weight_decay': { 'values': [float(f"1e-{i}") for i in range(2,7)]},
        'load_augmented_train_set': {'value': True},
        'epoch_num': {'value': 60},
        'batch_size_l': {'value': batch_size},
        'input_path': {'value': input_path},
        'output_path': {'value': output_path}
    }
    sweep_config['parameters'] = hyper_parameters
    # NOTE(review): the sweep is created without an entity while run_model
    # initializes runs with entity='sentimovie' - confirm both target the
    # same workspace.
    sweep_id = wandb.sweep(sweep_config, project=proj_name, )
    wandb.agent(sweep_id, run_model, count=total_ex_num)


# Module-level state shared with run_model() and main() through `global`.
proj_name = "BERT"               # W&B project used for the sweep and its runs
tags = ["Sample1", "Sample2"]    # tags attached to every run
total_ex_num = 30                # number of runs the sweep agent executes
ex_num = hp_num = 0              # ex_num counts started runs; hp_num is not read in this file

if __name__ == "__main__":
    # argv[1]: directory to switch into; argv[2] / argv[3]: input and output
    # paths forwarded to main().
    working_dir = sys.argv[1]
    os.chdir(working_dir)
    main(sys.argv[2], sys.argv[3])