In [1]:
from torch.optim import Adam
from tqdm import tqdm
from torch import nn
from transformers import BertModel
import torch
import numpy as np
from transformers import BertTokenizer
import os
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Device used by the cells below: "cuda" when torch reports an available GPU, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Global Variables

In [None]:
# Sequence length every sample is padded to, and the tokenizer shared by the cells below.
max_length = 512
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Switches selecting which stages of the notebook run.
training, pre_trained = False, False
test_evaluation, perform = False, True
model_epoch = 10  # epoch number of the checkpoint to load

# Class name -> integer id; keys_list maps an id back to its class name.
labels = dict(asimov=0, non_asimov=1)
keys_list = [*labels]
print(*labels.values(), sep='\n')
print(*keys_list, sep='\n')

# Prefix inserted into directory and file names when the enlarged data is used.
large_data = True
if large_data:
    large = 'larger_'
else:
    large = ''

# Dataset

In [3]:
class Dataset(torch.utils.data.Dataset):
    """Two-class dataset of BERT-tokenised text chunks.

    Each of the two source files is read line by line. Consecutive lines of one
    file are greedily packed into a chunk as long as the chunk, including the
    ``[CLS]`` and ``[SEP]`` tokens, fits in ``max_length`` tokens. Every chunk is
    tokenised, padded to ``max_length`` and stored together with the integer
    label (from the module-level ``labels`` mapping) of the file it came from.

    Args:
        max_length: maximum number of tokens per sample, special tokens included.
        large: prefix inserted into the data directory and file names.

    Attributes:
        texts: one tokenizer output per sample.
        labels: one integer class id per sample, aligned with ``texts``.
        input_len: character length of the text behind each sample.
        text_count: number of samples stored so far.
    """

    def __init__(self, max_length=512, large=large):

        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.asimov_path = f'{large}data/{large}asimov_sentence_dataset.txt'
        self.non_asimov_path = f'{large}data/{large}non_asimov_sentence_dataset.txt'
        self.dic = {}  # not used by this class; kept so the attribute still exists
        self.tot_input = ''   # text accumulated for the chunk being built
        self.token_len = 2    # tokens in the current chunk; starts at 2 for [CLS] and [SEP]
        self.input_len = []
        self.texts = []
        self.labels = []
        self.text_count = 0

        # The two files are processed one after the other; every sample created
        # while reading a file receives that file's label.
        for path, label in [(self.asimov_path, 'asimov'), (self.non_asimov_path, 'non_asimov')]:
            already_stored = len(self.texts)
            self.iterative_process(path)
            print(label)
            self.labels.extend([labels[label]] * (self.text_count - already_stored))

    def iterative_process(self, path):
        """Pack the lines of ``path`` into chunks and append them to ``self.texts``.

        Updates ``self.texts``, ``self.input_len`` and ``self.text_count``.
        """
        # Start from an empty accumulator so text left over from a previously
        # processed file can never end up in (and be mislabelled as) this file's data.
        self.tot_input = ''
        self.token_len = 2

        with open(path, 'r', encoding="utf-8") as data:
            for line in data:
                # Content tokens of this line, ignoring [CLS] and [SEP].
                len_curr_token = len(self.tokenizer(line)['input_ids']) - 2

                if len_curr_token + 2 > self.max_length:
                    # A single line that cannot fit in one sample is discarded.
                    # The current chunk is closed first so that a chunk never
                    # joins lines that were not adjacent in the file.
                    print(f'discarding a line of {len_curr_token + 2} tokens')
                    self._flush_chunk()
                    continue

                if self.token_len + len_curr_token > self.max_length:
                    # The line does not fit any more: store the chunk built so
                    # far and start a new one with this line.
                    self._flush_chunk()

                self.tot_input += line
                self.token_len += len_curr_token

        # Store whatever is still pending when the file ends.
        self._flush_chunk()
        self.text_count = len(self.texts)

    def _flush_chunk(self):
        """Store the accumulated text as one sample (if any) and reset the accumulator."""
        if self.tot_input.strip():
            token = self.tokenizer(self.tot_input,
                                   padding='max_length', max_length=self.max_length,
                                   truncation=True,  # safety net: never exceed max_length
                                   return_tensors="pt").to(device)
            self.texts.append(token)
            self.input_len.append(len(self.tot_input))
        self.tot_input = ''
        self.token_len = 2

    def classes(self):
        """Return the list of integer labels, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label at ``idx`` as a numpy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenised sample at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenised sample, label)`` pair at ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [4]:
class Subset(Dataset):
    """Selection of samples from another dataset, described by a list of indices.

    ``Dataset.__init__`` is not called here, so no file is read or tokenised
    again; the labels and texts of the selected samples are copied from the
    wrapped dataset instead.
    """

    def __init__(self, dataset, indices) -> None:
        self.dataset, self.indices = dataset, indices
        # Keep only the labels/texts whose position appears in ``indices``.
        self.labels = [self.dataset.labels[position] for position in indices]
        self.texts = [self.dataset.texts[position] for position in indices]

    def __getitem__(self, idx):
        """Translate ``idx`` (an index or a list of indices) and fetch from the wrapped dataset."""
        if not isinstance(idx, list):
            return self.dataset[self.indices[idx]]
        translated = [self.indices[position] for position in idx]
        return self.dataset[translated]

    def __len__(self):
        """Return how many indices were selected."""
        return len(self.indices)

## Creation and split of the dataset

In [5]:
TEST_SIZE = 0.1
VAL_SIZE = 0.11111111111  # share of the remaining 90% -> roughly 10% of all data
BATCH_SIZE = 2
SEED = 42
if training or test_evaluation:

    data = Dataset()

    # Split sample indices instead of the samples themselves.
    X = list(range(len(data)))
    y = data.labels

    # First split: hold out the test set.
    train_val_indices, test_indices, _, _ = train_test_split(
        X, y, stratify=y, test_size=TEST_SIZE, random_state=SEED)

    mid_train_split = Subset(data, train_val_indices)

    # Second split: divide the remaining samples into train and validation.
    # The values being split are the indices into `data` themselves, so the
    # result can be applied to `data` directly and can never point at a test sample.
    new_X = train_val_indices
    new_y = mid_train_split.labels

    train_indices, val_indices, _, _ = train_test_split(
        new_X, new_y, stratify=new_y, test_size=VAL_SIZE, random_state=SEED)

    # The three index sets must be disjoint and cover the whole dataset.
    assert not set(train_indices) & set(val_indices)
    assert not set(train_indices) & set(test_indices)
    assert not set(val_indices) & set(test_indices)
    assert len(train_indices) + len(val_indices) + len(test_indices) == len(data)

    # generate subset based on indices
    train_split = Subset(data, train_indices)
    val_split = Subset(data, val_indices)
    test_split = Subset(data, test_indices)

    # visualize the dimension of the dataset
    print(len(data))
    print(len(train_split))
    print(len(val_split))
    print(len(test_split))
    print('############')
    print(len(data)/len(data))
    print(len(train_split)/len(data))
    print(len(val_split)/len(data))
    print(len(test_split)/len(data))

In [6]:
def _write_split_stats(out_file, split_labels, sample_name, set_name):
    """Print and write the class balance of one split.

    Args:
        out_file: open text file the two summary lines are written to.
        split_labels: integer labels of the split; 0 is counted as asimov,
            every other value as non_asimov.
        sample_name: wording used in the count line (e.g. ``'training data'``).
        set_name: wording used in the percentage line (e.g. ``'train set'``).
    """
    total = len(split_labels)
    asimov = sum(1 for label in split_labels if label == 0)
    non_asimov = total - asimov
    # An empty split reports 0% instead of raising ZeroDivisionError.
    a_per = (asimov / total) * 100 if total else 0.0
    n_a_per = (non_asimov / total) * 100 if total else 0.0

    count_line = f'Samples in {sample_name}: asimov are {asimov}, non asimov are {non_asimov}'
    share_line = f'So in {set_name} we have {a_per: .2f}% of asimov samples and {n_a_per: .2f}% of non_asimov'

    print(count_line)
    print(share_line)
    out_file.write(count_line + '\n')
    out_file.write(share_line + '\n')


if training or test_evaluation:
    os.makedirs('performance/data', exist_ok=True)
    with open(f'performance/data/{large}info.txt', 'w') as f:
        # Same report for the whole dataset and for each of the three splits.
        for split_labels, sample_name, set_name in [
            (data.labels, 'all data', 'data'),
            (train_split.labels, 'training data', 'train set'),
            (val_split.labels, 'validation data', 'validation set'),
            (test_split.labels, 'test data', 'test set'),
        ]:
            _write_split_stats(f, split_labels, sample_name, set_name)



# Initialize Model

In [7]:
class BertClassifier(nn.Module):
    """Pre-trained ``bert-base-cased`` encoder followed by dropout, a linear layer and ReLU.

    Args:
        dropout: dropout probability applied to BERT's pooled output.
        num_classes: number of outputs of the linear layer. The default of 5 is
            kept so that state dicts saved with the previous fixed-size layer
            still load; the notebook's ``labels`` mapping only defines ids 0 and 1.
    """

    def __init__(self, dropout=0.5, num_classes=5):

        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # 768 is the size of the pooled output the linear layer receives.
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated class scores for a batch.

        Args:
            input_id: token ids, passed to BERT as ``input_ids``.
            mask: attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second element
        # (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss is
        # computed; left unchanged so previously saved weights keep their
        # behaviour - confirm whether raw logits were intended.
        final_layer = self.relu(linear_output)

        return final_layer

# Train

In [8]:
def train(model, train_data, val_data, learning_rate, epochs, batch_size=BATCH_SIZE, large=large, pre=pre_trained, st_ep=model_epoch):
    """Fine-tune ``model`` and validate it after every epoch.

    After each epoch a summary line is printed and written to
    ``{large}classifier_model/acc_{epoch}.txt``; after every second epoch the
    model's state dict is saved in the same directory.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)``.
        train_data: dataset yielding ``(tokenised sample, label)`` pairs.
        val_data: dataset with the same structure, used for validation.
        learning_rate: learning rate of the Adam optimiser.
        epochs: number of epochs to run in this call.
        batch_size: batch size of both data loaders.
        large: prefix of the output directory and checkpoint file names.
        pre: when True, epoch numbering continues after ``st_ep``.
        st_ep: number of epochs already completed by a resumed model.
    """
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    # The validation metrics are sums over all samples, so order is irrelevant.
    val_dataloader = DataLoader(val_data, batch_size=batch_size)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    output_dir = f'{large}classifier_model'
    os.makedirs(output_dir, exist_ok=True)

    first_epoch = st_ep if pre else 0
    for epoch_num in range(first_epoch, first_epoch + epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()  # enable dropout for the optimisation pass
        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            # Each sample was tokenised with a leading dimension of size 1;
            # drop it from both tensors so they have the same shape.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # CrossEntropyLoss returns the batch mean; weight it by the batch
            # size so that dividing by the dataset size below gives the mean
            # loss per sample.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        model.eval()  # disable dropout so validation is deterministic
        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # Adjacent string literals keep the source indentation out of the message.
        to_print = (
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} '
            f'| Train Accuracy: {total_acc_train / len(train_data): .3f} '
            f'(total_acc_train: {total_acc_train}) '
            f'| Val Loss: {total_loss_val / len(val_data): .3f} '
            f'| Val Accuracy: {total_acc_val / len(val_data): .3f} '
            f'(total_acc_val: {total_acc_val})'
        )
        print(to_print)
        with open(f'{output_dir}/acc_{epoch_num+1}.txt', 'w') as f:
            f.write(to_print)

        # A checkpoint is written after every second epoch.
        if (epoch_num + 1) % 2 == 0:
            torch.save(model.state_dict(), os.path.join(output_dir, f"{large}classifier-8_1_1-{epoch_num+1}.pt"))
                  


In [None]:
EPOCHS = 20
LR = 1e-6

model = BertClassifier()
if training:
    if pre_trained:
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        model.load_state_dict(torch.load(
            f'{large}classifier_model/{large}classifier-8_1_1-{model_epoch}.pt',
            map_location=device))
        # Only the epochs that are still missing are trained.
        EPOCHS -= model_epoch
    train(model, train_split, val_split, LR, EPOCHS)

# Evaluate

In [10]:
def evaluate(model, test_data, large=large):
    """Compute accuracy, precision, recall and F1 of ``model`` on ``test_data``.

    The positive class is asimov (label 0) and the negative class is
    non_asimov (label 1). Any prediction other than the true label counts as an
    error. The results are printed and appended to ``performance/{large}info.txt``.

    Args:
        model: classifier called as ``model(input_ids, attention_mask)``.
        test_data: dataset yielding ``(tokenised sample, label)`` pairs.
        large: prefix used in the name of the report file.
    """
    test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = model.to(device)
    model.eval()  # disable dropout so the reported numbers are deterministic

    TP = 0
    TN = 0
    FP = 0
    FN = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            # Drop the size-1 dimension left by per-sample tokenisation.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            pred = model(input_id, mask).argmax(dim=1)

            # Each sample falls in exactly one of the four confusion-matrix cells.
            for truth, guess in zip(test_label.tolist(), pred.tolist()):
                if truth == 1:  # non_asimov
                    if guess == 1:
                        TN += 1
                    else:  # predicted asimov instead of non_asimov
                        FP += 1
                else:  # asimov
                    if guess == 0:
                        TP += 1
                    else:  # predicted non_asimov instead of asimov
                        FN += 1

    # compute performances; an empty denominator yields 0 instead of an exception
    Errors = FP + FN
    total = TP + TN + FP + FN
    Accuracy = (TP + TN) / total if total else 0.0
    Precision = TP / (TP + FP) if (TP + FP) else 0.0
    Recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1_score = 2 * ((Recall * Precision) / (Recall + Precision)) if (Recall + Precision) else 0.0

    report_lines = [
        f'Test Accuracy: {Accuracy: .3f}',
        f'Test Precision: {Precision: .3f}',
        f'Test Recall: {Recall: .3f}',
        f'Test F1_score: {F1_score: .3f}',
    ]
    # The console output and the file differ only in the trailing space of this line.
    console_lines = report_lines + ['Other info:']
    file_lines = report_lines + ['Other info: ']
    detail_lines = [
        f'False Positives (predicted asimov instead of non_asimov): {FP}',
        f'False Negatives (predicted non_asimov instead of asimov): {FN}',
        f'Errors: {Errors}',
    ]

    for line in console_lines + detail_lines:
        print(line)

    os.makedirs('performance', exist_ok=True)
    with open(f'performance/{large}info.txt', 'a') as f:
        # The file is opened in append mode, so every line, including the last
        # one, ends with a newline to keep successive reports separated.
        for line in file_lines + detail_lines:
            f.write(line + '\n')

if training:
    evaluate(model, test_split)

elif test_evaluation:
    # NOTE(review): the checkpoint of epoch 4 is hard-coded here while other
    # cells use `model_epoch` - confirm which checkpoint should be evaluated.
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(
        f'{large}classifier_model/{large}classifier-8_1_1-4.pt',
        map_location=device))
    model.to(device)
    evaluate(model, test_split)

# Performance Test

In [11]:
# NOTE(review): with `or`, this branch is skipped only when BOTH `training` and
# `test_evaluation` are True, so a freshly trained model is replaced by the
# `model_epoch` checkpoint - confirm whether `and` was intended.
if not training or not test_evaluation:
    #load the model
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(
        f'{large}classifier_model/{large}classifier-8_1_1-{model_epoch}.pt',
        map_location=device))
    model.to(device)
    model.eval()  # inference only from here on: disable dropout

## Clean Generated Text

In [12]:
def clean_text_for_evaluation(generated_text, max_length=512):
    """Tokenise texts for the classifier, shortening those that are too long.

    A text whose tokenisation exceeds ``max_length`` is cut to its first
    ``max_length - 1`` token ids, decoded back to a string and trimmed after
    its last ``.``, ``!`` or ``?`` so that it ends on a complete sentence. It is
    then tokenised again.

    Args:
        generated_text: sequence of strings to tokenise.
        max_length: length every tokenised text is padded to.

    Returns:
        A list with one tokenizer output per input text, in the same order.
    """
    model_input = []
    cls_prefix = f'{tokenizer.cls_token} '  # what decode() puts in front of the text
    for story in generated_text:
        token = tokenizer(story,
                padding='max_length', max_length = max_length,
                return_tensors="pt")
        if len(token['input_ids'][0]) > max_length:
            kept_ids = token['input_ids'][0][:max_length-1].tolist()
            temp = tokenizer.decode(kept_ids)
            # Remove the leading special token that decode() reproduces.
            if temp.startswith(cls_prefix):
                temp = temp[len(cls_prefix):]

            last_sentence_end = max(temp.rfind('.'), temp.rfind('!'), temp.rfind('?'))
            # Without any sentence terminator the shortened text is kept whole
            # rather than being reduced to an empty string.
            if last_sentence_end != -1:
                temp = temp[:last_sentence_end+1]

            cleaned_token = tokenizer(temp,
                padding='max_length', max_length = max_length,
                truncation=True,  # guarantees the result fits in max_length
                return_tensors="pt")
            model_input.append(cleaned_token)
        else:
            model_input.append(token)
    return model_input

## Tokenize and Classify

In [None]:
if perform:
    directory = f'{large}model_outputs'
    # Only regular files are read; sub-directories (for instance checkpoint
    # folders created by Jupyter) are skipped. Sorting makes the order reproducible.
    base_output = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )
    tp = 0  # number of stories whose expected class is asimov
    tn = 0  # number of stories whose expected class is non_asimov
    all_stories = []
    classes = []
    for output in base_output:
        with open(f'{directory}/{output}', 'r') as f:
            text_input = f.read()
        # Files with 'baseline' in their name hold the non_asimov stories.
        flag = 'non_' if 'baseline' in output else ''
        text_list = text_input.split('\n\n')
        for story in text_list:
            # Blank pieces (e.g. after a trailing blank line) are not stories.
            if not story.strip():
                continue
            if flag:
                tn += 1
            else:
                tp += 1
            all_stories.append(story)
            classes.append(f'{flag}asimov')

    model_input = clean_text_for_evaluation(all_stories)

    output_classification = []
    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():
        for test_input in tqdm(model_input):
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            predicted = output.argmax(dim=1).item()
            # The model can output more classes than there are names in
            # keys_list; such a prediction gets a placeholder name, which never
            # matches an expected class and is therefore counted as an error.
            if predicted < len(keys_list):
                output_classification.append(keys_list[predicted])
            else:
                output_classification.append(f'unknown_{predicted}')

## Compute performance

In [None]:
if perform:
    # Positive class: asimov. Negative class: non_asimov.
    fn = 0        # asimov stories that were not classified as asimov
    fp = 0        # non_asimov stories that were not classified as non_asimov
    true_pos = 0  # asimov stories classified as asimov
    for predicted, expected in zip(output_classification, classes):
        if predicted == expected:
            if expected == 'asimov':
                true_pos += 1
        elif expected == 'non_asimov': #should be classified as negative but is falsely positive
            fp += 1
        else: #should be classified as positive but is falsely negative
            fn += 1

    #check
    print(f'false negatives (asimov classified as non asimov):{fn}')
    print(f'false positives (non asimov classified as asimov):{fp}')
    errors = fp + fn
    print(f'errors: {errors}')

    n_samples = len(output_classification)
    accuracy = (n_samples - errors) / n_samples if n_samples else 0.0
    print('accuracy:', accuracy)

    # precision =  TruePositives / (TruePositives + FalsePositives)
    precision = true_pos / (true_pos + fp) if (true_pos + fp) else 0.0
    # recall = TruePositives / (TruePositives + FalseNegatives)
    recall = true_pos / (true_pos + fn) if (true_pos + fn) else 0.0

    f1_score = 2*((precision*recall)/(precision+recall)) if (precision + recall) else 0.0

    print('precision', precision)
    print('recall', recall)
    print('f1_score', f1_score)

    print(large)

## Save Performance

In [15]:
if perform:
    report_path = f'performance/{large}version/Alan woke up.txt'
    # Create the target directory when it does not exist yet.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, 'w') as f:
        f.write(f'accuracy: {accuracy} \n')
        f.write(f'precision: {precision} \n')
        f.write(f'recall: {recall} \n')
        f.write(f'f1_score: {f1_score} \n')
        f.write(f'Other info: \n')
        # fn counts asimov stories that were misclassified, fp counts
        # non_asimov stories that were misclassified.
        f.write(f'False negatives (asimov classified as non asimov): {fn} \n')
        f.write(f'False positives (non asimov classified as asimov): {fp} \n')
        f.write(f'errors: {errors}')