# Human Value Detection

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

from utilities import *
from models.bertOne import BertOne

from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix
from models.randomUniformClassifier import RandomUniformClassifier
from models.majorityCalssifier import MajorityClassifier

from transformers import AutoTokenizer

from drTorch.callbacks import EarlyStopper

from drTorch.metrics import F1_Score
from drTorch.utilities import *
from drTorch.wrappers import OptimizerWrapper
from drTorch.wrappers import Criterion

import numpy as np
import torch
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
print('Device: %s' % device)


Device: cpu


## Defining constants and flags

In [3]:
# PATHS AND DATAFRAME CREATION
DATA_DIR = "data"
ARGUMENTS_DIR = os.path.join(DATA_DIR, "arguments")
LABELS_DIR = os.path.join(DATA_DIR, "labels")

# CONSTANTS 
N_LABELS = 4
N_CLASSES = 2
BATCH_SIZE = 32
#os.environ["TOKENIZERS_PARALLELISM"] = "true"

CLASS_2_ONE_HOT = {class_label: np.eye(N_CLASSES)[i].astype(float).tolist() for i, class_label in enumerate(range(N_CLASSES))}


## Task 1

### Visualizing the data

In [4]:
# convert files in dataframes
train_arg_df, val_arg_df, test_arg_df = create_dfs(ARGUMENTS_DIR)
train_labels_df, val_labels_df, test_labels_df = create_dfs(LABELS_DIR)

print("Let's visualize the data: ")
display(train_arg_df.head(5))
display(train_labels_df.head(5))


Let's visualize the data: 


Unnamed: 0_level_0,Conclusion,Stance,Premise
Argument ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...
A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...
A01006,We should end the use of economic sanctions,against,sometimes economic sanctions are the only thin...
A01007,We should abolish capital punishment,against,capital punishment is sometimes the only optio...
A01008,We should ban factory farming,against,factory farming allows for the production of c...


Unnamed: 0_level_0,Self-direction: thought,Self-direction: action,Stimulation,Hedonism,Achievement,Power: dominance,Power: resources,Face,Security: personal,Security: societal,Tradition,Conformity: rules,Conformity: interpersonal,Humility,Benevolence: caring,Benevolence: dependability,Universalism: concern,Universalism: nature,Universalism: tolerance,Universalism: objectivity
Argument ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A01002,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
A01005,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
A01006,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
A01007,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0
A01008,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0


### Mapping labels to level-3 categories

In [5]:
mapping = define_mapping()
train_labels_df, val_labels_df, test_labels_df = map_to_level_3(mapping, train_labels_df, val_labels_df, test_labels_df) 

print("The training labels after the mapping are the following: ")
train_labels_df


The training labels after the mapping are the following: 


Unnamed: 0_level_0,Openess_to_change,Self_enhancement,Conservation,Self_transcendence
Argument ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A01002,0,0,1,0
A01005,0,0,1,0
A01006,0,1,1,0
A01007,0,0,1,0
A01008,0,0,1,1
...,...,...,...,...
E08016,0,1,1,1
E08017,0,0,1,1
E08018,0,0,0,1
E08019,0,0,1,1


### One-hot encoding, tokenization and data loaders building

In [6]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
columns_to_consider = ["Conclusion", "Premise"]

train_labels_tensor = torch.tensor([[CLASS_2_ONE_HOT[element] for element in row] for row in train_labels_df.values])
val_labels_tensor = torch.tensor([[CLASS_2_ONE_HOT[element] for element in row] for row in val_labels_df.values])
test_labels_tensor = torch.tensor([[CLASS_2_ONE_HOT[element] for element in row] for row in test_labels_df.values])

max_length = calculate_max_length(train_arg_df, columns_to_consider, tokenizer)

train_arg_df = encode(train_arg_df, tokenizer, max_length, columns_to_consider)
val_arg_df = encode(val_arg_df, tokenizer, max_length, columns_to_consider)
test_arg_df = encode(test_arg_df, tokenizer, max_length, columns_to_consider)

In [7]:

train_loader_C = get_data_loader_test(batch_size=BATCH_SIZE,
                                      shuffle=True,
                                      data=train_arg_df.loc[:, "Conclusion"],
                                      labels=train_labels_tensor)

val_loader_C = get_data_loader_test(batch_size=BATCH_SIZE,
                                    shuffle=True,
                                    data=val_arg_df.loc[:, "Conclusion"],
                                    labels=val_labels_tensor)

# todo aggiungi gli altri data loader (CS and CSP), capire come passarli al modello senza fare casini


## Task 2 

### Models Definition

####  1) Random uniform classifier

In [30]:
# Create an instance of the random uniform classifier
random_classifier = RandomUniformClassifier(N_LABELS)

# Make predictions on the test set
y_pred = random_classifier.predict(test_arg_df)

# Accuracy of the Random Classifier
accuracy = accuracy_score(test_labels_df, y_pred)
print(f'Accuracy of the model over all the classes: {accuracy}\n')

# average F1 on the different column singularly taken 
f1 = avg_f1_score(test_labels_df, y_pred)
print(f'Average F1 on the different column singularly taken: {f1}')


Accuracy of the model over all the classes: 0.06281725888324872

f1 on column 0:  0.48912632331231587
f1 on column 1:  0.4879916831136344
f1 on column 2:  0.47909039134325926
f1 on column 3:  0.5095017705475109
Average F1 on the different column singularly taken: 0.4914275420791801


####  1) Majority classifier

In [29]:
majority_classifier = MajorityClassifier()

# Train the majority classifier (even though in practice, no training is needed)
majority_classifier.fit(train_labels_df)

# Make predictions on the test set
y_pred = majority_classifier.predict(test_labels_df)

# Accuracy of the Majority Classifier
accuracy = accuracy_score(test_labels_df, y_pred)
print(f'Accuracy of the model over all the classes: {accuracy}\n')

# Average F1 on the different column singularly taken 
f1 = avg_f1_score(test_labels_df, y_pred)
print(f'Average F1 on the different column singularly taken: {f1}')


Accuracy of the model over all the classes: 0.13642131979695432

f1 on column 0:  0.41150112023898433
f1 on column 1:  0.3708582834331337
f1 on column 2:  0.4152133580705009
f1 on column 3:  0.32850447379633574
Average F1 on the different column singularly taken: 0.3815193088847386


####  3) Bert

In [12]:
optimizer_test = OptimizerWrapper(torch.optim.Adam, identifier=f'lr={10}', optimizer_partial_params={'lr':10})
criterion_test = Criterion('loss', loss_function=torch.nn.BCELoss(reduction='none'), reduction_function=torch.mean)


In [13]:
bert1 = BertOne().to(device)


In [14]:
#todo: il fit non funziona perchè va modificata la loss dalla classe Criterion in modo tale da renderla compatibile con le tuple e non solo con i tensori

bert1_history = bert1.fit(train_loader=train_loader_C, 
                          val_loader=val_loader_C, 
                          criterion=criterion_test, 
                          metrics=[F1_Score('F1_macro', N_LABELS, mode='macro')], 
                          optimizer=optimizer_test,
                          early_stopper=EarlyStopper(monitor='F1_macro', patience=4, delta=0, mode='max', restore_weights=True),
                          num_epochs=200)


 Epoch: 1/200 Iterations: 10/169 - loss: 41.40625 - F1_macro: 0.28031579604318796981921156

KeyboardInterrupt: 

In [15]:
plot_history(bert1_history)

NameError: name 'bert1_history' is not defined

In [16]:
### Test sul modello

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')


model = transformers.BertModel.from_pretrained("bert-base-uncased")
output = model(**encoded_input)


p_d = BertOne()
output1 = p_d(**encoded_input)
