In [None]:
import numpy as np
import torch


import json
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch import nn

from statistics import mean
import pickle

In [None]:
import utils_generic as generic
import rrnn_modelo as rrnn

# Training utilities with data-map tracking, bound to the short alias `train`.
import train_datamaps as train 
# NOTE(review): the statement below resolves a module literally named `train`,
# not the `train_datamaps` alias bound on the previous line. Confirm that a
# separate train.py exists and is the intended source of eval_func_multi.
from train import eval_func_multi

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Task names handed to the multi-task dataset wrappers.
tasks = ['to', 'as', 'about']

In [None]:
# Experiment identifiers: both are interpolated into the vocabulary and
# checkpoint file names further down.
encoding_type, model_name = 'brk', 'rnn_multitask'
# Not referenced elsewhere in the visible cells.
num_labels = 2

# Datos ConvAI2

In [None]:
# Load the complete ConvAI2 dataset from JSON.
# Forward slashes are accepted on every OS (Windows included); backslashes in a
# regular string literal are parsed as escape sequences and only form a valid
# path separator on Windows. The encoding is explicit so decoding does not
# depend on the platform default.
with open('Datasets/ConvAI2/convai2_complete.json', 'r', encoding='utf8') as f:
    data = json.load(f)

In [None]:
# Load the pickled vocabulary object for the selected encoding.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(f'vocab_{encoding_type}.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Train / validation partitions of the ConvAI2 data.
convai_train, convai_val = data['train'], data['validation']

# The word -> index mapping is built from the training partition only.
word_to_index = generic.create_word_to_index(convai_train)

# Tokenize both partitions with the same vocabulary and word index.
convai_train_token = generic.tokenize_dataset_with_dependencies_rrnn(
    convai_train, ['about', 'to', 'as'], vocab, word_to_index
)
convai_val_token = generic.tokenize_dataset_with_dependencies_rrnn(
    convai_val, ['about', 'to', 'as'], vocab, word_to_index
)

In [None]:
# Wrap the tokenized ConvAI2 training partition in the multi-task dataset class.
convai_train_dataset = rrnn.DatasetMultiTaskRRNN(
    convai_train_token,
    tasks,
    eval=False,
    deps=True,
)

In [None]:
# Wrap the tokenized ConvAI2 validation partition in the multi-task dataset class.
convai_val_dataset = rrnn.DatasetMultiTaskRRNN(
    convai_val_token,
    tasks,
    eval=False,
    deps=True,
)

# Datos md_gender

In [None]:
# Load the complete md_gender dataset from JSON.
# Forward slashes are accepted on every OS (Windows included); backslashes in a
# regular string literal are parsed as escape sequences and only form a valid
# path separator on Windows.
with open('Datasets/md_gender/md_complete.json', 'r', encoding="utf8") as f:
    md_data = json.load(f)

In [None]:
# Tokenize md_gender with the vocabulary and word index built for ConvAI2.
md_tokenized = generic.tokenize_dataset_with_dependencies_rrnn(
    md_data, ['about', 'to', 'as'], vocab, word_to_index
)

# Dataset wrapper for md_gender; this is the only dataset built with eval=True.
md_dataset = rrnn.DatasetMultiTaskRRNN(
    md_tokenized,
    tasks,
    eval=True,
    deps=True,
)

# Creación dataloaders

In [None]:
# Batch iterators for training, validation (both ConvAI2) and evaluation
# (md_gender); all three share the batch size and the collate function.
dl_train = DataLoader(
    convai_train_dataset,
    batch_size=128,
    collate_fn=rrnn.collate_fn_dep_multi,
    shuffle=True,
)
# NOTE(review): the validation loader is shuffled as well; confirm that is
# intentional, since ordering normally does not matter for validation metrics.
dl_val = DataLoader(
    convai_val_dataset,
    batch_size=128,
    collate_fn=rrnn.collate_fn_dep_multi,
    shuffle=True,
)
dl_eval = DataLoader(
    md_dataset,
    batch_size=128,
    collate_fn=rrnn.collate_fn_dep_multi,
    shuffle=False,
)

# MODELO

In [None]:
# Model hyperparameters passed to every MultiTaskRRNNDep instance below.
emb_dim, lstm_hidden_dim = 100, 128
# One entry per word in the mapping built from the ConvAI2 training partition.
vocab_size = len(word_to_index)

In [None]:
# Optimisation settings shared by the three training runs.
num_epochs, learning_rate = 100, 5e-5

# Accumulator for evaluation results. For every task it holds one list per
# (metric, group) pair plus a flat list of accuracies; each evaluation cell
# appends one value to every list. Every list is a distinct object.
global_metrics = {
    task_name: {
        **{
            metric_name: {
                group: [] for group in ('weighted_avg', 'average', 'female', 'male')
            }
            for metric_name in ('recall', 'precision', 'f1')
        },
        'acc': [],
    }
    for task_name in ('about', 'to', 'as')
}

In [None]:
# Run 1: train a freshly initialised model; save_path is handed to the training
# routine and reused as the prefix of the two tensors saved afterwards.
save_path = f'm1_{model_name}_{encoding_type}_1'
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

# p, c and e feed the data-map plots in the next cell (indexed per task there);
# presumably probabilities, correctness and epoch count - confirm in train_datamaps.
p, c, e = train.train_function_multi(
    model,
    num_epochs,
    dl_train,
    optimizer,
    early_stop=10,
    dl_val=dl_val,
    save_path=save_path,
    es_threshold=0,
)
torch.save(p, f'{save_path}_probs.pt')
torch.save(c, f'{save_path}_corr.pt')

In [None]:
# Draw the data map of run 1 for each task, preceded by the task name.
for task in ('about', 'as', 'to'):
    print(task.upper())
    train.get_datamap_complete_graph(
        p[task],
        correctness_vector=c[task],
        num_epochs=e,
        show_samples=True,
    )

In [None]:
# Evaluate run 1 on md_gender: rebuild the architecture and restore the weights
# stored at save_path (as set by the preceding training cell).
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(save_path, map_location=device))

metrics_results = eval_func_multi(model, dl_eval, ['about', 'to', 'as'])

# Print every metric and record it in global_metrics for the final averages.
# NOTE: values are appended, so re-running this cell records the run twice.
for task, task_metrics in metrics_results.items():
    print(f'Resultados en la tarea {task.upper()}:')
    for metric, value in task_metrics.items():
        if metric == 'accuracy':
            # Accuracy is a single value per task.
            global_metrics[task]['acc'].append(value)
        else:
            # Every other metric is a mapping from group name to value.
            for g, v in value.items():
                global_metrics[task][metric][g].append(v)
        print(metric, value)
    print('\n')


In [None]:
# Run 2: train a freshly initialised model; save_path is handed to the training
# routine and reused as the prefix of the two tensors saved afterwards.
save_path = f'm1_{model_name}_{encoding_type}_2'
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

# p, c and e feed the data-map plots in the next cell (indexed per task there);
# presumably probabilities, correctness and epoch count - confirm in train_datamaps.
p, c, e = train.train_function_multi(
    model,
    num_epochs,
    dl_train,
    optimizer,
    early_stop=10,
    dl_val=dl_val,
    save_path=save_path,
    es_threshold=0,
)
torch.save(p, f'{save_path}_probs.pt')
torch.save(c, f'{save_path}_corr.pt')

In [None]:
# Draw the data map of run 2 for each task, preceded by the task name.
for task in ('about', 'as', 'to'):
    print(task.upper())
    train.get_datamap_complete_graph(
        p[task],
        correctness_vector=c[task],
        num_epochs=e,
        show_samples=True,
    )

In [None]:
# Evaluate run 2 on md_gender: rebuild the architecture and restore the weights
# stored at save_path (as set by the preceding training cell).
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(save_path, map_location=device))

metrics_results = eval_func_multi(model, dl_eval, ['about', 'to', 'as'])

# Print every metric and record it in global_metrics for the final averages.
# NOTE: values are appended, so re-running this cell records the run twice.
for task, task_metrics in metrics_results.items():
    print(f'Resultados en la tarea {task.upper()}:')
    for metric, value in task_metrics.items():
        if metric == 'accuracy':
            # Accuracy is a single value per task.
            global_metrics[task]['acc'].append(value)
        else:
            # Every other metric is a mapping from group name to value.
            for g, v in value.items():
                global_metrics[task][metric][g].append(v)
        print(metric, value)
    print('\n')


In [None]:
# Run 3: train a freshly initialised model; save_path is handed to the training
# routine and reused as the prefix of the two tensors saved afterwards.
save_path = f'm1_{model_name}_{encoding_type}_3'
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)

# p, c and e feed the data-map plots in the next cell (indexed per task there);
# presumably probabilities, correctness and epoch count - confirm in train_datamaps.
p, c, e = train.train_function_multi(
    model,
    num_epochs,
    dl_train,
    optimizer,
    early_stop=10,
    dl_val=dl_val,
    save_path=save_path,
    es_threshold=0,
)
torch.save(p, f'{save_path}_probs.pt')
torch.save(c, f'{save_path}_corr.pt')

In [None]:
# Draw the data map of run 3 for each task, preceded by the task name.
for task in ('about', 'as', 'to'):
    print(task.upper())
    train.get_datamap_complete_graph(
        p[task],
        correctness_vector=c[task],
        num_epochs=e,
        show_samples=True,
    )

In [None]:
# Evaluate run 3 on md_gender: rebuild the architecture and restore the weights
# stored at save_path (as set by the preceding training cell).
model = rrnn.MultiTaskRRNNDep(
    emb_dim=emb_dim,
    dep_vocab=vocab,
    vocab_size=vocab_size,
    lstm_hidden_dim=lstm_hidden_dim,
).to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(save_path, map_location=device))

metrics_results = eval_func_multi(model, dl_eval, ['about', 'to', 'as'])

# Print every metric and record it in global_metrics for the final averages.
# NOTE: values are appended, so re-running this cell records the run twice.
for task, task_metrics in metrics_results.items():
    print(f'Resultados en la tarea {task.upper()}:')
    for metric, value in task_metrics.items():
        if metric == 'accuracy':
            # Accuracy is a single value per task.
            global_metrics[task]['acc'].append(value)
        else:
            # Every other metric is a mapping from group name to value.
            for g, v in value.items():
                global_metrics[task][metric][g].append(v)
        print(metric, value)
    print('\n')


In [None]:
# Report, per metric and task, the mean of the values collected in
# global_metrics by the evaluation cells above.
for metric in ('f1', 'recall', 'precision'):
    print(f'{metric} medio de los 3 modelos: \n')
    for task in ('about', 'to', 'as'):
        per_group = global_metrics[task][metric]
        print(task.upper())
        print(f'Resultado global {metric}:', mean(per_group['average']))
        print(f'Resultado global ponderado{metric}:', mean(per_group['weighted_avg']))
        print(f'{metric} etiqueta male:', mean(per_group['male']))
        print(f'{metric} etiqueta female: ', mean(per_group['female']))
        print('\n')

# Accuracy is stored as one flat list per task, so it is averaged separately.
print('Accuracy medio de los 3 modelos: \n')
for task in ('about', 'to', 'as'):
    print('\n', task.upper())
    print('Resultado global accuracy:', mean(global_metrics[task]['acc']))