In [None]:
import os
import sys
# Put the parent of the current working directory at the front of sys.path
# so that the `src.*` imports further down can be resolved from there.
cwd = os.getcwd()
module_path = os.path.abspath('..')
parent_dir = os.path.join(cwd, "../")
sys.path[:0] = [parent_dir]

import pickle
import torch
import importlib
import numpy as np
import pandas as pd
import copy

import src.bin.tensorify as tensorify
import src.utils.data_conversion_utils as conversions
import src.data_manager.student_life_var_binned_data_manager as data_manager
import src.bin.trainer as trainer

from sklearn import metrics

from torch import nn
from copy import copy
from copy import deepcopy
from src import definitions
from src.bin import validations
from src.bin import statistics
from src.bin import plotting
from src.utils.read_utils import read_pickle
from src.utils import student_utils
from src.data_manager import sub_sampler
from src.data_manager import cross_val
from src.data_manager import helper as data_manager_helper
from tabulate import tabulate
from src.models import autoencoder
from src.models import multitask_learning
from src.models import user_dense_heads


# Use the fully qualified option key: the bare 'max_rows' shorthand is resolved
# by pattern matching and becomes ambiguous on pandas versions that also define
# 'styler.render.max_rows', which makes set_option raise an OptionError.
pd.set_option('display.max_rows', 10000)

# Re-execute the project modules so that the notebook picks up their current
# source without a kernel restart. The tuple preserves the reload order.
_modules_to_refresh = (
    validations,
    autoencoder,
    multitask_learning,
    user_dense_heads,
    sub_sampler,
    statistics,
    cross_val,
    definitions,
    conversions,
    tensorify,
    plotting,
    trainer,
    data_manager,
    student_utils,
    data_manager_helper,
)
for _stale_module in _modules_to_refresh:
    importlib.reload(_stale_module)

# Read the feature list only after data_manager has been reloaded.
feature_list = data_manager.FEATURE_LIST

# ##### Pickle #####
# Optional alternative source (disabled): read a previously pickled data dict.
# data = read_pickle('../data/training_data/most_representative_6_hr_b_imputed_normalized_prev_stress_data.pkl')

#### Data ####
# The data dict is built exactly once. An earlier copy of this section ran the
# same call with the same arguments and its result was overwritten before use.
student_list = [53, 46, 7, 49, 22, 24, 2]
# student_list = [2]
data = data_manager.get_data_for_training_in_dict_format(*student_list, normalize=True,
                                                         fill_na=True, flatten_sequence=False)

###### Randomize ######
# Draw fresh train/val/test id splits, stratified by student, and store them
# under the keys that are later passed to the trainer ('train_ids', ...).
# NOTE(review): no random seed is set in the visible code, so the split
# presumably differs from run to run - confirm inside cross_val.
train_ids, val_ids, test_ids = cross_val.random_stratified_splits(data, stratify_by='students')
data['train_ids'] = train_ids
data['val_ids'] = val_ids
data['test_ids'] = test_ids

# Optional (disabled): sub-sample the sequences.
# data = sub_sampler.get_sub_sampled_sequences(data)

############ Stats #############
# Optional (disabled): statistics on the un-normalized, un-imputed data.
# unnormalized_data = data_manager.get_data_for_training_in_dict_format(*student_list, normalize=False, fill_na=False)
# statistics_df, raw_df = statistics.get_statistics_on_data_dict(unnormalized_data, feature_list)

# Show the label counts for the train/val/test splits chosen above.
print(statistics.get_train_test_val_label_counts_from_raw_data(data))

In [None]:
################################## Init ##################################
# ---- Architecture sizes ----
autencoder_bottle_neck_feature_size = 128
autoencoder_num_layers = 1
shared_hidden_layer_size = 256
user_dense_layer_hidden_size = 128
num_classes = 3

# ---- Optimisation settings ----
# NOTE(review): `dropout` is assigned here but is not passed to the model or
# used anywhere else in the visible notebook - confirm whether it should be.
dropout = 0.85
learning_rate = 0.00005
n_epochs = 1

# ---- Input widths, read off the first entry of the data dict ----
# Presumably element 0 of an entry holds the feature sequence and element 3 the
# covariates - TODO confirm against the data manager.
first_key = next(iter(data['data'].keys()))
first_entry = data['data'][first_key]
num_features = len(first_entry[0][0])
num_covariates = len(first_entry[3])

# The class weights are computed from `data` *before* the name is rebound to the
# tensorified version on the next line; the statement order is kept as is.
# NOTE(review): because `data` is rebound in place, re-running this cell feeds
# the already-tensorified dict back into both calls.
class_weights = torch.tensor(statistics.get_class_weights_in_inverse_proportion(data))
data = tensorify.tensorify_data_gru_d(data)

# ---- Model ----
student_keys = conversions.prepend_ids_with_string(student_list, "student_")
model = multitask_learning.MultiTaskLearner(
    student_keys,
    num_features,
    autencoder_bottle_neck_feature_size,
    autoencoder_num_layers,
    shared_hidden_layer_size,
    user_dense_layer_hidden_size,
    num_classes,
    num_covariates,
)

# ---- Losses and optimiser ----
reconstruction_criterion = nn.L1Loss(reduction="sum")
classification_criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Per-epoch history containers: one for the combined loss, plus independent
# deep copies of the same (empty) structure for each loss component.
total_loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries()
reconstruction_loss_over_epochs = deepcopy(total_loss_over_epochs)
classification_loss_over_epochs = deepcopy(total_loss_over_epochs)

# Weighting arguments handed unchanged to every evaluate_multitask_learner call.
loss_weights = {'alpha': 0.2, 'beta': 0.8}
# Emit plots and reports every `report_every` epochs (1 == every epoch).
report_every = 1


def _weighted_prf(labels, preds):
    """Return sklearn's weighted-average precision/recall/f-score/support tuple."""
    return metrics.precision_recall_fscore_support(labels, preds, average='weighted')


for epoch in range(n_epochs):

    # Only the pass over the training ids is given the optimizer.
    (train_total_loss, train_total_reconstruction_loss, train_total_classification_loss,
     train_labels, train_preds) = trainer.evaluate_multitask_learner(
        data, 'train_ids', model,
        reconstruction_criterion, classification_criterion,
        optimizer=optimizer, **loss_weights)

    (val_total_loss, val_total_reconstruction_loss, val_total_classification_loss,
     val_labels, val_preds) = trainer.evaluate_multitask_learner(
        data, 'val_ids', model,
        reconstruction_criterion, classification_criterion,
        **loss_weights)

    (test_total_loss, test_total_reconstruction_loss, test_total_classification_loss,
     test_labels, test_preds) = trainer.evaluate_multitask_learner(
        data, 'test_ids', model,
        reconstruction_criterion, classification_criterion,
        **loss_weights)

    ######## Appending losses ########
    losses_by_key = {
        'train_loss': (train_total_loss,
                       train_total_reconstruction_loss,
                       train_total_classification_loss),
        'val_loss': (val_total_loss,
                     val_total_reconstruction_loss,
                     val_total_classification_loss),
        'test_loss': (test_total_loss,
                      test_total_reconstruction_loss,
                      test_total_classification_loss),
    }
    for loss_key, (combined, reconstruction, classification) in losses_by_key.items():
        total_loss_over_epochs[loss_key].append(combined)
        reconstruction_loss_over_epochs[loss_key].append(reconstruction)
        classification_loss_over_epochs[loss_key].append(classification)

    print(train_total_loss)

    ######## Appending Metrics ########
    train_scores = _weighted_prf(train_labels, train_preds)
    val_scores = _weighted_prf(val_labels, val_preds)
    test_scores = _weighted_prf(test_labels, test_preds)

    for score_key, split_scores in (('train_scores', train_scores),
                                    ('val_scores', val_scores),
                                    ('test_scores', test_scores)):
        scores_over_epochs[score_key].append(split_scores)

    if epoch % report_every == 0:
        print("xxxxxxxxxxxxxx epoch: {} xxxxxxxxxxxxxx".format(epoch))
        plotting.plot_score_over_n_epochs(scores_over_epochs, score_type='f1', fig_size=(8,5))
        for loss_history in (total_loss_over_epochs,
                             reconstruction_loss_over_epochs,
                             classification_loss_over_epochs):
            plotting.plot_loss_over_n_epochs(loss_history, fig_size=(8, 5))

        print("Cofusion Matrix For Val Set: ")
        val_confusion = metrics.confusion_matrix(val_labels, val_preds)
        print(tabulate(val_confusion))
        print("Predicted Label Distribution:")
        print(statistics.get_train_test_val_label_counts_from_predictions(train_preds, val_preds, test_preds))
