In [None]:
import os
import sys
cwd = os.getcwd()
module_path = os.path.abspath(os.path.join('..'))
sys.path.insert(0, os.path.join(cwd, "../"))

import copy
import importlib
import pickle
import random

import numpy as np
import pandas as pd
import torch

import src.bin.tensorify as tensorify
import src.utils.data_conversion_utils as conversions
import src.data_manager.student_life_var_binned_data_manager as data_manager
import src.bin.trainer as trainer

from sklearn import metrics

from torch import nn
from src import definitions
from src.bin import validations
from src.bin import statistics
from src.bin import plotting
from src.utils.read_utils import read_pickle
from src.utils import student_utils
from src.data_manager import sub_sampler
from src.data_manager import cross_val
from src.data_manager import helper as data_manager_helper

from tabulate import tabulate 
import src.models.simple as simple_models   

# Allow up to 10,000 rows to be rendered when a DataFrame is displayed.
# The fully qualified key is used on purpose: option names are matched as
# patterns, and on newer pandas releases the bare 'max_rows' also matches
# 'styler.render.max_rows', which makes set_option raise OptionError.
pd.set_option('display.max_rows', 10000)

# Reload every project module (in the same order as before) so that edits
# made on disk are picked up without restarting the kernel.
for project_module in (
    validations,
    data_manager_helper,
    sub_sampler,
    statistics,
    cross_val,
    definitions,
    conversions,
    tensorify,
    plotting,
    trainer,
    data_manager,
    student_utils,
    simple_models,
):
    importlib.reload(project_module)

# FEATURE_LIST constant as exposed by the freshly reloaded data manager.
feature_list = data_manager.FEATURE_LIST

### Data ####
# NOTE: a cached pickle used to be read into `data` right before this call
# ('../data/training_data/most_representative_6_hr_b_imputed_normalized_prev_stress_data.pkl'),
# but its result was overwritten by the assignment below without ever being
# used. The dead read was removed: it only cost I/O and made the cell fail
# when the file was absent. To train from the cached file instead, replace
# the call below with `read_pickle(<that path>)`.

# Students whose data is pulled into this experiment.
student_list = [53, 46, 7, 49, 22, 24, 2]
data = data_manager.get_data_for_training_in_dict_format(
    *student_list,
    normalize=True,
    fill_na=True,
    flatten_sequence=False,
)

# data = sub_sampler.get_sub_sampled_sequences(data)

# print(data['data'].keys())
# print(data['data']['2_4_16_4_1'][-1])


#### Randomize ######
# Seed the global random number generators before anything stochastic runs,
# so that "Restart Kernel -> Run All" starts from the same RNG state (weight
# initialisation and dropout in torch draw from the global torch generator).
# NOTE(review): this presumably also pins the split below, assuming
# cross_val draws from one of these global generators -- confirm; if it uses
# its own generator, a seed has to be passed to it instead.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Split the data and store the resulting ids on the data dict under the keys
# that trainer.evaluate_set is later called with.
train_ids, val_ids, test_ids = cross_val.random_stratified_splits(data)
data['train_ids'] = train_ids
data['val_ids'] = val_ids
data['test_ids'] = test_ids

############# Stats ########### 
# unnormalized_data = data_manager.get_data_for_training_in_dict_format(*student_list, normalize=False, fill_na=False)
# statistics_df, raw_df = statistics.get_statistics_on_data_dict(unnormalized_data, feature_list)

# Label counts for the train / val / test splits stored on `data`.
label_counts = statistics.get_train_test_val_label_counts_from_raw_data(data)
print(label_counts)

In [None]:
################################## Init ##################################
# Model and optimisation hyper-parameters.
hidden_size = 64
dropout = 0.85
learning_rate = 5e-5
n_epochs = 150

# Input sizes are read off the first entry of the data dict.
# NOTE(review): index 0 presumably holds the per-step feature vectors and
# index 3 the covariates -- confirm against the data manager.
first_key = next(iter(data['data']))
first_entry = data['data'][first_key]
num_features = len(first_entry[0][0])
covariates = len(first_entry[3])

In [None]:
# Convert the data dict with the GRU-D tensorify helper before training.
# NOTE(review): `data` is rebound to the converted result, so re-running this
# cell feeds already-converted data back in -- confirm the helper tolerates it.
data = tensorify.tensorify_data_gru_d(data)

# Per-split results; the training loop appends [left_out_student, f1] pairs.
val_f1_over_splits, test_f1_over_splits = [], []

# Train one model per split. There is currently a single split; the loop
# variable is named `split_data` so it does not shadow the notebook-level `data`.
for split_data, left_out_student in [(data, 1)]:

    print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
    print("Left Out Student: ", left_out_student)

    # A fresh model, optimizer and loss for every split.
    model = simple_models.SimpleLSTM(num_features=num_features,
                                     num_classes=3,
                                     hidden_size=hidden_size,
                                     bidirectional=True,
                                     dropout=dropout,
                                     covariates=covariates)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries()

    best_val_f1 = 0
    # Test F1 measured at the epoch with the best validation F1 (see below).
    best_test_f1 = 0

    for epoch in range(n_epochs):
        # Only the train call receives the optimizer.
        # NOTE(review): presumably that is what makes evaluate_set update the
        # weights, while the val/test calls only evaluate -- confirm in trainer.
        train_loss, train_labels, train_preds = trainer.evaluate_set(split_data, 'train_ids', model, criterion,
                                                                     optimizer, train_covariates=True)
        val_loss, val_labels, val_preds = trainer.evaluate_set(split_data, 'val_ids', model, criterion,
                                                               train_covariates=True)
        test_loss, test_labels, test_preds = trainer.evaluate_set(split_data, 'test_ids', model, criterion,
                                                                  train_covariates=True)

        loss_over_epochs['train_loss'].append(train_loss)
        loss_over_epochs['val_loss'].append(val_loss)
        loss_over_epochs['test_loss'].append(test_loss)

        # Each result is a (precision, recall, f-score, support) tuple, so
        # index 2 below is the weighted F1.
        train_scores = metrics.precision_recall_fscore_support(train_labels, train_preds, average='weighted')
        val_scores = metrics.precision_recall_fscore_support(val_labels, val_preds, average='weighted')
        test_scores = metrics.precision_recall_fscore_support(test_labels, test_preds, average='weighted')

        scores_over_epochs['train_scores'].append(train_scores)
        scores_over_epochs['val_scores'].append(val_scores)
        scores_over_epochs['test_scores'].append(test_scores)

        # Progress report every 10th epoch.
        if epoch % 10 == 0:
            print("xxxxxxxxxxxxxx epoch: {} xxxxxxxxxxxxxx".format(epoch))
            plotting.plot_score_over_n_epochs(scores_over_epochs, score_type='f1', fig_size=(8,5))
            plotting.plot_loss_over_n_epochs(loss_over_epochs, fig_size=(8, 5))
            print("Cofusion Matrix For Val Set: ")
            print(tabulate(metrics.confusion_matrix(val_labels, val_preds)))
            print("Predicted Label Distribution:")
            print(statistics.get_train_test_val_label_counts_from_predictions(train_preds, val_preds, test_preds))

        # Model selection looks at the validation set only. The test F1 is
        # recorded at the epoch where validation F1 peaks; taking the maximum
        # test F1 over all epochs would pick the epoch using the test set
        # itself and report an optimistically biased score.
        if val_scores[2] > best_val_f1:
            best_val_f1 = val_scores[2]
            best_test_f1 = test_scores[2]

    val_f1_over_splits.append([left_out_student, best_val_f1])
    test_f1_over_splits.append([left_out_student, best_test_f1])

    

In [None]:
# Validation results first, then test results, one list per line.
print(val_f1_over_splits, test_f1_over_splits, sep='\n')

In [None]:
# Sanity check on the splits: turn each id collection into a set and print
# the pairwise intersections. Every printed set should be empty.
train_ids, val_ids, test_ids = (
    set(data[split_key]) for split_key in ('train_ids', 'val_ids', 'test_ids')
)

print(train_ids & val_ids)
print(val_ids & test_ids)
print(train_ids & test_ids)