In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from datetime import datetime

# Make the TFT checkout (<launch dir>/repo/tft) the working directory; later
# cells use paths relative to it.
# The guard keeps this cell idempotent: after the first run the kernel is
# already inside repo/tft, and resolving 'repo/tft' again from there would
# point at a non-existent repo/tft/repo/tft and raise.
_launch_dir = os.getcwd()
if _launch_dir.endswith(os.sep + os.path.join('repo', 'tft')):
    tft_dir = _launch_dir
    repo_dir = os.path.dirname(tft_dir)
else:
    repo_dir = os.path.join(_launch_dir, 'repo')
    tft_dir = os.path.join(repo_dir, 'tft')
    os.chdir(tft_dir)

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing the filter while debugging.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# List the current working directory (shell escape).
!ls

 carriage_services.csv		     'outputs_mobiact(v2)(undersampling)'
 data_formatters		     'outputs_mobiact(v2)_wo_pinfo'
 data_formatters.zip		      outputs_mobiact_wo_age
 dlr_preprocessed		      outputs_mobiact_wo_gender
 dlr_tft_results		      outputs_mobiact_wo_known
 expt_settings			      outputs_mobiact_wo_known_no_bioinfo
 expt_settings.zip		      outputs_mobiact_wo_known_no_bioinfo2
 libs				      outputs_notchFall
 libs.zip			     'outputs_notchFall(oversampling)'
 mobiact_dataset		      outputs_notchFall_swa
 MobiAct_Dataset_v2.0.rar	     'outputs_notchFall(undersampling)'
 MobiAct_Dataset_v2-Copy1.0.rar       outputs_pm2.5
 mobiact_preprocessed		      outputs_smartFall
 MobiAct_preprocessed.zip	     'outputs_smartFall(oversampling)'
 mobi_tft_wo_known_no_bioinfo	      outputs_smartFall_swa
 mobi_tft_wo_known_results	     'outputs_smartFall(undersampling)'
 notch_dataset			      outputs_stock
 output_folder			      outputs_stock_v2
 outputs			      outputs_st

In [3]:
# Root folder to save experiment outputs.
output_folder = os.path.join(os.getcwd(), 'outputs_mobiact_wo_known_no_bioinfo2')
# Not read by any cell visible in this notebook; kept so later code that
# references it keeps working.
force_download = False
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell can be re-run safely.
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Read the dataset README into a list of lines. The context manager closes
# the handle even if reading fails part-way through.
with open('mobiact_dataset/Readme.txt', 'r', encoding='latin1') as file:
    strings = file.readlines()

# Echo the README; each line keeps its own trailing newline, so the printed
# output is double-spaced.
for s in strings:
    print(s)



The NEW version of the MobiAct dataset includes:

 	Four different types of falls performed by 66 participants

 	Eleven different types of ADLs performed by 19 participants and nine types of ADLs performed by 59 participants (plus one activity "LYI" which results from the inactivity period after a fall by 66 participants)

 	Five sub-scenarios which construct one scenario of daily living, which consists of a sequence of 50 activities and performed by 19 participants.



The new released version of the MobiAct dataset includes:

  The raw recorded data in txt format, separated by each activity

  The annotated data in csv format, separated by each activity





Filename format:

<ADL OR FALL OR SCENARIO_CODE>_<SENSOR_CODE>_<SUBJECT_ID>_<TRIAL_NO>.txt



examples:

1 -->	WAL_acc_5_1.txt

2 -->	STD_ori_9_5.txt

3 -->	FKL_gyro_3_2.txt

4 -->	SRH_acc_1_1.txt





Subjects:

+------+---------+-----------+-------+----------+----------+----------+

|  ID  |  Name   |  Surname  |  Age  

In [5]:
def _pipe_cells(line):
    """Split a '|'-delimited README table row into whitespace-stripped cells."""
    return [cell.strip() for cell in line.split('|')]


# Subject rows: lines that contain both 'sub' and '|' and split into exactly
# 9 cells. Only cells 3..7 are kept (the first three and the trailing empty
# cell are discarded).
person_list = [
    cells[3:-1]
    for cells in (_pipe_cells(line) for line in strings if 'sub' in line and '|' in line)
    if len(cells) == 9
]

# Activity rows: any '|' line that splits into exactly 8 cells; the empty
# leading and trailing cells are discarded.
activity_list = [
    cells[1:-1]
    for cells in (_pipe_cells(line) for line in strings if '|' in line)
    if len(cells) == 8
]

In [6]:
# Presumably the codes of the four fall types in MobiAct; not referenced by
# the cells visible in this notebook.
falls = ['FOL', 'FKL', 'BSC', 'SDL']

# Per-participant attributes parsed from the README subject table.
columns = ['name', 'age', 'height', 'weight', 'gender']
person_info = pd.DataFrame(person_list, columns=columns)

# Activity table: the first parsed row is used as the header and then removed
# from the data.
activity_info = pd.DataFrame(activity_list)
activity_info.columns = activity_info.iloc[0]
activity_info = activity_info.drop(0)
# NOTE(review): drops the row labelled 13 - presumably a repeated header or
# separator row from a second README table; confirm against Readme.txt.
activity_info = activity_info.drop(13)
activity_info = activity_info.reset_index(drop=True)
# Re-index the table by its 'No.' column and remove that column from the data.
index = activity_info['No.']
activity_info = activity_info.drop(['No.'], axis=1)
activity_info.index = index
# Integer code per activity in table order (0 .. n-1).
# NOTE(review): the LabelEncoder fitted later on activity_info['Label'] assigns
# codes in sorted-label order, which may differ from this positional code -
# confirm which encoding downstream code relies on.
activity_info['label_encoded'] = list(range(len(activity_info)))

# Data preprocessing

In [7]:
# Collect the CSV files that sit directly in the dataset folder.
data_dir = 'mobiact_dataset/'
file_list = [entry for entry in os.listdir(data_dir) if entry.endswith('.csv')]
# Peek at the first few names.
file_list[:3]

['FKL_20_3_annotated.csv', 'FKL_17_3_annotated.csv', 'SCH_22_6_annotated.csv']

In [8]:
# Load the pre-split train / validation / test frames; the first CSV column
# is used as the index.
train, valid, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'mobiact_preprocessed/train.csv',
        'mobiact_preprocessed/valid.csv',
        'mobiact_preprocessed/test.csv',
    )
)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the activity labels from the README table, then apply it
# to the 'label' column of every split.
encoder = LabelEncoder().fit(activity_info['Label'])

train_encoded, valid_encoded, test_encoded = (
    encoder.transform(split['label']) for split in (train, valid, test)
)

train['label_encoded'] = train_encoded
valid['label_encoded'] = valid_encoded
test['label_encoded'] = test_encoded

In [10]:
# Add constant-zero 'known' and 'circum' columns to every split. The formatter
# declares them as the known and static inputs, so they act as placeholders
# that carry no information.
for split in (train, valid, test):
    for placeholder in ('known', 'circum'):
        split[placeholder] = 0

In [11]:
# Preview the first rows of the training frame, including the columns added above.
train.head()

Unnamed: 0,timestamp,rel_time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,...,label,person_id,person_age,person_height,person_weight,person_gender,trial_num,label_encoded,known,circum
0,4122885402000,0.0,3.238091,8.132329,4.245767,0.04948,-0.007636,-0.013744,273.7367,-67.5933,...,SIT,6,22,172,62,F,4,11,0,0
1,4122890370000,0.004968,3.256741,8.139323,4.259755,0.045204,-0.008552,-0.00336,274.05545,-67.167404,...,SIT,6,22,172,62,F,4,11,0,0
2,4122895406000,0.010004,3.275647,8.146413,4.273935,0.040012,-0.004887,-0.009163,274.36743,-66.75942,...,SIT,6,22,172,62,F,4,11,0,0
3,4122900447000,0.015045,3.294929,8.154904,4.283354,0.032681,-0.011301,-0.018631,274.72174,-66.325455,...,SIT,6,22,172,62,F,4,11,0,0
4,4122905384000,0.019982,3.314145,8.164513,4.288158,0.029322,-0.002443,-0.014966,275.06073,-65.9031,...,SIT,6,22,172,62,F,4,11,0,0


# Data Formatter

In [12]:
from data_formatters.base import GenericDataFormatter, DataTypes, InputTypes

# View available data types and input types.
enum_sections = (
    ("Available data types:", DataTypes),
    ("Avaialbe input types:", InputTypes),
)
for position, (heading, enum_cls) in enumerate(enum_sections):
    if position:
        # Blank line between the two listings.
        print()
    print(heading)
    for option in enum_cls:
        print(option)

Available data types:
DataTypes.REAL_VALUED
DataTypes.CATEGORICAL
DataTypes.DATE

Avaialbe input types:
InputTypes.TARGET
InputTypes.OBSERVED_INPUT
InputTypes.KNOWN_INPUT
InputTypes.STATIC_INPUT
InputTypes.ID
InputTypes.TIME


In [13]:
from libs import utils 
import sklearn.preprocessing
class MobiActFormatter(GenericDataFormatter):
    """Data formatter for the MobiAct activity dataset used with the TFT model.

    Declares which columns are fed to the model and in which role, fits the
    scalers / label encoders on the training split, applies them to every
    split, and maps scaled predictions back to the original target scale.
    """

    # (column name, data type, input role) for every column the model reads.
    # 'known' and 'circum' fill the known-input and static-input roles; the
    # encoded activity label is treated as a real-valued regression target.
    _column_definition = [
        ('person_id', DataTypes.CATEGORICAL, InputTypes.ID),
        ('rel_time', DataTypes.REAL_VALUED, InputTypes.TIME),
        ('known', DataTypes.CATEGORICAL, InputTypes.KNOWN_INPUT),
        ('circum', DataTypes.CATEGORICAL, InputTypes.STATIC_INPUT),
        ('acc_x', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('acc_y', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('acc_z', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('gyro_x', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('gyro_y', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('gyro_z', DataTypes.REAL_VALUED, InputTypes.OBSERVED_INPUT),
        ('label_encoded', DataTypes.REAL_VALUED, InputTypes.TARGET)
    ]

    def __init__(self):
        """Initialises the formatter with no scalers fitted yet."""
        self.identifiers = None
        self._real_scalers = None
        self._cat_scalers = None
        self._target_scaler = None
        self._num_classes_per_cat_input = None

    def split_data(self, train, valid, test):
        """Fits the scalers on `train` and transforms the three given splits.

        The data is already split by the caller; no re-partitioning happens
        here.

        Args:
          train: Training data frame (also used to calibrate the scalers).
          valid: Validation data frame.
          test: Test data frame.

        Returns:
          A generator yielding the transformed train, validation and test
          frames, in that order. The transforms run lazily, when the result
          is unpacked or iterated.
        """
        print('Formatting train-valid-test splits.')

        self.set_scalers(train)

        return (self.transform_inputs(data) for data in [train, valid, test])

    def set_scalers(self, df):
        """Calibrates scalers using the data supplied.

        Args:
          df: Data to use to calibrate scalers.
        """
        print('Setting scalers with training data...')

        column_definitions = self.get_column_definition()

        id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                       column_definitions)
        target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                           column_definitions)
        # Extract identifiers in case required.
        self.identifiers = list(df[id_column].unique())

        # One joint StandardScaler over the real-valued columns returned by the
        # helper (presumably every real-valued column except ID / TIME roles).
        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        data = df[real_inputs].values
        self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)

        # Separate scaler fitted on the target column alone; it is the one
        # format_predictions uses to un-scale model outputs.
        self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
            df[[target_column]].values)

        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Label-encode each categorical column on its string representation
        # and record how many distinct classes it has.
        categorical_scalers = {}
        num_classes = []
        for col in categorical_inputs:
            srs = df[col].apply(str)
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
            num_classes.append(srs.nunique())

        # Set categorical scaler outputs.
        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes

    def transform_inputs(self, df):
        """Performs feature transformations.

        Includes feature engineering, preprocessing and normalization.

        Args:
          df: Data frame to transform.

        Returns:
          Transformed copy of the data frame; `df` itself is not modified.

        Raises:
          ValueError: If the scalers have not been fitted yet.
        """
        output = df.copy()

        # Both scalers are needed below, so refuse to run if either is missing
        # (the previous `and` only caught the case where both were unset).
        if self._real_scalers is None or self._cat_scalers is None:
            raise ValueError('Scalers have not been set!')

        column_definitions = self.get_column_definition()
        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME}
        )
        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)
        for col in categorical_inputs:
            string_df = df[col].apply(str)
            output[col] = self._cat_scalers[col].transform(string_df)
        return output

    def format_predictions(self, predictions):
        """Reverts the target scaling on a frame of model outputs.

        Args:
          predictions: Data frame of model outputs. Every column other than
            'forecast_time' and 'identifier' is treated as scaled target values.

        Returns:
          Copy of `predictions` with those columns mapped back to the original
          target scale.
        """
        output = predictions.copy()

        for col in predictions.columns:
            if col not in {'forecast_time', 'identifier'}:
                # StandardScaler expects a 2-D (n_samples, 1) array; a bare
                # 1-D Series is rejected by current scikit-learn releases.
                scaled = predictions[col].values.reshape(-1, 1)
                output[col] = self._target_scaler.inverse_transform(scaled).ravel()

        return output

    def get_fixed_params(self):
        """Returns the fixed (non-tuned) experiment parameters."""
        fixed_params = {
            'total_time_steps': 87,       # Total width of the Temporal Fusion Decoder
            'num_encoder_steps': 43,      # Length of LSTM encoder (i.e. # historical inputs)
            'num_epochs': 100,            # Max number of epochs for training
            'early_stopping_patience': 5, # Early stopping threshold for # iterations with no loss improvement
            'multiprocessing_workers': 5  # Number of multi-processing workers
        }

        return fixed_params

    def get_default_model_params(self):
        """Returns the default model hyperparameters for this dataset."""
        model_params = {
            'dropout_rate': 0.3,
            'hidden_layer_size': 160,
            'learning_rate': 0.01,
            'minibatch_size': 64,
            'max_gradient_norm': 0.01,
            'num_heads': 4,
            'stack_size': 1
        }

        return model_params

In [14]:
# Fit the scalers on the training split and transform all three splits.
# NOTE: the splits are overwritten in place, so re-running this cell without
# reloading the CSVs would scale already-scaled data.
data_formatter = MobiActFormatter()
formatted_splits = data_formatter.split_data(train, valid, test)
train, valid, test = formatted_splits

# Number of samples to draw when caching batched training / validation data.
calibration_counts = data_formatter.get_num_samples_for_calibration()
train_samples, valid_samples = calibration_counts

Formatting train-valid-test splits.
Setting scalers with training data...


# Model

In [15]:
import libs.hyperparam_opt
import libs.tft_model
import libs.utils as util

# Model class to train, plus the two inputs the hyperparameter manager needs:
# the experiment parameters from the formatter and the model's declared
# hyperparameter choices.
ModelClass = libs.tft_model.TemporalFusionTransformer
data_params = data_formatter.get_experiment_params()
param_ranges = ModelClass.get_hyperparm_choices()

In [16]:
# Hand-picked hyperparameters for this run (used instead of a search).
model_params = dict(
    dropout_rate=0.3,        # Dropout discard rate
    hidden_layer_size=320,   # Internal state size of TFT
    learning_rate=0.001,     # ADAM initial learning rate
    minibatch_size=512,      # Minibatch size for training
    max_gradient_norm=100.,  # Max norm for gradient clipping
    num_heads=4,             # Number of heads for multi-head attention
    stack_size=1,            # Number of stacks (default 1 for interpretability)
)

In [19]:
# Folder where checkpoints for this fixed-parameter run are written and read.
# NOTE: every later cell (hyperparameter manager, training, evaluation) must
# see this same value; if this cell is edited, re-run everything below it so
# training and evaluation do not point at different folders.
model_folder = os.path.join(output_folder, 'saved_models', 'MobiAct_wo_known_no_bioinfo2', 'fixed')

# Add the checkpoint folder and the formatter's experiment parameters to the
# model parameters (data_params entries win on any key clash).
model_params.update({'model_folder': model_folder, **data_params})

In [18]:
# Create the hyperparameter manager for this run; it is given the model's
# hyperparameter choices, the formatter's experiment parameters and the
# checkpoint folder.
HyperparamOptManager = libs.hyperparam_opt.HyperparamOptManager
print("*** Loading hyperparm manager ***")
opt_manager = HyperparamOptManager(param_ranges, data_params, model_folder)

*** Loading hyperparm manager ***


In [19]:
# Set restart_opt to a truthy value to discard stored results and start over.
restart_opt = None
success = opt_manager.load_results()

if restart_opt or not success:
    print("Creating new hyperparameter optimisation")
    opt_manager.clear()
else:
    print("Loaded results from previous training")

Loading results from /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo/fixed
Loaded results from previous training


In [22]:
import tensorflow as tf
from libs.tft_model import TemporalFusionTransformer

# Specify GPU usage.
# NOTE(review): the GPU id is hard-coded to '3'; adjust it for the machine
# this notebook runs on.
tf_config = utils.get_default_tensorflow_config(tf_device="gpu", gpu_id='3')

Selecting GPU ID=3


# Training

In [21]:
# Start from a clean default graph so re-running this cell does not build on
# whatever a previous run left behind. The earlier get_default_graph() call
# only returned the graph and discarded it; the evaluation cell already uses
# reset_default_graph().
tf.compat.v1.reset_default_graph()
with tf.Graph().as_default(), tf.compat.v1.Session(config=tf_config) as sess:
    tf.compat.v1.keras.backend.set_session(sess)
    # NOTE(review): `params` is only passed to update_score below; the model
    # itself is built from `model_params`. Confirm the recorded score is meant
    # to be filed under `params`.
    params = opt_manager.get_next_parameters()
    # Create a TFT model
    model = TemporalFusionTransformer(model_params,
                                      use_cudnn=True)  # Run model on GPU using CuDNNLSTM cells

    # Sample data into minibatches for training
    if not model.training_data_cached():
        model.cache_batched_data(train, "train", num_samples=train_samples)
        model.cache_batched_data(valid, "valid", num_samples=valid_samples)

    # Train and save model
    model.fit()

    val_loss = model.evaluate()
    if np.allclose(val_loss, 0.) or np.isnan(val_loss):
        # Set all invalid losses to infinity.
        # N.b. val_loss only becomes 0. when the weights are nan.
        print("Skipping bad configuration....")
        val_loss = np.inf
    opt_manager.update_score(params, val_loss, model)
    # Re-bind this session before saving, in case update_score changed the
    # active Keras session.
    tf.compat.v1.keras.backend.set_session(sess)
    model.save(model_folder)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:0f:00.0, compute capability: 7.0

Resetting temp folder...
*** TemporalFusionTransformer params ***
# dropout_rate = 0.3
# hidden_layer_size = 320
# learning_rate = 0.001
# minibatch_size = 512
# max_gradient_norm = 100.0
# num_heads = 4
# stack_size = 1
# model_folder = /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo/fixed
# total_time_steps = 87
# num_encoder_steps = 43
# num_epochs = 100
# early_stopping_patience = 5
# multiprocessing_workers = 5
# column_definition = [('person_id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>), ('rel_time', <DataTypes.DATE: 2>, <InputTypes.TIME: 5>), ('known', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>), ('a

Cached data "train" updated
Cached data "valid" updated
*** Fitting TemporalFusionTransformer ***
Getting batched_data
Using cached training data
Using cached validation data
Using keras standard fit
Train on 11188446 samples, validate on 834023 samples
Epoch 1/100
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Cannot load from /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo/fixed/tmp, skipping ...
Using cached validation data
Optimal model found, updating

Model saved to: /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo/fixed/TemporalFusionTransformer.ckpt
Model saved to: /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo/fixed/TemporalFusionTransformer.c

# Evaluation

In [23]:
def extract_numerical_data(data):
    """Strips out forecast time and identifier columns."""
    value_columns = [
        name for name in data.columns
        if name not in {"forecast_time", "identifier"}
    ]
    return data[value_columns]


tf.compat.v1.reset_default_graph()
with tf.Graph().as_default(), tf.compat.v1.Session(config=tf_config) as sess:
    tf.compat.v1.keras.backend.set_session(sess)

    # Rebuild the model and restore the weights saved by the training cell.
    model = TemporalFusionTransformer(model_params, use_cudnn=True)
    model.load(model_folder)

    # Forecast on the test split, keeping the targets alongside.
    output_map = model.predict(test, return_targets=True)

    # Map targets and both quantile forecasts back to the original scale.
    targets, p50_forecast, p90_forecast = (
        data_formatter.format_predictions(output_map[key])
        for key in ("targets", "p50", "p90")
    )

    # Normalised quantile loss for each forecast quantile.
    p50_loss, p90_loss = (
        utils.numpy_normalised_quantile_loss(
            extract_numerical_data(targets),
            extract_numerical_data(quantile_forecast),
            quantile)
        for quantile_forecast, quantile in ((p50_forecast, 0.5), (p90_forecast, 0.9))
    )

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:0f:00.0, compute capability: 7.0

Resetting temp folder...
*** TemporalFusionTransformer params ***
# dropout_rate = 0.3
# hidden_layer_size = 320
# learning_rate = 0.001
# minibatch_size = 512
# max_gradient_norm = 100.0
# num_heads = 4
# stack_size = 1
# model_folder = /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo2/fixed
# total_time_steps = 87
# num_encoder_steps = 43
# num_epochs = 100
# early_stopping_patience = 5
# multiprocessing_workers = 5
# column_definition = [('person_id', <DataTypes.CATEGORICAL: 1>, <InputTypes.ID: 4>), ('rel_time', <DataTypes.REAL_VALUED: 0>, <InputTypes.TIME: 5>), ('acc_x', <DataTypes.REAL_VALUED: 0>, <InputTypes.OBSERVED_INPU


Loading model from /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo2/fixed/TemporalFusionTransformer.ckpt
Unsuccessful TensorSliceReader constructor: Failed to find any matching files for /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo2/fixed/TemporalFusionTransformer.ckpt


ValueError: The passed save_path is not a valid checkpoint: /workspace/inkyung/TFT/repo/tft/outputs_mobiact_wo_known_no_bioinfo2/saved_models/MobiAct_wo_known_no_bioinfo2/fixed/TemporalFusionTransformer.ckpt

In [None]:
import datetime as dte
# Timestamp the end of the run. The module is imported under an alias because
# the first cell already binds the name `datetime` to the class.
print("Hyperparam optimisation completed @ {}".format(dte.datetime.now()))
# Reporting of the best validation loss and best_params is disabled here:
# print("Best validation loss = {}".format(val_loss))
# print("Params:")

# for k in best_params:
#     print(k, " = ", best_params[k])
#     print()
# Mean normalised quantile losses computed by the evaluation cell.
print("Normalised quantile losses: P50={}, P90={}".format(p50_loss.mean(), p90_loss.mean()))

In [22]:
import itertools
def plot_confusion_matrix(cm, target_names=None, cmap=None, normalize=True, labels=True, title='Confusion matrix'):
    """Draws a confusion matrix as a heat map with optional per-cell values.

    Args:
        cm: Square confusion matrix (rows = true class, columns = predicted).
        target_names: Optional class names used as tick labels on both axes.
        cmap: Matplotlib colormap; defaults to 'Blues'.
        normalize: If True, each row is divided by its sum so cells show the
            fraction of that true class. Rows with no samples are shown as 0.
        labels: If True, the value of every cell is written inside it.
        title: Accepted for backward compatibility; the title is not drawn.

    Returns:
        None. The figure is displayed with plt.show().
    """
    cm = np.asarray(cm)

    # Accuracy comes from the raw counts, before any normalisation.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Guarded division: a class absent from the true labels has a zero row
        # sum, which would otherwise turn the row (and the colour threshold
        # derived from cm.max()) into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(16, 12))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    # Cells brighter than this threshold get white text, others black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names)
        plt.yticks(tick_marks, target_names)

    if labels:
        cell_format = "{:0.2f}" if normalize else "{:,}"
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cell_format.format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black",
                     fontsize='xx-large')

    plt.ylabel('True label', fontsize=30)
    plt.xlabel('Predicted label\naccuracy={:0.2f}; misclass={:0.2f}'.format(accuracy, misclass), fontsize=30)
    # Lay out after the large axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

In [23]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report

def getScores(forecast, target, num_classes=17):
    """Scores real-valued forecasts against targets as a classification task.

    Every value is mapped to the integer class k whose half-open interval
    [k - 0.5, k + 0.5) contains it, for k in 0 .. num_classes - 1. Forecasts
    outside that range are clipped to the nearest valid class, so the
    predicted and true label lists always stay aligned element-for-element.
    (Previously such forecasts were silently skipped, which shifted every
    later prediction against its target.)

    Args:
        forecast: Data frame of forecast values, one column per horizon step.
        target: Data frame of target values with the same shape as `forecast`.
        num_classes: Number of classes; the default of 17 matches the
            labels 0..16 that were previously hard-coded.

    Returns:
        Tuple (accuracy, macro precision, macro recall, macro F1).
        The sklearn classification report is printed as a side effect.

    Raises:
        ValueError: If either frame contains NaN / infinite values, or if a
            target value falls outside the valid class range.
    """
    # Bin edges -0.5, 0.5, ..., num_classes - 0.5.
    edges = np.arange(num_classes + 1) - 0.5

    def _to_bins(frame, what):
        """Flattens a frame row by row and returns each value's class index."""
        values = np.asarray(frame.values, dtype=float).ravel()
        if not np.all(np.isfinite(values)):
            raise ValueError('{} contains NaN or infinite values'.format(what))
        # digitize returns i with edges[i-1] <= v < edges[i]; class is i - 1.
        return values, np.digitize(values, edges) - 1

    _, forecast_bins = _to_bins(forecast, 'forecast')
    target_values, target_bins = _to_bins(target, 'target')

    invalid = (target_bins < 0) | (target_bins >= num_classes)
    if invalid.any():
        raise ValueError('target values outside the class range: {}'.format(
            np.unique(target_values[invalid])))

    # Float labels keep the printed report identical to earlier runs
    # (classes appear as 0.0, 1.0, ...).
    predicted = np.clip(forecast_bins, 0, num_classes - 1).astype(float)
    transformed_target = target_bins.astype(float)

    recall = recall_score(transformed_target, predicted, average='macro')
    precision = precision_score(transformed_target, predicted, average='macro')
    f1 = f1_score(transformed_target, predicted, average='macro')
    acc = accuracy_score(transformed_target, predicted)
    print(classification_report(transformed_target, predicted))
    return acc, precision, recall, f1

In [24]:
# Show which identifiers appear in the P50 forecasts, then preview the frame.
print(p50_forecast['identifier'].unique())
p50_forecast.head()

[54 55 56 57 58 59 60 61 62 63 64 65 66 67]


Unnamed: 0,forecast_time,identifier,t+0,t+1,t+2,t+3,t+4,t+5,t+6,t+7,...,t+34,t+35,t+36,t+37,t+38,t+39,t+40,t+41,t+42,t+43
0,0.20999,54,11.999058,11.999049,11.999048,11.999082,11.999079,11.999078,11.999076,11.999075,...,11.999051,11.999051,11.99905,11.99905,11.99905,11.99905,11.999049,11.999049,11.999049,11.999049
1,0.215368,54,11.999058,11.999049,11.999048,11.999082,11.999079,11.999078,11.999076,11.999075,...,11.999051,11.999051,11.99905,11.99905,11.99905,11.99905,11.999049,11.999049,11.999049,11.999049
2,0.220053,54,11.999058,11.999049,11.999048,11.999082,11.999079,11.999078,11.999076,11.999075,...,11.999051,11.999051,11.99905,11.99905,11.99905,11.99905,11.999049,11.999049,11.999049,11.999049
3,0.225053,54,11.999058,11.999049,11.999048,11.999082,11.999079,11.999078,11.999076,11.999075,...,11.999051,11.999051,11.99905,11.99905,11.99905,11.99905,11.999049,11.999049,11.999049,11.999049
4,0.230049,54,11.999059,11.999049,11.999048,11.999082,11.999079,11.999078,11.999076,11.999075,...,11.999051,11.999051,11.99905,11.99905,11.99905,11.99905,11.999049,11.999049,11.999049,11.999049


In [25]:
# Score the P50 forecasts separately for every identifier in the test set,
# save each result as a one-row CSV, and accumulate totals for averaging.
test_per_list = p50_forecast['identifier'].unique()
save_dir = 'mobi_tft_wo_known_no_bioinfo/'
# Create the results folder up front so to_csv cannot fail on a fresh checkout.
os.makedirs(save_dir, exist_ok=True)
total_acc, total_precision, total_recall, total_f1 = 0., 0., 0., 0.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
meta_columns = ['forecast_time', 'identifier']

for num in test_per_list:
    # Keep only this identifier's rows and strip the non-numeric columns.
    forecast = p50_forecast[p50_forecast['identifier'] == num].drop(meta_columns, axis=1)
    target = targets[targets['identifier'] == num].drop(meta_columns, axis=1)

    acc, precision, recall, f1 = getScores(forecast, target)

    result = pd.DataFrame([[acc, precision, recall, f1]], columns=metric_columns)
    result.to_csv(os.path.join(save_dir, str(num) + 'result.csv'))

    total_acc += acc
    total_precision += precision
    total_recall += recall
    total_f1 += f1

    print('Done ', num)

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     64900
         2.0       0.97      0.97      0.97    172700
         3.0       0.97      0.97      0.97    190476
         4.0       0.95      0.95      0.95     62612
         5.0       0.95      0.95      0.95     55044
         6.0       1.00      1.00      1.00    766392
         7.0       1.00      1.00      1.00    767580
         8.0       0.98      0.98      0.98    695992
         9.0       0.95      0.95      0.95    108900
        10.0       0.95      0.95      0.95     58828
        11.0       0.93      0.93      0.93    273108
        12.0       0.98      0.98      0.98   3142700
        13.0       0.98      0.98      0.98    363836
        14.0       0.99      0.99      0.99    420464
        15.0       1.00      1.00      1.00   2626668

    accuracy                           0.99   9770200
   macro avg       0.97      0.97      0.97   9770200
weighted avg       0.99   

              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94     52888
         1.0       0.93      0.93      0.93    104500
         2.0       0.96      0.96      0.96    214544
         3.0       0.96      0.96      0.96    236808
         4.0       0.94      0.94      0.94     51524
         5.0       0.93      0.93      0.93     42196
         6.0       1.00      1.00      1.00    993036
         7.0       1.00      1.00      1.00    880440
         8.0       0.98      0.98      0.98    740124
         9.0       0.94      0.94      0.94    132748
        10.0       0.94      0.94      0.94     52360
        11.0       0.99      0.99      0.99   2262678
        12.0       0.99      0.99      0.99   4940210
        13.0       0.99      0.99      0.99    549384
        14.0       0.99      0.99      0.99    595496
        15.0       1.00      1.00      1.00   5310272

    accuracy                           0.99  17159208
   macro avg       0.97   

In [26]:
# Unweighted mean of each metric over the scored identifiers.
num_scored = len(test_per_list)
for caption, metric_total in (
    ('Average Accuracy: ', total_acc),
    ('Average Precision: ', total_precision),
    ('Average Recall: ', total_recall),
    ('Average F1 Score: ', total_f1),
):
    print(caption, metric_total / num_scored)

Average Accuracy:  0.9883476832222431
Average Precision:  0.9690480408325184
Average Recall:  0.9690590630089096
Average F1 Score:  0.969053548255309
