In [1]:
import numpy as np
import pandas as pd
import h5py
import scipy
import math
import re
import json
import os


import tensorflow as tf
from tensorflow.keras.callbacks import (ModelCheckpoint, TensorBoard, ReduceLROnPlateau,
                                        CSVLogger, EarlyStopping, TerminateOnNaN)
from csv import writer
from datetime import datetime

from load_dataset import create_dataset, read_and_random_undersampling_dataset, hdf5_file_dataset
from callback_save_files import TrainingCallback, FitCallback
import local_settings

WORKING_DIRECTORY = local_settings.WORKING_DIRECTORY #'/home/buliabog/Diploma/'
logs_dir = f'{WORKING_DIRECTORY}Data/logs/'
file_name = '20240406 Calc Scikit-learn clasifiers'
task_name = ''

logs_directory = os.path.join(logs_dir, file_name)
metrics_file = os.path.join(logs_directory, 'f1.csv')


val_filepath = f'{WORKING_DIRECTORY}Data/CODE-15/val'
train_filepath = f'{WORKING_DIRECTORY}Data/CODE-15/train'

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve

clasifiers = [#{'name': "AdaBoost"
              #  , 'clasifier': AdaBoostClassifier(random_state=42)
              #  , 'type': 'sklearn'},
               {'name': "MLP"
                , 'clasifier': MLPClassifier(hidden_layer_sizes=(50, 40, 30, 20), max_iter=1000, random_state=42)
                , 'type': 'sklearn'},
               {'name': "Random Forest"
                , 'clasifier': RandomForestClassifier(
                                    max_depth=15, n_estimators=50, max_features=1, random_state=42
                                )
                , 'type': 'sklearn'
               },
              # {'name': "Decision Tree"
              #  , 'clasifier': DecisionTreeClassifier(max_depth=25, random_state=42)
              #  , 'type': 'sklearn'},
             #  {'name': "Gaussian Process"
             #   , 'clasifier': GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)
             #   , 'type': 'sklearn'},
               
              ]

import mdct

def mdct_reshaped(x):
    MDCT = np.nan_to_num(mdct.mdst(x), 0)
    MDCT = MDCT[:128, :, :].reshape(128, -1)
    mx = np.max(np.abs(MDCT))
    if mx != 0:        
        MDCT = MDCT / mx
    return MDCT

import pywt

def wavelet_transformation(x):
    DWT = []
    for i in range(x.shape[-1]):
        DWT.append(np.vstack(pywt.dwt(x[:, i], 'coif17')))
    #DWT = np.concatenate(pywt.dwt(x, 'coif17'), axis=1)
    DWT = np.vstack(DWT)
    mx = np.max(np.abs(DWT))
    if mx != 0:
        DWT = DWT / mx
    return DWT.T

domeny = [
    {'name': 'TimeDomain'
     , 'full name' : ""
     , 'dataset_type': 'TimeDomain'
     , 'X_post_processing': None},
    {'name': 'rFFT'
     , 'full name' : "Fast Fourier Transformation for real values"
     , 'dataset_type': 'ClassicFourieMagnitude'
     , 'X_post_processing': None},
    {'name': 'FFTwDW'
     , 'full name' : "Fast Fourier Transformation with Dynamic window"
     , 'dataset_type': 'ImprovedFourieMagnitude'
     , 'X_post_processing': None},
    {'name': 'MDCT'
     , 'full name' : "Modified discrete cosine transform"
     , 'dataset_type': 'TimeDomain'
     , 'X_post_processing': {'function': mdct_reshaped, 'shape': (128, 108)}
     , 'shape_X' : (4096, 12)}, 
    {'name': 'WT'
     , 'full name' : "Wavelet transform"
     , 'dataset_type': 'TimeDomain'
     , 'X_post_processing': {'function': wavelet_transformation, 'shape': (2098, 24)}
     , 'shape_X' : (4096, 12)},
    #{'name': 'STFT-FD'
    # , 'full name' : "Short-Time Fourier Transform with the Window Size Fixed in the Frequency Domain"
    # , 'dataset_type': 'TimeDomain'
    # , 'X_post_processing': {'function': STFT_FD, 'shape': (4096, 12*510)}}
    ]

lr = 0.001


if not os.path.exists(logs_directory):
            # create dir
            os.mkdir(logs_directory) 
if not os.path.exists(metrics_file):
    # create metrics_file
    with open(metrics_file, 'w') as f:
        f.write( 'Clasifier;Domena;Task;Run;TrainDuration;TestType;F1;TP;FP;FN;TN\n')

print('Exploration start.')

count_rows_done = 0
#with open(metrics_file, 'r') as f:
#    count_rows_done = (len(f.read().split('\n'))-2)/2
#print(count_rows_done)


with open(metrics_file, 'r') as f:
    rows_processed = f.read().split('\n')
    
rows_processed_keys = [';'.join(r.split(';')[:4]) for r in rows_processed]
count_rows_done = len(rows_processed_keys)
print(count_rows_done)


Exploration start.
338


In [2]:
current_row = 0
batch_size = 64
for run in range(3):
    for task in [1, 2, 0, 3, 4, 5]:
        for domena in domeny: 
            

            # val_Y = np.array(val_Y, dtype='float32')   

            if 'X_post_processing' in domena.keys() and domena['X_post_processing'] is not None:
                X_shape = domena['X_post_processing']['shape']
            else:
                X_shape = (4096, 12)  
                
            # iterate over classifierss
            for clasifier in clasifiers:
                cl_name = clasifier['name']
                cl_type = clasifier['type']
                dm_name= domena['name']
                row_key = f"{cl_name};{dm_name};{task};{run}"
                if row_key in rows_processed_keys:
                    continue
                try:
                    del test_X
                    del test_Y
                except:
                    pass
                train_X, train_Y = read_and_random_undersampling_dataset(f'{WORKING_DIRECTORY}Data/CODE-15', 'train_US_ratio_1'
                                                          #, file_num = file_number
                                                          , dataset_type = domena['dataset_type']
                                                          , undersampling = False
                                                          , return_numpy_copy = True
                                                          , X_post_processing = domena['X_post_processing']
                                                        )

                print('Train dataset is read')
                train_Y = np.array(train_Y, dtype='float32')  
                train_start_time = datetime.now()
                print(f'Clasifier {cl_name}, task {task}, domena {dm_name}, run {run} start at {train_start_time}.')
                clf = clasifier['clasifier']
                
                try:
                    clf.fit(train_X, train_Y[:, task])
                except: 
                    clf.fit(train_X.reshape(train_X.shape[0], -1), train_Y[:, task])

                duration_delta = datetime.now() - train_start_time
                
                try:
                    del train_X
                    del train_Y
                except:
                    pass
                test_X, test_Y = read_and_random_undersampling_dataset(f'{WORKING_DIRECTORY}Data/CODE-15', 'test'
                                                                          , dataset_type = domena['dataset_type']
                                                                          , undersampling = False
                                                                          , return_numpy_copy = True
                                                                          , X_post_processing = domena['X_post_processing']
                                                                        )
                print('test dataset is read')
                atest_X, atest_Y  = read_and_random_undersampling_dataset(f'{WORKING_DIRECTORY}Data/CODE-15', 'atest'
                                                                          , dataset_type = domena['dataset_type']
                                                                          , undersampling = False
                                                                          , return_numpy_copy = True
                                                                          , X_post_processing = domena['X_post_processing']
                                                                         )
                print('test gold standart is read')      
                test_Y = np.array(test_Y, dtype='float32')            
                atest_Y = np.array(atest_Y, dtype='float32') 

                test_labels = test_Y[:, task] 
                atest_labels = atest_Y[:, task] 
            
                try:
                    test_prediction = clf.predict(test_X)
                except: 
                    test_prediction = clf.predict(test_X.reshape(test_X.shape[0], -1))
                try:
                    atest_prediction = clf.predict(atest_X)
                except: 
                    atest_prediction = clf.predict(atest_X.reshape(atest_X.shape[0], -1))

                duration = duration_delta.total_seconds()
                try:
                    test_precision, test_recall, test_treshold = precision_recall_curve(test_labels
                                                                       , test_prediction
                                                                     )
                    test_f1_array = np.nan_to_num(2 * test_precision * test_recall / (test_precision + test_recall))

                    th = test_treshold[np.argmax(test_f1_array)-1]                    

                    test_TP = (test_prediction > th)*test_labels
                    test_FP = (test_prediction > th)*(1-test_labels)
                    test_FN = (test_prediction <= th)*test_labels
                    test_TN = (test_prediction <= th)*(1-test_labels)

                    test_f1 = np.sum(test_TP) / (np.sum(test_TP) + 0.5*(np.sum(test_FP) + np.sum(test_FN)))
                except:
                    th = 0 

                    test_TP = (test_prediction > th)*test_labels
                    test_FP = (test_prediction > th)*(1-test_labels)
                    test_FN = (test_prediction <= th)*test_labels
                    test_TN = (test_prediction <= th)*(1-test_labels)
                    
                    test_f1 = -1
                
                try:
                    atest_precision, atest_recall, atest_treshold = precision_recall_curve(atest_labels
                                                                   , atest_prediction
                                                                 )
                    atest_f1_array = np.nan_to_num(2 * atest_precision * atest_recall / (atest_precision + atest_recall))

                    th = atest_treshold[np.argmax(atest_f1_array)-1]                    

                    atest_TP = (atest_prediction > th)*atest_labels
                    atest_FP = (atest_prediction > th)*(1-atest_labels)
                    atest_FN = (atest_prediction <= th)*atest_labels
                    atest_TN = (atest_prediction <= th)*(1-atest_labels)

                    atest_f1 = np.sum(atest_TP) / (np.sum(atest_TP) + 0.5*(np.sum(atest_FP) + np.sum(atest_FN)))
                except:
                    th = 0 

                    atest_TP = (atest_prediction > th)*atest_labels
                    atest_FP = (atest_prediction > th)*(1-atest_labels)
                    atest_FN = (atest_prediction <= th)*atest_labels
                    atest_TN = (atest_prediction <= th)*(1-atest_labels)
                    
                    atest_f1 = -1
                              
                with open(metrics_file, 'a') as f:
                    f.write( f'{cl_name};{dm_name};{task};{run};{duration};test;{test_f1};{np.sum(test_TP)};{np.sum(test_FP)};{np.sum(test_FN)};{np.sum(test_TN)}\n')  
                    f.write( f'{cl_name};{dm_name};{task};{run};{duration};atest;{atest_f1};{np.sum(atest_TP)};{np.sum(atest_FP)};{np.sum(atest_FN)};{np.sum(atest_TN)}\n')   
                    
                print(f'Clasifier {cl_name}, domena {dm_name}, task {task}, train_duration {duration}, run {run}: F1_test = {test_f1}, F1_atest = {atest_f1}')
                
                del clf

Shapes: (52934, 2098, 24) and (52934, 6)
Train dataset is read
Clasifier MLP, task 4, domena WT, run 2 start at 2024-04-26 22:03:55.251646.
Shapes: (34581, 2098, 24) and (34581, 6)
test dataset is read
Shapes: (827, 2098, 24) and (827, 6)
test gold standart is read
Clasifier MLP, domena WT, task 4, train_duration 8873.20845, run 2: F1_test = 0.36346060898985016, F1_atest = 0.038461538461538464
Shapes: (52934, 2098, 24) and (52934, 6)
Train dataset is read
Clasifier Random Forest, task 4, domena WT, run 2 start at 2024-04-27 00:49:50.474911.
Shapes: (34581, 2098, 24) and (34581, 6)
test dataset is read
Shapes: (827, 2098, 24) and (827, 6)
test gold standart is read
Clasifier Random Forest, domena WT, task 4, train_duration 1264.732047, run 2: F1_test = 0.0, F1_atest = 0.0
Shapes: (52934, 4096, 12) and (52934, 6)
Train dataset is read
Clasifier MLP, task 5, domena TimeDomain, run 2 start at 2024-04-27 01:21:00.570406.
Shapes: (34581, 4096, 12) and (34581, 6)
test dataset is read
Shapes: 

  atest_f1_array = np.nan_to_num(2 * atest_precision * atest_recall / (atest_precision + atest_recall))


Clasifier MLP, domena FFTwDW, task 5, train_duration 454.315415, run 2: F1_test = 0.43585933630510154, F1_atest = 0.0
Shapes: (52934, 500, 12) and (52934, 6)
Train dataset is read
Clasifier Random Forest, task 5, domena FFTwDW, run 2 start at 2024-04-27 03:08:34.810715.
Shapes: (34581, 500, 12) and (34581, 6)
test dataset is read
Shapes: (827, 500, 12) and (827, 6)
test gold standart is read
Clasifier Random Forest, domena FFTwDW, task 5, train_duration 3.322511, run 2: F1_test = 0.0, F1_atest = 0.0
Shapes: (52934, 128, 108) and (52934, 6)
Train dataset is read
Clasifier MLP, task 5, domena MDCT, run 2 start at 2024-04-27 03:38:41.354929.
Shapes: (34581, 128, 108) and (34581, 6)
test dataset is read
Shapes: (827, 128, 108) and (827, 6)
test gold standart is read
Clasifier MLP, domena MDCT, task 5, train_duration 357.27353, run 2: F1_test = 0.07119741100323625, F1_atest = 0.0
Shapes: (52934, 128, 108) and (52934, 6)
Train dataset is read
Clasifier Random Forest, task 5, domena MDCT, run

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(np.concatenate([[0], atest_treshold]), atest_f1_array)

In [None]:
th = atest_treshold[np.argmax(atest_f1_array)-1]
print(f'treshold = {th}')

TP = (atest_prediction > th)*atest_labels
FP = (atest_prediction > th)*(1-atest_labels)
FN = (atest_prediction <= th)*atest_labels
TN = (atest_prediction <= th)*(1-atest_labels)

f1 = np.sum(TP) / (np.sum(TP) + 0.5*(np.sum(FP) + np.sum(FN)))
print(f'F1_calk = {f1}')