# imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sktime.classification.kernel_based import RocketClassifier
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sktime.transformations.panel.rocket import MultiRocketMultivariate
from sktime.transformations.panel.rocket import MultiRocket
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sktime.classification.deep_learning import InceptionTimeClassifier
from sktime.classification.hybrid import HIVECOTEV2
from sktime.classification.compose import ColumnEnsembleClassifier
from sklearn import preprocessing
import torch.nn as nn
from sktime.regression.kernel_based import RocketRegressor
from sktime.regression.compose import ComposableTimeSeriesForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve,roc_auc_score
import sys
from sklearn import preprocessing
from torch.utils.data import TensorDataset, DataLoader
from scipy import stats, signal
import math
import joblib

In [2]:
def from_2d_array_to_nested(X, index=None, columns=None, time_index=None, cells_as_numpy=False):
    """Convert 2D dataframe to nested dataframe.
    Convert tabular pandas DataFrame with only primitives in cells into
    nested pandas DataFrame with a single column.
    Parameters
    ----------
    X : pd.DataFrame
    cells_as_numpy : bool, default = False
        If True, then nested cells contain NumPy array
        If False, then nested cells contain pandas Series
    index : array-like, shape=[n_samples], optional (default = None)
        Sample (row) index of transformed DataFrame
    time_index : array-like, shape=[n_obs], optional (default = None)
        Time series index of transformed DataFrame
    Returns
    -------
    Xt : pd.DataFrame
        Transformed DataFrame in nested format
    """
    if (time_index is not None) and cells_as_numpy:
        raise ValueError(
            "`Time_index` cannot be specified when `return_arrays` is True, "
            "time index can only be set to "
            "pandas Series"
        )
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    container = np.array if cells_as_numpy else pd.Series

    # for 2d numpy array, rows represent instances, columns represent time points
    n_instances, n_timepoints = X.shape

    if time_index is None:
        time_index = np.arange(n_timepoints)
    kwargs = {"index": time_index}

    Xt = pd.DataFrame(
        pd.Series([container(X[i, :], **kwargs) for i in range(n_instances)])
    )
    if index is not None:
        Xt.index = index
    if columns is not None:
        Xt.columns = columns
    return Xt

def best_indices(coef, x=2000, twoD=True):
    # Create a boolean array of coefficients that are greater than or equal to 0.002

    abs_coef = np.abs(coef)

    sorted_indices = np.argsort(abs_coef)  
    if twoD == True:# indices of the x largest coefficients
    #top_x_indices = top_x_indices.iloc[0,-x:]
        topx=[]
        for i in np.arange(0,sorted_indices.shape[0]):
            top_x_indices = sorted_indices.reshape(-1)
            top_x_indices = top_x_indices[-x:]
            topx.append(top_x_indices)
        top_x_indices = np.concatenate(topx)
    else:
        top_x_indices = sorted_indices.reshape(-1)
        top_x_indices = top_x_indices[-x:]
    top_x_indices.sort()
    top_x_indices = np.unique(top_x_indices)
    return top_x_indices

In [3]:
def get_additional_features(split, torque, speed):
    additional_features = []
    for i,ts in enumerate(split):
        stats_df = []
        speed_torque_dict = {
        'speed_1': speed[i],
        'torque_1': torque[i]}
        speed_torque_df = pd.DataFrame.from_dict(speed_torque_dict, orient='index', columns=['Value'])
        for i in range(ts.shape[1]):
            mean = np.mean(ts[:,i])
            std_dev = np.std(ts[:,i])
            variance = np.var(ts[:,i])
            skewness = stats.skew(ts[:,i])
            kurtosis = stats.kurtosis(ts[:,i])
            autocorr = pd.Series(ts[:,i]).autocorr(lag=1)
            frequencies, density = signal.welch(ts[:,i], nperseg=256)
            density_df = pd.DataFrame(density.transpose()).transpose()
            outlier_list = []
            for j in [1.5,1.75,2,2.25,2.5,2.75,3,3.25,3.5,3.75,4, 4.25,4.5, 4.75, 5, 5.25,5.5,5.75,6]:
                threshold = j * std_dev
                nmb_outliers = np.sum(np.abs(ts[:,i] - mean) > threshold)
                outlier_list.append(nmb_outliers)
            outlier_df = pd.DataFrame([outlier_list])
            stats_dict = {
            'Mean': mean,
            'Standard Deviation': std_dev,
            'Variance': variance,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'Auto-correlation (lag 1)': autocorr}
            stats_dict = pd.DataFrame.from_dict(stats_dict, orient='index', columns=['0']).reset_index(drop=True).transpose()
            density_df = density_df.reset_index(drop=True)
            stats_dict.index = density_df.index
            outlier_df.index = density_df.index
            stats_df.append(pd.concat([stats_dict, density_df, outlier_df], axis=1))
        additional_features.append(np.concatenate([np.array(speed_torque_df.transpose()), np.concatenate(stats_df, axis=1)], axis=1))


    # Create a DataFrame from the dictionary
    #stats_df = pd.DataFrame.from_dict(stats_dict, orient='index', columns=['Value'])
    return additional_features


def get_additional_features2(split, torque, speed):
    additional_features = []
    for i,ts in enumerate(split):
        stats_df = []

        speed_torque_dict = {
        'speed_1': speed[i],
        'torque_1': torque[i]}

        speed_torque_df = pd.DataFrame.from_dict(speed_torque_dict, orient='index', columns=['Value'])
        for i in range(ts.shape[1]):

            mean = np.mean(ts[i])
            std_dev = np.std(ts[i])
            variance = np.var(ts[i])
            skewness = stats.skew(ts[i])
            kurtosis = stats.kurtosis(ts[i])
            autocorr = pd.Series(ts[i]).autocorr(lag=1)
            frequencies, density = signal.welch(ts[i], nperseg=256)
            density_df = pd.DataFrame(density.transpose()).transpose()
            outlier_list = []
            for j in [1.5,1.75,2,2.25,2.5,2.75,3,3.25,3.5,3.75,4, 4.25,4.5, 4.75, 5, 5.25,5.5,5.75,6]:
                threshold = j * std_dev
                nmb_outliers = np.sum(np.abs(ts[i] - mean) > threshold)
                outlier_list.append(nmb_outliers)
            outlier_df = pd.DataFrame([outlier_list])
            stats_dict = {
            'Mean': mean,
            'Standard Deviation': std_dev,
            'Variance': variance,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'Auto-correlation (lag 1)': autocorr}
            stats_dict = pd.DataFrame.from_dict(stats_dict, orient='index', columns=['0']).reset_index(drop=True).transpose()
            density_df = density_df.reset_index(drop=True)
            stats_dict.index = density_df.index
            outlier_df.index = density_df.index
            stats_df.append(pd.concat([stats_dict, density_df, outlier_df], axis=1))
        additional_features.append(np.concatenate([np.array(speed_torque_df.transpose()), np.concatenate(stats_df, axis=1)], axis=1))


    # Create a DataFrame from the dictionary
    #stats_df = pd.DataFrame.from_dict(stats_dict, orient='index', columns=['Value'])
    return additional_features

In [4]:
def cut_dataframes(array_list, Drehzahl_list, l=1):
    
    new_list = []
    factor = 120/216
    for i, arr in enumerate(array_list):

        speed = Drehzahl_list[i]
        if speed in [100,200]:
            time=12
        elif speed in [300,400,500,600,700,800,900,1000]:
            time =6
        else:
            time=3         
        target_length = int((l/((speed/60) * factor)) * 20480)

        if len(arr)>=target_length:
            arr = arr[:target_length,:]
        else:
            zeros_to_append = target_length - len(arr)
            zeros_df = pd.DataFrame(0, index=range(zeros_to_append), columns=pd.DataFrame(arr).columns)
            arr = pd.concat([pd.DataFrame(arr), zeros_df])
        new_list.append(arr)

    return new_list

def cut_dataframes_all(array_list, Drehzahl_list):
    
    new_list = []
    factor = 120/216
    for i, arr in enumerate(array_list):

        speed = Drehzahl_list[i]
        if speed in [100,200]:
            time=12
        elif speed in [300,400,500,600,700,800,900,1000]:
            time =6
        else:
            time=3         
        l = math.floor((speed/60) * factor * time)
        target_length = int((l/((speed/60) * factor)) * 20480)
        
        if len(arr)>=target_length:
            arr = arr[:target_length,:]
        else:
            zeros_to_append = target_length - len(arr)
            zeros_df = pd.DataFrame(0, index=range(zeros_to_append), columns=pd.DataFrame(arr).columns)
            arr = pd.concat([arr, zeros_df])
        new_list.append(arr)

    return new_list

In [5]:
def data_splitting(df_list, torque, speed, l=1):
    split_data = []
    torque_split = []
    speed_split = []

    for j, df in enumerate(df_list):
        tmp = 0
        indices = np.where(np.array(df)[:, 3] == 1)
        if speed[j] in [100,300]:
            reset=l
        elif speed[j] in [200,400, 500, 600, 1200]:
            reset=l-1    
        elif speed[j] in [700, 800]:
            reset=l-2  
        elif speed[j] ==900:
            reset=l-3     
        else:
            reset=l-4    
        for i, index in enumerate(indices[0]):
            if tmp<l:
                tmp = tmp+1
            else:
                split_data.append(np.array(df.iloc[indices[0][i-l]:index, [0, 1, 2]]))
                torque_split.append(torque[j])
                speed_split.append(speed[j])

                tmp=reset
    #split_data = np.concatenate(split_data, axis=2)  
    return split_data, speed_split, torque_split

In [6]:
import numpy as np
from scipy.ndimage import zoom
def interpolate_dataframes(array_list, l=1):
    target_length = 22202 * l
    new_list = []
    for arr in array_list:
        zoom_rate = target_length / arr.shape[0]
        arr = zoom(arr, zoom_rate)
        new_list.append(arr)
    return new_list

In [4]:
predictions=np.array([1.2,3.2,3.7,4.1,2.1,4.5,1.8,2.9,3.2,4.8,2.3,1.5])
speed=np.array([2700,2700,2700,2700,2700,3400,3400,4000,4000,5000,5000,5000])
index=np.array([0,0,0,0,0,1,1,2,2,3,3,3])


In [None]:
def make_submission

# Load test data

train_path = "C:/Users/je009447/Downloads/Data_Challenge_PHM2023_test_data"
dataframes = []
torque = []
speed = []
file_names = sorted([file_name for file_name in os.listdir(train_path) if file_name.endswith(".txt")],
                    key=lambda x: int(x.split("_")[0]))

for file_name in file_names:
    file_path = os.path.join(train_path, file_name)
    
    with open(file_path, "r") as file:
        content = file.read()

            # Extract numbers from the file name
    speed_number = int(file_name.split("_")[1][1:])
    torque_number = int(file_name.split("_")[2][:-5])
    print(file_name)
    # Save the numbers
    speed.append(speed_number)
    torque.append(torque_number)
    
    df = pd.read_csv(file_path, delimiter=' ', header = None)
    dataframes.append(df)

# Load model

# Preprocess_data

## single split

In [114]:
import scipy.interpolate as interp
def interpolate_dataframes(array_list, l=1):
    target_length = 3687 * l
    new_list = []

    for i, arr in enumerate(array_list):
        arr_1 = interp.interp1d(np.arange(arr[:,0].size),arr[:,0])
        arr_1 = arr_1(np.linspace(0,arr[:,0].size-1,target_length))
        arr_2 = interp.interp1d(np.arange(arr[:,0].size),arr[:,0])
        arr_2 = arr_2(np.linspace(0,arr[:,0].size-1,target_length))
        arr_3 = interp.interp1d(np.arange(arr[:,0].size),arr[:,0])
        arr_3 = arr_3(np.linspace(0,arr[:,0].size-1,target_length))
        interpolated_array = np.concatenate([arr_1.reshape(-1,1), arr_2.reshape(-1,1), arr_3.reshape(-1,1)], axis=1)
        new_list.append(interpolated_array)

    return new_list

In [115]:
from sktime.datasets import write_dataframe_to_tsfile
df_test =[]
for i in np.arange(1,801):
    print(i)
    n=f"{i}_"
    train_path = f"D:/Projects/PHM_2023/Data_Challenge_PHM2023_test_data"

    dataframes = []
    torque = []
    speed = []

    for x,file_name in enumerate(os.listdir(train_path)):
        if file_name.endswith(".txt") and file_name.startswith(f"{n}"):
            file_path = os.path.join(train_path, file_name)

            with open(file_path, "r") as file:
                content = file.read()

                    # Extract numbers from the file name

            speed_number = int(file_name.split("_")[1][1:])
            torque_number = int(file_name.split("_")[2][:-5])
            print(speed_number)
            # Save the numbers
            speed.append(speed_number)
            torque.append(torque_number)
            
            df = pd.read_csv(file_path, delimiter=' ', header = None)
            if speed_number in [1500,1800,2400]:
                dataframes.append(df)
    print(speed_number)            
    if speed_number not in [1500,1800,2400]:
        continue
    split_data, speed_split, torque_split = data_splitting(dataframes, torque, speed, l=5)    
    additional_features = get_additional_features(split_data, speed_split, torque_split)    
    additional_features = pd.DataFrame(np.concatenate(additional_features))
    additional_features.to_csv(f'C:/PHM_2023_Datadump/test_data/multi/test_additional_split/data_{i}_l5_add.csv', index=False)

    interpolated_ts = interpolate_dataframes(split_data, l=5)

    mrm_model = joblib.load('mini_model_1024.pkl')
    save_path =f"C:/PHM_2023_Datadump/test_data/multi/Minirocket_1024_split/"
    # Iterate over the list of dataframes

    for j in np.arange(0,len(interpolated_ts)):
        df_2d = interpolated_ts[j].transpose()
        df_nested = from_2d_array_to_nested(df_2d)
        df1 = df_nested.transpose()
        if j==0:
            df=df1
        else: 
            df = pd.concat([df,df1], axis=0)
    df.reset_index(inplace=True, drop=True)
    if os.path.exists("C:/PHM_2023_Datadump/mini_rocket_1024"):
        # Load the model from file
        mrm = joblib.load("C:/PHM_2023_Datadump/mini_rocket_1024")

        df_transformed = mrm.transform(df)
    else:
        mrm = MiniRocketMultivariate(max_dilations_per_kernel = 1024,num_kernels=10000, n_jobs=-1) 
        df_transformed = mrm.fit_transform(df)
        joblib.dump(mrm, "C:/PHM_2023_Datadump/mini_rocket_1024")
    # Create the file name based on the values
    file_name = f"data_{i}_{speed_number}.csv"
    file_path = os.path.join(save_path, file_name)

    df_transformed = np.concatenate([pd.DataFrame(df_transformed), pd.DataFrame(speed_split), pd.DataFrame(torque_split)],axis=1 )
    #df_test.append(df_transformed)
    pd.DataFrame(df_transformed).to_csv(file_path, index=False)

1
1500
1500
2
2700
2700
3
300
300
4
2400
2400
5
700
700
6
2100
2100
7
600
600
8
1000
1000
9
1000
1000
10
100
100
11
2100
2100
12
1000
1000
13
500
500
14
3000
3000
15
400
400
16
2400
2400
17
300
300
18
700
700
19
1000
1000
20
800
800
21
200
200
22
900
900
23
100
100
24
1200
1200
25
400
400
26
500
500
27
3000
3000
28
200
200
29
1000
1000
30
700
700
31
400
400
32
1800
1800
33
600
600
34
2700
2700
35
100
100
36
1800
1800
37
1200
1200
38
300
300
39
400
400
40
2400
2400
41
100
100
42
600
600
43
100
100
44
800
800
45
300
300
46
200
200
47
700
700
48
200
200
49
400
400
50
300
300
51
1200
1200
52
200
200
53
1800
1800
54
800
800
55
2100
2100
56
300
300
57
400
400
58
800
800
59
1500
1500
60
300
300
61
100
100
62
2700
2700
63
1000
1000
64
500
500
65
2100
2100
66
1200
1200
67
300
300
68
600
600
69
700
700
70
600
600
71
3000
3000
72
800
800
73
2400
2400
74
900
900
75
700
700
76
1500
1500
77
900
900
78
800
800
79
600
600
80
1000
1000
81
1800
1800
82
400
400
83
900
900
84
1200
1200
85
400
400
86
400
4

In [18]:
from sktime.datasets import write_dataframe_to_tsfile
df_test =[]
for i in np.arange(1,801):
    print(i)
    n=f"{i}_"
    train_path = f"D:/Projects/PHM_2023/Data_Challenge_PHM2023_test_data"

    dataframes = []
    torque = []
    speed = []

    for x,file_name in enumerate(os.listdir(train_path)):
        if file_name.endswith(".txt") and file_name.startswith(f"{n}"):
            file_path = os.path.join(train_path, file_name)

            with open(file_path, "r") as file:
                content = file.read()

                    # Extract numbers from the file name

            speed_number = int(file_name.split("_")[1][1:])
            torque_number = int(file_name.split("_")[2][:-5])
            print(speed_number)
            # Save the numbers
            speed.append(speed_number)
            torque.append(torque_number)
            
            df = pd.read_csv(file_path, delimiter=' ', header = None)
            dataframes.append(df)

    split_data, speed_split, torque_split = data_splitting(dataframes, torque, speed, l=5)    
    additional_features = get_additional_features(split_data, speed_split, torque_split)    
    additional_features = pd.DataFrame(np.concatenate(additional_features))
    additional_features.to_csv(f'C:/PHM_2023_Datadump/test_data/solo/test_additional/data_{i}_l5_add.csv', index=False)

    interpolated_ts = cut_dataframes(split_data, speed_split, l=5)

    mrm_model = joblib.load('mini_model_1024.pkl')
    save_path =f"C:/PHM_2023_Datadump/test_data/solo/Minirocket_1024/"
    # Iterate over the list of dataframes

    for j in np.arange(0,len(interpolated_ts)):
        df_2d = interpolated_ts[j].transpose()
        df_nested = from_2d_array_to_nested(df_2d)
        df1 = df_nested.transpose()
        if j==0:
            df=df1
        else: 
            df = pd.concat([df,df1], axis=0)
    df.reset_index(inplace=True, drop=True)
    if os.path.exists("C:/PHM_2023_Datadump/mini_rocket_1024"):
        # Load the model from file
        mrm = joblib.load("C:/PHM_2023_Datadump/mini_rocket_1024")

        df_transformed = mrm.transform(df)
    else:
        mrm = MiniRocketMultivariate(max_dilations_per_kernel = 1024,num_kernels=10000, n_jobs=-1) 
        df_transformed = mrm.fit_transform(df)
        joblib.dump(mrm, "C:/PHM_2023_Datadump/mini_rocket_1024")
    # Create the file name based on the values
    file_name = f"data_{i}_{speed_number}.csv"
    file_path = os.path.join(save_path, file_name)

    df_transformed = np.concatenate([pd.DataFrame(df_transformed), pd.DataFrame(speed_split), pd.DataFrame(torque_split)],axis=1 )
    #df_test.append(df_transformed)
    pd.DataFrame(df_transformed).to_csv(file_path, index=False)

1
1500
2
2700
3
300
4
2400
5
700
6
2100
7
600
8
1000
9
1000
10
100
11
2100
12
1000
13
500
14
3000
15
400
16
2400
17
300
18
700
19
1000
20
800
21
200
22
900
23
100
24
1200
25
400
26
500
27
3000
28
200
29
1000
30
700
31
400
32
1800
33
600
34
2700
35
100
36
1800
37
1200
38
300
39
400
40
2400
41
100
42
600
43
100
44
800
45
300
46
200
47
700
48
200
49
400
50
300
51
1200
52
200
53
1800
54
800
55
2100
56
300
57
400
58
800
59
1500
60
300
61
100
62
2700
63
1000
64
500
65
2100
66
1200
67
300
68
600
69
700
70
600
71
3000
72
800
73
2400
74
900
75
700
76
1500
77
900
78
800
79
600
80
1000
81
1800
82
400
83
900
84
1200
85
400
86
400
87
1200
88
200
89
2700
90
900
91
1000
92
700
93
100
94
900
95
3000
96
2400
97
2700
98
2400
99
800
100
3000
101
3000
102
300
103
300
104
1800
105
1800
106
500
107
800
108
400
109
1200
110
1200
111
1000
112
1500
113
1800
114
600
115
2100
116
600
117
2400
118
800
119
800
120
500
121
1200
122
2100
123
1500
124
1800
125
1000
126
700
127
600
128
2100
129
200
130
1500
131
1500
1

In [None]:
from sktime.datasets import write_dataframe_to_tsfile
df_test =[]
for i in np.arange(1,801):
    print(i)
    n=f"{i}_"
    train_path = f"D:/Projects/PHM_2023/Data_Challenge_PHM2023_test_data"

    dataframes = []
    torque = []
    speed = []

    for x,file_name in enumerate(os.listdir(train_path)):
        if file_name.endswith(".txt") and file_name.startswith(f"{n}"):
            file_path = os.path.join(train_path, file_name)

            with open(file_path, "r") as file:
                content = file.read()

                    # Extract numbers from the file name

            speed_number = int(file_name.split("_")[1][1:])
            torque_number = int(file_name.split("_")[2][:-5])
            print(speed_number)
            # Save the numbers
            speed.append(speed_number)
            torque.append(torque_number)
            
            df = pd.read_csv(file_path, delimiter=' ', header = None)
            dataframes.append(df)

        additional_features = get_additional_features2(dataframes, speed, torque)    
        additional_features = pd.DataFrame(np.concatenate(additional_features))
        additional_features.to_csv(f'C:/PHM_2023_Datadump/solo/train_additional/{d}/data_{i}_add.csv', index=False)

        interpolated_ts = cut_dataframes_all(dataframes, speed)

    mrm_model = joblib.load('mini_model_1024.pkl')
    save_path =f"C:/PHM_2023_Datadump/test_data/solo/Minirocket_1024/"
    # Iterate over the list of dataframes

    for j in np.arange(0,len(interpolated_ts)):
        df_2d = interpolated_ts[j].transpose()
        df_nested = from_2d_array_to_nested(df_2d)
        df1 = df_nested.transpose()
        if j==0:
            df=df1
        else: 
            df = pd.concat([df,df1], axis=0)
    df.reset_index(inplace=True, drop=True)
    if os.path.exists("C:/PHM_2023_Datadump/mini_rocket_1024"):
        # Load the model from file
        mrm = joblib.load("C:/PHM_2023_Datadump/mini_rocket_1024")

        df_transformed = mrm.transform(df)
    else:
        mrm = MiniRocketMultivariate(max_dilations_per_kernel = 1024,num_kernels=10000, n_jobs=-1) 
        df_transformed = mrm.fit_transform(df)
        joblib.dump(mrm, "C:/PHM_2023_Datadump/mini_rocket_1024")
    # Create the file name based on the values
    file_name = f"data_{i}_{speed_number}.csv"
    file_path = os.path.join(save_path, file_name)

    df_transformed = np.concatenate([pd.DataFrame(df_transformed), pd.DataFrame(speed_split), pd.DataFrame(torque_split)],axis=1 )
    #df_test.append(df_transformed)
    pd.DataFrame(df_transformed).to_csv(file_path, index=False)

In [7]:
def aggregate_predictions(index, y_pred):    
    columns_dict = {i: [] for i in range(11)}

    current_index = None
    row_values = [0] * 11

    # Iterate over the index and y_pred arrays
    for idx, pred in zip(index, y_pred):
        # Check if the index has changed
        if idx != current_index:
            # Add the row values to the respective columns in the dictionary
            for i, value in enumerate(row_values):
                columns_dict[i].append(value)
            
            # Reset the row values
            row_values = [0] * 11
            current_index = idx
        
        # Increment the column based on the value of y_pred
        if pred == 0:
            row_values[0] += 1
        elif pred > 0:
            column_index = int(np.round(min(pred, 10)))
            row_values[column_index] += 1
        else:
            row_values[0] += 1

    # Add the last row values to the respective columns in the dictionary
    for i, value in enumerate(row_values):
        columns_dict[i].append(value)

    df = pd.DataFrame(columns_dict)
    df = df.drop(0)
    df = df.div(df.sum(axis=1), axis=0).round(1)

    return np.array(df)

In [8]:
def get_certainty(y_pred, speed):
    certainty=1
    if speed in [1500,1800,2400]:
        certainty = 0.2
    if np.std(y_pred)>1.2:
        certainty = 0.2      
    factor_check = 0
    for pseudo_label in [0,1,2,3,4,5,6,7,8,9,10]:
        points_case= 0
        for k, prediction in enumerate(y_pred[0]):
            diff = abs(k-pseudo_label)
            
            points_case = points_case + prediction*(1-diff*0.5)
        if points_case>0:
            factor_check=1
            break
    if factor_check == 0:
        certainty = 0.2
    points_case= 0       
    return(certainty)

In [9]:
def predict_with_correct_model_split(df, speed, torque):
    if speed[0] in [1500,1800,2400]:
        lin = joblib.load("C:/PHM_2023_Datadump/model/solo_split/all_speeds.joblib")
    else:
        lin = joblib.load(f"C:/PHM_2023_Datadump/model/solo_split/V{int(speed[0])}__{torque[0]}.joblib")
        print(speed[0],torque[0])
    
    y_pred_org = lin.predict(df)    
    y_pred  = aggregate_predictions(np.zeros_like(y_pred_org), y_pred_org)
    certainty = get_certainty(y_pred, speed[0])
    return y_pred, certainty, y_pred_org

## with split

In [None]:
all_preds = []
certainty = []
for i in np.arange(1,801):
    data_path = f"C:\PHM_2023_Datadump/test_data/solo/Minirocket_1024"
    data_path_add = f"C:\PHM_2023_Datadump/test_data/solo/train_additional"
    for file_name in os.listdir(data_path):
        if file_name.endswith(".csv") and file_name.startswith(f"data_{i}"):
            file_path = os.path.join(data_path, file_name)
            
            with open(file_path, "r") as file:
                content = file.read()

            df = pd.read_csv(file_path)
            file_path_add = os.path.join(data_path, file_name)
            for file_name_add in os.listdir(data_path_add):
                if file_name_add.endswith(".csv") and file_name_add.startswith(f"data_{i}"):
                    file_path_add = os.path.join(data_path_add, file_name_add)
        
                with open(file_path_add, "r") as file:
                    content = file.read()
                df_add = pd.read_csv(file_path_add)    
            speed_number = int(file_name.split("_")[2][-5:])
            df = pd.DataFrame(df)
            speed = df[9996]
            torque = df[9997]
            df.drop(columns=[9996, 9997], inplace=True)
            df_all = np.concatenate([df, df_add],axis=1)
            y_pred, certainty_est = predict_with_correct_model_split(df_all, speed, torque)
            all_preds.append(y_pred)
            certainty.append(certainty_est)
submission = np.concatenate(all_preds)
certainty = np.concatenate(certainty)
submission = np.concatenate([submission, certainty], axis=1)


In [137]:
all_preds = []
certainty = []
for i in np.arange(1,801):
    data_path = f"C:\PHM_2023_Datadump/test_data/solo/Minirocket_1024"
    data_path_add = f"C:\PHM_2023_Datadump/test_data/solo/test_additional"
    for file_name in os.listdir(data_path):
        if file_name.endswith(".csv") and file_name.startswith(f"data_{i}_"):
            speed_number = int(file_name.split("_")[2][:-4])

            if speed_number in [1500, 1800, 2400]:

                data_path = f"C:\PHM_2023_Datadump/test_data/multi/Minirocket_1024_split"
                data_path_add = f"C:\PHM_2023_Datadump/test_data/multi/test_additional_split"

            file_path = os.path.join(data_path, file_name)    
            with open(file_path, "r") as file:
                content = file.read()

            df = pd.read_csv(file_path)

            file_path_add = os.path.join(data_path, file_name)
            for file_name_add in os.listdir(data_path_add):
                if file_name_add.endswith(".csv") and file_name_add.startswith(f"data_{i}_"):
                    file_path_add = os.path.join(data_path_add, file_name_add)
                else:
                    continue

                with open(file_path_add, "r") as file:
                    content = file.read()
                df_add = pd.read_csv(file_path_add)    
                #speed_number = int(file_name.split("_")[2][-5:])
                df = pd.DataFrame(df)
                
                speed = df["9996"]
                torque = df["9997"]
                print(i)

                df.drop(columns=["9996", "9997"], inplace=True)
                df_all = np.concatenate([df, df_add],axis=1)
                # condition = ~np.isin(df_all[:, 9996], [1500, 1800, 2400])
                # df_sel = df_all[condition]
                # speed_sel=speed[condition]
                # torque_sel=torque[condition]
                print(speed)
                print(torque)
                y_pred, certainty_est, y_pred_org = predict_with_correct_model_split(df_all, speed, torque)
                print(y_pred_org)
                all_preds.append(y_pred)
                certainty.append(certainty_est)
submission = np.concatenate(all_preds)
certainty = np.array(certainty).reshape(-1,1)
submission = np.concatenate([submission, certainty], axis=1)

1
0    1500.0
1    1500.0
2    1500.0
3    1500.0
4    1500.0
5    1500.0
6    1500.0
7    1500.0
Name: 9996, dtype: float64
0    100.0
1    100.0
2    100.0
3    100.0
4    100.0
5    100.0
6    100.0
7    100.0
Name: 9997, dtype: float64
[ -9.50980911 -11.76829735 -11.10118599 -11.32810292  -9.6602822
  -9.61196748 -10.86465274 -10.44310122]
2
0     2700.0
1     2700.0
2     2700.0
3     2700.0
4     2700.0
5     2700.0
6     2700.0
7     2700.0
8     2700.0
9     2700.0
10    2700.0
11    2700.0
12    2700.0
13    2700.0
14    2700.0
Name: 9996, dtype: float64
0     50.0
1     50.0
2     50.0
3     50.0
4     50.0
5     50.0
6     50.0
7     50.0
8     50.0
9     50.0
10    50.0
11    50.0
12    50.0
13    50.0
14    50.0
Name: 9997, dtype: float64
2700.0 50.0
[-214.91736424 -233.16600913 -221.19671925 -210.12683354 -233.00162414
 -198.68904183 -239.71786563 -221.74465857 -239.26988011 -230.39778014
 -236.35223265 -243.89140055 -238.51898085 -230.25602899 -235.30757785]
3
0     300.

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 6 and the array at index 1 has size 15

In [98]:
certainty = np.array(certainty).reshape(-1,1)

In [99]:
submission = np.concatenate([submission, certainty], axis=1)

In [123]:
submission


array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 1.]])

In [124]:
import csv
sample_ids = np.arange(1, submission.shape[0] + 1).reshape(-1, 1)

# Adding sample IDs as the first column to the array
data_array_with_ids = np.hstack((sample_ids, submission))

# Define the CSV file path
csv_file_path = "output.csv"

# Write the data to the CSV file with the specified header
header = ["sample_id", "prob_0", "prob_1", "prob_2", "prob_3", "prob_4", "prob_5", "prob_6", "prob_7", "prob_8", "prob_9", "prob_10", "confidence"]

with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(data_array_with_ids)

In [31]:
df_filtered = df_all[~df_all["9996"].isin([1500, 1800, 2400])]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [25]:
df.drop(columns=["9996", "9997"], inplace=True)

In [27]:
df_sel=df_all[df_all[9996] not in [1500,1800,2400]]
speed_sel=speed[df[9996] not in [1500,1800,2400]]
torque_sel=torque[df[9996] not in [1500,1800,2400]]

NameError: name 'df_all' is not defined

## without split

In [None]:
all_preds = []
certainty = []
for i in np.arange(1,801):
    data_path = f"C:\PHM_2023_Datadump/test_data/solo/Minirocket_1024"
    data_path_add = f"C:\PHM_2023_Datadump/test_data/solo/train_additional"
    for file_name in os.listdir(data_path):
        if file_name.endswith(".csv") and file_name.startswith(f"data_{i}"):
            file_path = os.path.join(data_path, file_name)
            
            with open(file_path, "r") as file:
                content = file.read()

            df = pd.read_csv(file_path)
            file_path_add = os.path.join(data_path, file_name)
            for file_name_add in os.listdir(data_path_add):
                if file_name_add.endswith(".csv") and file_name_add.startswith(f"data_{i}"):
                    file_path_add = os.path.join(data_path_add, file_name_add)
        
                with open(file_path_add, "r") as file:
                    content = file.read()
                df_add = pd.read_csv(file_path_add)    
            speed_number = int(file_name.split("_")[2][-5:])
            df = pd.DataFrame(df)
            speed = df[9996]
            torque = df[9997]
            df.drop(columns=[9996], inplace=True)
            df_all = np.concatenate([df, df_add],axis=1)
            y_pred = predict_with_correct_model(df_all, speed, torque)
            all_preds.append(y_pred)

In [None]:
identification = []
data_path = f"C:\PHM_2023_Datadump/test_data/solo/Minirocket_1024"
data_path_add = f"C:\PHM_2023_Datadump/test_data/solo/train_additional"
dataframes=[]
for file_name in os.listdir(data_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(data_path, file_name)
        
        with open(file_path, "r") as file:
            content = file.read()
        identification_number = int(file_name.split("_")[1][:])
        identification.append(identification_number)
        df = pd.read_csv(file_path)
        dataframes.append(df)
        speed_number = int(file_name.split("_")[1][1:])
df = np.concatenate(dataframes)

In [None]:
df = pd.DataFrame(df)
label = df[9998]
speed = df[9996]
torque = df[9997]
df.drop(columns=[9996, 9998], inplace=True)

In [None]:
data_path = f"C:\PHM_2023_Datadump/test_data/solo/train_additional"
dataframes=[]
for file_name in os.listdir(data_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(data_path, file_name)
        
        with open(file_path, "r") as file:
            content = file.read()
        
        df_tmp = pd.read_csv(file_path)
        dataframes.append(df_tmp)
df_add = np.concatenate(dataframes)

In [None]:
split_data, speed_split, torque_split, index = data_splitting(dataframes, torque, speed, l=1)
print("stage 1 ")
#del dataframes
additional_features = get_additional_features(split_data, speed_split, torque_split)
print("stage 2 ")
interpolated_ts = interpolate_dataframes(split_data)
print("stage 3 ")

In [None]:
for i in np.arange(0,len(interpolated_ts)):
    df_high_2d = interpolated_ts[i].transpose()
    df_high_nested = from_2d_array_to_nested(df_high_2d)
    df_high1 = df_high_nested.transpose()
    if i ==0:
        df_high=df_high1
    else: 
        df_high = pd.concat([df_high,df_high1], axis=0)
df_high.reset_index(inplace=True, drop=True)
del interpolated_ts

## Rocket

In [None]:
df_transformed = mrm.transform(df_high) 
df_transformed = df_transformed.values

In [None]:
df_additional = np.concatenate(additional_features, axis=1).transpose()

In [None]:
df_transformed.to_csv('C:/PHM/data_test_l10_transformed.csv', index=False)
df_additional.to_csv('C:/PHM/data_test_l10_additions.csv', index=False)

# Predict data

# Calculate result and probability

In [None]:
unique_values = np.unique(index)

# Create an empty array to store the averages
averages = np.zeros_like(unique_values, dtype=float)

# Calculate the average for each unique value
for i, value in enumerate(unique_values):
    averages[i] = np.mean(y_pred[index == value])

print(averages)

In [17]:
index = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4]
y_pred = [-1, 0, 4, 0, 1, 8, -1, 0, 1, 1, 7, -1, 10, 120, -1, 1]

# Create a dictionary to store the values for each column
columns_dict = {i: [] for i in range(10)}

current_index = None
row_values = [0] * 10

# Iterate over the index and y_pred arrays
for idx, pred in zip(index, y_pred):
    # Check if the index has changed
    if idx != current_index:
        # Add the row values to the respective columns in the dictionary
        for i, value in enumerate(row_values):
            columns_dict[i].append(value)
        
        # Reset the row values
        row_values = [0] * 10
        current_index = idx
    
    # Increment the column based on the value of y_pred
    if abs(pred) > 0:
        column_index = min(abs(pred), 10) - 1
        row_values[column_index] += 1

# Add the last row values to the respective columns in the dictionary
for i, value in enumerate(row_values):
    columns_dict[i].append(value)

# Create the dataframe from the dictionary
df = pd.DataFrame(columns_dict)

# Display the dataframe
print(df)

   0  1  2  3  4  5  6  7  8  9
0  0  0  0  0  0  0  0  0  0  0
1  1  0  0  1  0  0  0  0  0  0
2  2  0  0  0  0  0  0  1  0  0
3  1  0  0  0  0  0  0  0  0  0
4  2  0  0  0  0  0  1  0  0  2
5  2  0  0  0  0  0  0  0  0  0


In [18]:
df = df.drop(0)

# Normalize the rows and round the values to one decimal place
df = df.div(df.sum(axis=1), axis=0).round(1)

# Display the normalized dataframe
print(df)

     0    1    2    3    4    5    6    7    8    9
1  0.5  0.0  0.0  0.5  0.0  0.0  0.0  0.0  0.0  0.0
2  0.7  0.0  0.0  0.0  0.0  0.0  0.0  0.3  0.0  0.0
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
4  0.4  0.0  0.0  0.0  0.0  0.0  0.2  0.0  0.0  0.4
5  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0


In [19]:
speed = np.array([300,1800,1600,200,2400])

In [22]:
conditions = ((df >= 0.5).sum(axis=1) == 0) | (speed[df.index-1] == 1600) | (speed[df.index-1] == 2400)

# Set the values for the new column based on the conditions
df['new_column'] = np.where(conditions, 0, 1)

# Display the updated dataframe
print(df)

     0    1    2    3    4    5    6    7    8    9  new_column
1  0.5  0.0  0.0  0.5  0.0  0.0  0.0  0.0  0.0  0.0           1
2  0.7  0.0  0.0  0.0  0.0  0.0  0.0  0.3  0.0  0.0           1
3  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0           0
4  0.4  0.0  0.0  0.0  0.0  0.0  0.2  0.0  0.0  0.4           0
5  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0           0


In [None]:
import pandas as pd
import numpy as np

# Assuming you have the arrays 'speed', 'torque', 'y', and 'y_pred'

# Round 'y_pred' to the nearest whole number and clip between 0 and 10
y_pred_rounded = np.clip(np.round(y_pred), 0, 10)

# Create the dataframe
df = pd.DataFrame({'speed': speed, 'torque': torque, 'y': y, 'y_pred': y_pred_rounded})

# Display the dataframe
print(df)
