# Kaggle Regression



### Libraries

In [24]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import warnings
from scipy.signal import butter, filtfilt

from utility import read_all_csvs_one_test
from utility import run_cv_one_motor
from utility import read_all_test_data_from_path
from utility import read_all_test_data_from_path, show_reg_result,extract_selected_feature, prepare_sliding_window, FaultDetectReg

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import f_regression


### Read data and pre_processing

In [31]:
# NOTE(review): n_int is not referenced by any code visible in this notebook.
# Presumably it is the lag (in samples) meant for the `*_diff` features that
# appear in the feature lists -- TODO confirm or remove.
n_int = 20

# Subfunction for data preprocessing.
def pre_processing(df: pd.DataFrame):
    ''' ### Description
    Clean the `position`, `temperature` and `voltage` columns of `df` in place.

    - position: values outside [0, 1000] are treated as invalid, gaps are filled,
      then the signal is low-pass filtered, smoothed with a 20-sample rolling mean
      and rounded.
    - temperature: values outside [0, 100] are treated as invalid, the signal is
      smoothed with a 20-sample rolling mean, samples that differ by more than 5
      from the previous sample are discarded, and gaps are filled.
    - voltage: values outside [6000, 8000] are treated as invalid, gaps are filled,
      then the signal is low-pass filtered and smoothed with a 5-sample rolling mean.

    Gaps are filled forward first and then backward, so that invalid samples at the
    very start of a recording do not leave NaNs behind. A single leading NaN would
    otherwise turn the whole forward-backward filtered signal into NaN.

    No difference (`*_diff`) features are added by this function.

    ### Parameters
    - df: dataframe with `position`, `temperature` and `voltage` columns. Modified in place.

    ### Return
    None. The cleaned values are written back into `df`.
    '''

    # Parameters of the low-pass filter shared by the position and voltage channels.
    cutoff_frequency = .8  # Adjust as needed
    sampling_frequency = 10  # Assumes the data is evenly spaced in time -- TODO confirm

    def butter_lowpass(cutoff, fs, order=5):
        ''' Design a digital Butterworth low-pass filter and return its (b, a) coefficients. '''
        nyquist = 0.5 * fs
        normal_cutoff = cutoff / nyquist
        b, a = butter(order, normal_cutoff, btype='low', analog=False)
        return b, a

    def lowpass_filter(data, cutoff_freq, sampling_freq, order=5):
        ''' Apply the Butterworth low-pass filter forward and backward (zero phase).

        Signals that are too short for `filtfilt`'s default edge padding are returned
        unfiltered instead of raising a ValueError.
        '''
        b, a = butter_lowpass(cutoff_freq, sampling_freq, order=order)
        values = np.asarray(data, dtype=float)
        # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by default and
        # requires the input to be strictly longer than that.
        default_padlen = 3 * max(len(a), len(b))
        if values.shape[0] <= default_padlen:
            return values
        return filtfilt(b, a, values)

    def fill_gaps(series: pd.Series) -> pd.Series:
        ''' Replace NaNs with the previous valid value; leading NaNs take the first valid value. '''
        return series.ffill().bfill()

    def customized_outlier_removal(df: pd.DataFrame):
        ''' # Description
        Remove outliers from the dataframe based on the valid ranges defined below,
        fill the resulting gaps and smooth each signal. Works in place on `df`.
        '''
        # --- position ---
        df['position'] = df['position'].where(df['position'] <= 1000, np.nan)
        df['position'] = df['position'].where(df['position'] >= 0, np.nan)
        df['position'] = fill_gaps(df['position'])
        df['position'] = lowpass_filter(df['position'], cutoff_frequency, sampling_frequency)
        df['position'] = df['position'].rolling(window=20, min_periods=1).mean()
        df['position'] = df['position'].round()

        # --- temperature ---
        df['temperature'] = df['temperature'].where(df['temperature'] <= 100, np.nan)
        df['temperature'] = df['temperature'].where(df['temperature'] >= 0, np.nan)
        df['temperature'] = df['temperature'].rolling(window=20, min_periods=1).mean()

        # Make sure that the difference between the current and previous temperature cannot be too large.
        threshold = 5
        prev_tmp = df['temperature'].shift(1)
        temp_diff = np.abs(df['temperature'] - prev_tmp)
        # Discard the samples that jump by more than the threshold, then fill the gaps.
        df.loc[temp_diff > threshold, 'temperature'] = np.nan
        df['temperature'] = fill_gaps(df['temperature'])

        # --- voltage ---
        df['voltage'] = df['voltage'].where(df['voltage'] >= 6000, np.nan)
        df['voltage'] = df['voltage'].where(df['voltage'] <= 8000, np.nan)
        df['voltage'] = fill_gaps(df['voltage'])
        df['voltage'] = lowpass_filter(df['voltage'], cutoff_frequency, sampling_frequency)
        df['voltage'] = df['voltage'].rolling(window=5, min_periods=1).mean()

    # Start processing.
    customized_outlier_removal(df)

# Folder holding the training recordings (path is relative to the notebook location).
base_dictionary = '../../dataset/training_data/'
# Load the data through the project helper, handing it `pre_processing` as the
# cleaning function and disabling plots.
# NOTE(review): how and when the helper applies `pre_processing` is defined in
# utility.py and not visible here -- confirm.
df_data = read_all_test_data_from_path(base_dictionary, pre_processing, is_plot=False)

In [None]:
# Pre-training set: keep only the recordings that are listed as normal.
normal_test_id = [
    '20240105_164214',
    '20240105_165300',
    '20240105_165972',
    '20240320_152031',
    '20240320_153841',
    '20240320_155664',
    '20240321_122650',
    '20240325_135213',
    '20240325_152902',
    '20240426_141190',
    '20240426_141532',
    '20240426_141602',
    '20240426_141726',
    '20240426_141938',
    '20240426_141980',
    '20240503_164435',
]

# Rows whose `test_condition` matches one of the normal recordings.
is_normal_run = df_data['test_condition'].isin(normal_test_id)
df_tr = df_data[is_normal_run]

### Test data

In [None]:
# Recordings held out for evaluating the fault detector.
test_id = [
    '20240325_155003',
    '20240425_093699',
    '20240425_094425',
    '20240426_140055',
    '20240503_163963',
    '20240503_164675',
    '20240503_165189',
]

# Rows whose `test_condition` matches one of the held-out recordings.
is_test_run = df_data['test_condition'].isin(test_id)
df_test = df_data[is_test_run]

### Feature Selection

In [12]:
# Candidate feature sets. Every entry is a column name of the loaded data.

# Small hand-picked set (the one passed to evaluate_model below).
feature_list_all_0 = [
    'time',
    'data_motor_2_position',
    'data_motor_3_position',
    'data_motor_4_position',
    'data_motor_3_temperature',
    'data_motor_6_temperature',
]

# Full set: 'time' followed by the same six signals for motors 1 to 6.
_SIGNALS_PER_MOTOR = (
    'position',
    'temperature',
    'voltage',
    'temperature_diff',
    'voltage_diff',
    'position_diff',
)
feature_list_all = ['time'] + [
    f'data_motor_{_motor}_{_signal}'
    for _motor in range(1, 7)
    for _signal in _SIGNALS_PER_MOTOR
]

feature_list_all_2 = [
    'time',
    'data_motor_1_position',
    'data_motor_1_temperature',
    'data_motor_2_position',
    'data_motor_2_temperature',
    'data_motor_3_position',
    'data_motor_4_temperature',
    'data_motor_5_position',
    'data_motor_5_temperature',
    'data_motor_6_position',
    'data_motor_6_temperature',
    'data_motor_6_voltage',
    'data_motor_6_temperature_diff',
    'data_motor_6_voltage_diff',
    'data_motor_6_position_diff',
]

feature_list_all_3 = [
    'time',
    'data_motor_1_position',
    'data_motor_1_temperature',
    'data_motor_2_position',
    'data_motor_2_temperature',
    'data_motor_3_position',
    'data_motor_3_temperature',
    'data_motor_4_temperature',
    'data_motor_4_position',
    'data_motor_5_position',
    'data_motor_5_temperature',
    'data_motor_6_temperature',
    'data_motor_6_voltage',
]

selected_features = [
    'time',
    'data_motor_1_position',
    'data_motor_1_temperature',
    'data_motor_2_position',
    'data_motor_3_position',
    'data_motor_4_position',
    'data_motor_4_temperature',
    'data_motor_5_position',
    'data_motor_5_temperature',
    'data_motor_6_position',
    'data_motor_6_temperature',
    'data_motor_1_voltage',
]

# Motor 6

In [27]:
def evaluate_model(name, steps,param_grid, features, window_size, sample_step, prediction_lead_time, threshold, abnormal_limit, n_fold=7, motor_idx=6):
    ''' ### Description
    Tune a regression pipeline on the training runs, wrap the best pipeline in a
    `FaultDetectReg` detector and cross-validate that detector on the test runs.

    Reads the notebook-level dataframes `df_tr` (training) and `df_test` (testing).

    ### Parameters
    - name: label used in the printed messages.
    - steps: list of `(step_name, estimator)` tuples used to build the sklearn `Pipeline`.
    - param_grid: parameter grid handed to `GridSearchCV`.
    - features: names of the columns used as features.
    - window_size, sample_step, prediction_lead_time: sliding-window settings, forwarded to
      `prepare_sliding_window` and `FaultDetectReg`.
    - threshold, abnormal_limit: detector settings, forwarded to `FaultDetectReg`.
    - n_fold: number of folds, used both for the grid search and for `run_cross_val`. Default 7.
    - motor_idx: index of the motor whose model is built. Default 6.

    ### Return
    Whatever `FaultDetectReg.run_cross_val` returns. The code below treats it as a
    dataframe with one row per fold (it calls `.mean()` and `.std()` on it).
    '''
    print(f'Running model: {name}')

    # Training data: raw features/response, then the sliding-window representation.
    x_tr_org, y_temp_tr_org = extract_selected_feature(df_data=df_tr, feature_list=features, motor_idx=motor_idx, mdl_type='reg')
    x_tr, y_temp_tr = prepare_sliding_window(df_x=x_tr_org, y=y_temp_tr_org, window_size=window_size, sample_step=sample_step, prediction_lead_time=prediction_lead_time, mdl_type='reg')

    # A regression metric is required here: classification scorers such as 'f1' cannot be
    # computed on a continuous target, so the candidates could not be ranked with them.
    pipeline = Pipeline(steps)
    grid_search = GridSearchCV(pipeline, param_grid, cv=n_fold, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(x_tr, y_temp_tr)
    best_model = grid_search.best_estimator_

    # Define the fault detector
    detector_reg = FaultDetectReg(reg_mdl=best_model, threshold=threshold, abnormal_limit=abnormal_limit, window_size=window_size, sample_step=sample_step, pred_lead_time=prediction_lead_time)

    # Test data: the labels come from the 'clf' extraction, the features and the
    # response from the 'reg' extraction.
    _, y_label_test_org = extract_selected_feature(df_data=df_test, feature_list=features, motor_idx=motor_idx, mdl_type='clf')
    x_test_org, y_temp_test_org = extract_selected_feature(df_data=df_test, feature_list=features, motor_idx=motor_idx, mdl_type='reg')

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Model for motor {motor_idx}:')

    # Run cross validation, honouring the caller's n_fold.
    df_perf = detector_reg.run_cross_val(df_x=x_test_org, y_label=y_label_test_org, y_response=y_temp_test_org,
                                        n_fold=n_fold,single_run_result=False)

    print(f'{name} performance:\n{df_perf}\n')
    # DataFrame.std() is the standard deviation across folds, not the standard error.
    print('Mean performance metric and standard deviation:')
    for metric, error in zip(df_perf.mean(), df_perf.std()):
        print(f'{metric:.4f} +- {error:.4f}')
    print('\n')
    return df_perf

### Fault detection based on regression model 

In [28]:
# Sliding-window and fault-detector settings shared by every model below.
window_size = 80
sample_step = 60
prediction_lead_time = 5
threshold = .5
abnormal_limit = 3

# Grids that were searched originally, kept for reference
# (original note: the search always picked the lowest value):
#   Ridge / Lasso : regressor__alpha in np.logspace(-7, 1, num=100)
#   ElasticNet    : regressor__alpha and regressor__l1_ratio in np.logspace(-7, 1, num=100)
#   Decision tree : regressor__max_depth in [2,3,4], regressor__min_samples_split in [2,3,4]

# Parameters are fixed to the previously chosen values so that the cell runs faster.
# GridSearchCV requires at least one parameter grid, hence the empty dict for the
# plain linear regression.
param_grids = {
    'Linear Regression': [{}],
    'Ridge Regression': [{'regressor__alpha': [0.00001]}],
    'Lasso Regression': [{'regressor__alpha': [0.00001]}],
    'ElasticNet Regression': [{'regressor__alpha': [0.00001], 'regressor__l1_ratio': [0.00001]}],
    'Decision Tree Regression': [{'regressor__max_depth': [2], 'regressor__min_samples_split': [2]}],
}


def _scaled(regressor):
    ''' Pipeline steps: min-max normalisation followed by the given regressor. '''
    return [('Normalizer', MinMaxScaler()), ('regressor', regressor)]


# (model name, pipeline steps); the decision tree is used without normalisation.
models = [
    ('Linear Regression', _scaled(LinearRegression())),
    ('Ridge Regression', _scaled(Ridge())),
    ('Lasso Regression', _scaled(Lasso())),
    ('ElasticNet Regression', _scaled(ElasticNet())),
    ('Decision Tree Regression', [('regressor', DecisionTreeRegressor())]),
]

# Evaluate every model and keep the mean of each metric over the folds.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 score']
results = []
for model_name, model_steps in models:
    param_grid = param_grids[model_name]
    df_perf = evaluate_model(model_name, model_steps,param_grid, feature_list_all_0, window_size, sample_step, prediction_lead_time, threshold, abnormal_limit)
    mean_perf = df_perf.mean()
    results.append((model_name, *(mean_perf[column] for column in metric_columns)))

# Summarise the results as a markdown table.
table_lines = [
    "| Model   | Accuracy | Precision | Recall | F1   |",
    "|---------|----------|-----------|--------|------|",
]
table_lines.extend(
    f"| {row_name} | {accuracy:.2f} | {precision:.2f} | {recall:.2f} | {f1:.2f} |"
    for row_name, accuracy, precision, recall, f1 in results
)
table_md = "\n".join(table_lines) + "\n"

print(table_md)

Running model: Linear Regression
Best parameters for Linear Regression: {}
Model for motor 6:


100%|██████████| 1/1 [00:04<00:00,  4.83s/it]
100%|██████████| 1/1 [00:00<00:00, 13.06it/s]
100%|██████████| 1/1 [00:00<00:00,  4.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  3.00it/s]


Linear Regression performance:
   Accuracy  Precision    Recall  F1 score
0  0.883870   0.416274  0.570275  0.481254
1  0.787879   0.781250  0.914634  0.842697
2  0.885033   0.895833  0.666667  0.764444
3  0.971429   1.000000  0.571429  0.727273
4  0.894286   0.847059  0.829971  0.838428
5  0.838000   0.958621  0.868750  0.911475
6  0.525074   0.319079  0.457547  0.375969

Mean performance metric and standard error:
0.8265 +- 0.1442
0.7454 +- 0.2691
0.6970 +- 0.1754
0.7059 +- 0.2007


Running model: Ridge Regression
Best parameters for Ridge Regression: {'regressor__alpha': 1e-05}
Model for motor 6:


100%|██████████| 1/1 [00:05<00:00,  5.14s/it]
100%|██████████| 1/1 [00:00<00:00, 11.68it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  1.83it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
100%|██████████| 1/1 [00:00<00:00,  2.84it/s]


Ridge Regression performance:
   Accuracy  Precision    Recall  F1 score
0  0.884786   0.419240  0.570275  0.483231
1  0.787879   0.781250  0.914634  0.842697
2  0.885033   0.895833  0.666667  0.764444
3  0.971429   1.000000  0.571429  0.727273
4  0.895238   0.849558  0.829971  0.839650
5  0.838000   0.958621  0.868750  0.911475
6  0.525074   0.319079  0.457547  0.375969

Mean performance metric and standard error:
0.8268 +- 0.1444
0.7462 +- 0.2687
0.6970 +- 0.1754
0.7064 +- 0.2005


Running model: Lasso Regression
Best parameters for Lasso Regression: {'regressor__alpha': 1e-05}
Model for motor 6:


100%|██████████| 1/1 [00:04<00:00,  4.98s/it]
100%|██████████| 1/1 [00:00<00:00, 14.94it/s]
100%|██████████| 1/1 [00:00<00:00,  4.28it/s]
100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Lasso Regression performance:
   Accuracy  Precision    Recall  F1 score
0  0.889669   0.432468  0.537964  0.479482
1  0.780303   0.778947  0.902439  0.836158
2  0.882863   0.894737  0.658915  0.758929
3  0.967196   0.944444  0.539683  0.686869
4  0.871429   0.789617  0.832853  0.810659
5  0.828000   0.960280  0.856250  0.905286
6  0.532448   0.324415  0.457547  0.379648

Mean performance metric and standard error:
0.8217 +- 0.1399
0.7321 +- 0.2533
0.6837 +- 0.1796
0.6939 +- 0.1947


Running model: ElasticNet Regression
Best parameters for ElasticNet Regression: {'regressor__alpha': 1e-05, 'regressor__l1_ratio': 1e-05}
Model for motor 6:


100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
100%|██████████| 1/1 [00:00<00:00, 11.96it/s]
100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  2.73it/s]


ElasticNet Regression performance:
   Accuracy  Precision    Recall  F1 score
0  0.888753   0.429124  0.537964  0.477419
1  0.780303   0.778947  0.902439  0.836158
2  0.882863   0.894737  0.658915  0.758929
3  0.967196   0.944444  0.539683  0.686869
4  0.871429   0.789617  0.832853  0.810659
5  0.828000   0.960280  0.856250  0.905286
6  0.532448   0.324415  0.457547  0.379648

Mean performance metric and standard error:
0.8216 +- 0.1398
0.7317 +- 0.2540
0.6837 +- 0.1796
0.6936 +- 0.1951


Running model: Decision Tree Regression
Best parameters for Decision Tree Regression: {'regressor__max_depth': 2, 'regressor__min_samples_split': 2}
Model for motor 6:


100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
100%|██████████| 1/1 [00:00<00:00, 19.98it/s]
100%|██████████| 1/1 [00:00<00:00,  4.62it/s]
100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.72it/s]

Decision Tree Regression performance:
   Accuracy  Precision    Recall  F1 score
0  0.524035   0.165597  1.000000  0.284140
1  0.765152   0.774194  0.878049  0.822857
2  0.882863   1.000000  0.581395  0.735294
3  0.961905   1.000000  0.428571  0.600000
4  0.840000   0.793443  0.697406  0.742331
5  0.746000   0.951407  0.775000  0.854191
6  0.328909   0.284192  0.754717  0.412903

Mean performance metric and standard error:
0.7213 +- 0.2212
0.7098 +- 0.3453
0.7307 +- 0.1877
0.6360 +- 0.2155


| Model   | Accuracy | Precision | Recall | F1   |
|---------|----------|-----------|--------|------|
| Linear Regression | 0.83 | 0.75 | 0.70 | 0.71 |
| Ridge Regression | 0.83 | 0.75 | 0.70 | 0.71 |
| Lasso Regression | 0.82 | 0.73 | 0.68 | 0.69 |
| ElasticNet Regression | 0.82 | 0.73 | 0.68 | 0.69 |
| Decision Tree Regression | 0.72 | 0.71 | 0.73 | 0.64 |






**Problem:** `run_cross_val` in `utility.py` needs to be modified so that it also returns `y_pred`.