# Kaggle project 2


## Libraries

In [2]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import multivariate_normal

from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer,accuracy_score ,precision_score, recall_score
from sklearn.metrics import f1_score,confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline


from functions import *

from utility import read_all_csvs_one_test
from utility import read_all_test_data_from_path
from utility import run_cv_one_motor

### Read data and pre-process

In [3]:
# Look-back distance (in samples) for the optional difference features.
n_int = 20


# Subfunction for data preprocessing.
def pre_processing(df: pd.DataFrame):
    ''' ### Description
    Clean the measurements of one motor in place; nothing is returned.

    - Readings outside the valid range of a signal are replaced by the most
      recent valid reading.
    - The difference features built by `cal_diff` are currently switched off
      (the call at the bottom of this function is commented out).
    '''
    # Inclusive valid range of each signal.
    valid_ranges = {
        'temperature': (0, 100),
        'voltage': (6000, 9000),
        'position': (0, 1000),
    }

    def remove_outliers(df: pd.DataFrame):
        ''' # Description
        Blank out every reading outside its valid range, then forward-fill the gaps.
        An out-of-range reading at the very start has no predecessor and stays NaN.
        '''
        for column, (lower, upper) in valid_ranges.items():
            in_range = df[column].between(lower, upper)
            df[column] = df[column].where(in_range, np.nan).ffill()

    def cal_diff(df: pd.DataFrame, n_int: int):
        ''' # Description
        Re-base each signal on its first sample and add a `<signal>_diff` column
        holding the change with respect to the sample `n_int` rows earlier.
        '''
        signals = ('temperature', 'voltage', 'position')

        # Express every signal relative to its first data point.
        for column in signals:
            df[column] = df[column] - df[column].iloc[0]

        # The first n_int rows of each diff column are NaN.
        for column in signals:
            df[f'{column}_diff'] = df[column].diff(n_int)

    # Start processing.
    remove_outliers(df)
    #cal_diff(df, n_int)
    
# Read all the training dataset.
base_dictionary = '../../dataset/training_data/'
df_data = read_all_test_data_from_path(base_dictionary, pre_processing, is_plot=False)

# Smooth the data.

# Work on an independent copy without the bookkeeping columns so df_data stays untouched.
smoothed_data = df_data.copy(deep=True).drop(columns=['time','test_condition'])

# Smooth the voltage trace of each of the six motors.
# NOTE(review): the second argument (10) is presumably the moving-average window — confirm in `functions`.
for motor in range(1, 7):
    voltage_column = f'data_motor_{motor}_voltage'
    smoothed_data[voltage_column] = smooth_data_moving_average(smoothed_data[voltage_column], 10)

In [42]:
# Disabled experiment kept as a bare string literal: the code inside would select
# the listed test runs from df_data into df_test. Nothing here is executed; the
# cell only echoes the string.
'''
test_id = [
    '20240527_094865',
    '20240527_100759',
    '20240527_101627',
    '20240527_102436',
    '20240527_102919',
    '20240527_103311',
    '20240527_103690',
    '20240527_104247'
]
df_test = df_data[df_data['test_condition'].isin(test_id)]
'''

"\ntest_id = [\n    '20240527_094865',\n    '20240527_100759',\n    '20240527_101627',\n    '20240527_102436',\n    '20240527_102919',\n    '20240527_103311',\n    '20240527_103690',\n    '20240527_104247'\n]\ndf_test = df_data[df_data['test_condition'].isin(test_id)]\n"

### Read test data

In [4]:
base_dictionary = '../../dataset/testing_data/'

# Label columns are stripped from every test frame below. They are defined here
# (same names as in the feature-selection cell) so that this cell also runs on a
# fresh kernel, where the feature-selection cell has not been executed yet.
label_columns = [f'data_motor_{i}_label' for i in range(1, 7)]

# Keep only the per-test folders. Filtering on isdir excludes the Excel sheet
# explicitly instead of assuming that it sorts last.
path_list = [
    entry for entry in sorted(os.listdir(base_dictionary))
    if os.path.isdir(os.path.join(base_dictionary, entry))
]

# Read the data: one combined frame per test folder, concatenated once at the end.
test_frames = []
for tmp_path in path_list:
    path = os.path.join(base_dictionary, tmp_path)

    # Inlined variant of read_all_csvs_one_test that additionally drops the labels.
    # The listing order is deliberately left as returned by os.listdir.
    # NOTE(review): the column order presumably has to match the training reader in `utility` — confirm.
    csv_files = [file for file in os.listdir(path) if file.endswith('.csv')]
    if not csv_files:
        # Nothing to read here; skip rather than reuse data from the previous folder.
        continue

    motor_frames = []
    time_column = None
    for file in csv_files:
        raw_df = pd.read_csv(os.path.join(path, file))

        # The time stamps of the last file read are used for the whole test.
        time_column = raw_df['time']

        # Drop the first column (time); it is added back once per test below.
        df = raw_df.drop(labels=raw_df.columns[0], axis=1)

        # Apply the pre-processing (mutates df in place).
        if pre_processing:
            pre_processing(df)

        # Prefix each column with the file name (without extension).
        file_name = os.path.splitext(file)[0]
        motor_frames.append(df.add_prefix(f'{file_name}_'))

    # Time first, then the columns of every file side by side.
    combined_df = pd.concat([time_column, *motor_frames], axis=1)

    # Express time relative to the first row of this test.
    combined_df['time'] = combined_df['time'] - combined_df['time'].iloc[0]
    combined_df['test_condition'] = tmp_path

    # Remove the labels, then every row that still contains a NaN.
    combined_df = combined_df.drop(columns=label_columns).dropna()

    test_frames.append(combined_df)

if not test_frames:
    raise FileNotFoundError(f'No test folders with CSV files found in {base_dictionary}')

df_test = pd.concat(test_frames, ignore_index=True)

# Read the test conditions
df_test_conditions = pd.read_excel(base_dictionary+'Test conditions.xlsx')

# Smooth the data.

df_test = df_test.drop(columns=['time','test_condition'])

# Smooth the *test* voltages. Smoothing df_data here would overwrite the test
# voltages with index-aligned training values.
for i in range(1,7):
    df_test[f'data_motor_{i}_voltage'] = smooth_data_moving_average(df_test[f'data_motor_{i}_voltage'], 10)

### Feature Selection

In [7]:
# Get the features

# Voltage signals of motors 2-6.
_other_motor_voltages = [f'data_motor_{motor}_voltage' for motor in range(2, 7)]

# Features excluded from the model input: the voltages above plus the position of motor 6.
drop_list1_label1 = _other_motor_voltages + ['data_motor_6_position']

# Wider exclusion list: additionally the positions of motors 2-5 and the
# temperatures of motors 2, 3, 4 and 6.
drop_list2_label1 = (
    drop_list1_label1
    + [f'data_motor_{motor}_position' for motor in range(2, 6)]
    + [f'data_motor_{motor}_temperature' for motor in (2, 3, 4, 6)]
)

# An earlier variant of drop_list1_label1 also excluded 'data_motor_1_voltage'.

# One label column per motor.
label_columns = [f'data_motor_{motor}_label' for motor in range(1, 7)]

# Training features: every smoothed column except the labels and the signals in drop_list1_label1.
# A later cell rebinds X to the same selection as a NumPy array.
X = smoothed_data.drop(columns=label_columns+drop_list1_label1)

### Cross validation and undersampling

In [45]:
def cross_validation_and_undersampling(X, y, random_state=42):
    ''' ### Description
    Build one train/test split and balance the training part by undersampling.

    Despite the name, only the first fold of a shuffled 5-fold split is used,
    i.e. a single hold-out with 20% of the rows as test set. The test part is
    returned untouched; in the training part the normal class (label 0) is
    randomly reduced to at most the size of the failure class (label 1).

    ### Args
    - X: feature DataFrame.
    - y: Series of 0/1 labels, positionally aligned with X.
    - random_state: seed for both the fold shuffling and the undersampling, so
      repeated calls return the same split. Pass None for a random split.

    ### Returns
    X_train, y_train, X_test, y_test. The training rows are ordered as the
    sampled normal rows followed by all failure rows.

    ### Raises
    ValueError if the training part does not contain both classes.
    '''
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    # Only the first fold is needed; take it explicitly instead of returning from inside a loop.
    train_index, test_index = next(kf.split(X))
    X_train_unsampled, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train_unsampled, y_test = y.iloc[train_index], y.iloc[test_index]

    # Positions (not index labels) of each class, so duplicate index labels cannot duplicate rows.
    train_labels = y_train_unsampled.to_numpy()
    normal_positions = np.flatnonzero(train_labels == 0)
    failure_positions = np.flatnonzero(train_labels == 1)
    if len(normal_positions) == 0 or len(failure_positions) == 0:
        raise ValueError(
            'Undersampling needs both classes in the training fold '
            f'(normal: {len(normal_positions)}, failure: {len(failure_positions)}).'
        )

    # Undersample the normal class; count the classes by label rather than by frequency rank.
    n_keep = min(len(normal_positions), len(failure_positions))
    rng = np.random.default_rng(random_state)
    kept_normal_positions = rng.choice(normal_positions, size=n_keep, replace=False)

    selected = np.concatenate([kept_normal_positions, failure_positions])
    X_train = X_train_unsampled.iloc[selected]
    y_train = y_train_unsampled.iloc[selected]

    return X_train, y_train, X_test, y_test

In [46]:
def run_all_motors(label):
    ''' ### Description
    Compare the candidate classifiers for one motor label on a hold-out split.

    The features are `smoothed_data` without the label columns and the columns
    in `drop_list1_label1`. Every model is tuned with a 5-fold grid search
    (scored by F1) on the undersampled training part, evaluated on the
    hold-out part, and the metrics are printed as a table.

    NOTE(review): the models are fitted on unscaled features here, while
    run_all_motors_validation wraps them in a MinMaxScaler pipeline — confirm
    that the difference is intended.

    ### Args
    - label: name of the label column in `smoothed_data`.

    ### Returns
    Dict mapping `y_pred_<Model_Name>` to the hold-out predictions of that model.
    '''
    features = smoothed_data.drop(columns=label_columns+drop_list1_label1)
    target = smoothed_data[label]

    X_train, y_train, X_test, y_test = cross_validation_and_undersampling(features, target)

    # Silences every warning for the rest of the session, not only inside this function.
    warnings.filterwarnings('ignore')

    # (display name, estimator, hyperparameter grid) for each candidate.
    candidates = [
        ('Logistic Regression', LogisticRegression(class_weight='balanced'),
         {'C': [0.1, 1, 10]}),
        ('Decision Tree', DecisionTreeClassifier(class_weight='balanced'),
         {'max_depth': [None, 10, 20]}),
        ('Random Forest', RandomForestClassifier(class_weight='balanced'),
         {'n_estimators': [50, 100, 200]}),
        ('Support Vector Machine', SVC(class_weight='balanced'),
         {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}),
        ('Gradient Boosting', GradientBoostingClassifier(),
         {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.5, 1.0]}),
    ]

    model_metrics = {}
    model_predictions = {}

    for model_name, estimator, grid in candidates:
        # Hyperparameter tuning on the training part only.
        search = GridSearchCV(estimator, grid, cv=5, scoring='f1')
        search.fit(X_train, y_train)

        # Hold-out predictions of the refitted best estimator.
        y_pred = search.best_estimator_.predict(X_test)
        model_predictions[f'y_pred_{model_name.replace(" ", "_")}'] = y_pred

        model_metrics[model_name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
        }

    # Render the metrics as a markdown-style table.
    table_lines = [
        "| Model                    | Accuracy | Precision | Recall | F1    |\n",
        "|--------------------------|----------|-----------|--------|-------|\n",
    ]
    for model_name, metrics in model_metrics.items():
        table_lines.append(
            f"| {model_name:25} | {metrics['Accuracy']*100:.2f}%   | {metrics['Precision']*100:.2f}%   | {metrics['Recall']*100:.2f}%  | {metrics['F1']*100:.2f}% |\n"
        )
    summary_table = ''.join(table_lines)

    print(summary_table)

    return model_predictions

In [28]:
# Full training matrix and test matrix as plain arrays; run_all_motors_validation reads both as globals.
# NOTE(review): .values discards the column names, so both frames presumably have to
# share the same column order — confirm.
X = smoothed_data.drop(columns=label_columns+drop_list1_label1).values
X_test = df_test.drop(columns=drop_list1_label1).values

def run_all_motors_validation(label):
    ''' ### Description
    Fit every candidate classifier on the full training set and predict the test set.

    Each model is wrapped in a pipeline with a MinMaxScaler and run through a
    5-fold grid search scored by F1. Only the SVM grid offers more than one
    combination; the other grids fix a single value per parameter. The fit
    uses the global array `X`, the prediction the global array `X_test`.

    ### Args
    - label: name of the label column in `smoothed_data`.

    ### Returns
    Dict mapping `y_pred_<Model_Name>` to the test-set predictions of that model.
    '''
    y = smoothed_data[label]

    # Silences every warning for the rest of the session, not only inside this function.
    warnings.filterwarnings('ignore')

    # (display name, estimator, hyperparameter grid) for each candidate.
    candidates = [
        ('Logistic Regression', LogisticRegression(class_weight='balanced'),
         {'C': [0.1]}),
        ('Decision Tree', DecisionTreeClassifier(class_weight='balanced'),
         {'max_depth': [10]}),
        ('Random Forest', RandomForestClassifier(class_weight='balanced'),
         {'n_estimators': [50]}),
        ('Support Vector Machine', SVC(class_weight='balanced'),
         {'C': [0.1], 'gamma': ['scale', 'auto']}),
        ('Gradient Boosting', GradientBoostingClassifier(),
         {'n_estimators': [50], 'learning_rate': [0.5]}),
    ]

    model_predictions = {}

    for model_name, estimator, grid in candidates:
        # Step 1: normalization, step 2: the classifier itself.
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', estimator),
        ])
        # Route every grid entry to the 'model' step of the pipeline.
        param_grid = {f'model__{name}': values for name, values in grid.items()}

        search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
        search.fit(X, y)

        # Predict the test set with the refitted best pipeline.
        y_pred = search.best_estimator_.predict(X_test)
        model_predictions[f'y_pred_{model_name.replace(" ", "_")}'] = y_pred

    return model_predictions

# Motor 1

In [48]:
# Hold-out comparison of all candidate models for motor 1; prints the metrics table.
model_predictions = run_all_motors('data_motor_1_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 82.68%   | 17.08%   | 97.87%  | 29.08% |
| Decision Tree             | 99.18%   | 82.25%   | 98.58%  | 89.68% |
| Random Forest             | 99.42%   | 86.69%   | 99.29%  | 92.56% |
| Support Vector Machine    | 95.10%   | 42.28%   | 96.10%  | 58.72% |
| Gradient Boosting         | 99.20%   | 82.35%   | 99.29%  | 90.03% |



In [29]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
# NOTE(review): the hold-out table for this motor shows a higher F1 for Random Forest — confirm the choice.
model_predictions = run_all_motors_validation('data_motor_1_label')
y_pred1 = model_predictions['y_pred_Gradient_Boosting']

# Motor 2

In [None]:
# Hold-out comparison of all candidate models for motor 2; prints the metrics table.
model_predictions = run_all_motors('data_motor_2_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 89.33%   | 24.98%   | 96.81%  | 39.71% |
| Decision Tree             | 99.23%   | 82.84%   | 99.29%  | 90.32% |
| Random Forest             | 99.86%   | 96.89%   | 99.29%  | 98.07% |
| Support Vector Machine    | 95.28%   | 43.22%   | 96.10%  | 59.63% |
| Gradient Boosting         | 99.68%   | 92.13%   | 99.65%  | 95.74% |



In [32]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
# NOTE(review): the hold-out table for this motor shows a higher F1 for Random Forest — confirm the choice.
model_predictions = run_all_motors_validation('data_motor_2_label')
y_pred2 = model_predictions['y_pred_Gradient_Boosting']

# Motor 3

In [None]:
# Hold-out comparison of all candidate models for motor 3; prints the metrics table.
model_predictions = run_all_motors('data_motor_3_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 85.34%   | 19.50%   | 97.16%  | 32.48% |
| Decision Tree             | 99.46%   | 87.74%   | 98.94%  | 93.00% |
| Random Forest             | 99.88%   | 97.56%   | 99.29%  | 98.42% |
| Support Vector Machine    | 95.46%   | 44.21%   | 96.10%  | 60.56% |
| Gradient Boosting         | 99.83%   | 96.22%   | 99.29%  | 97.73% |



In [33]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
# NOTE(review): the hold-out table for this motor shows a higher F1 for Random Forest — confirm the choice.
model_predictions = run_all_motors_validation('data_motor_3_label')
y_pred3 = model_predictions['y_pred_Gradient_Boosting']

# Motor 4

In [None]:
# Hold-out comparison of all candidate models for motor 4; prints the metrics table.
model_predictions = run_all_motors('data_motor_4_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 85.56%   | 19.57%   | 95.74%  | 32.49% |
| Decision Tree             | 99.21%   | 82.79%   | 98.94%  | 90.15% |
| Random Forest             | 99.90%   | 97.90%   | 99.29%  | 98.59% |
| Support Vector Machine    | 95.64%   | 45.21%   | 95.39%  | 61.35% |
| Gradient Boosting         | 99.88%   | 97.56%   | 99.29%  | 98.42% |



In [34]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
# NOTE(review): the hold-out table for this motor shows a higher F1 for Random Forest — confirm the choice.
model_predictions = run_all_motors_validation('data_motor_4_label')
y_pred4 = model_predictions['y_pred_Gradient_Boosting']

# Motor 5

In [None]:
# Hold-out comparison of all candidate models for motor 5; prints the metrics table.
model_predictions = run_all_motors('data_motor_5_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 89.52%   | 25.46%   | 97.87%  | 40.41% |
| Decision Tree             | 99.11%   | 81.05%   | 98.58%  | 88.96% |
| Random Forest             | 99.83%   | 96.22%   | 99.29%  | 97.73% |
| Support Vector Machine    | 95.17%   | 42.70%   | 96.45%  | 59.19% |
| Gradient Boosting         | 99.34%   | 84.89%   | 99.65%  | 91.68% |



In [35]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
# NOTE(review): the hold-out table for this motor shows a higher F1 for Random Forest — confirm the choice.
model_predictions = run_all_motors_validation('data_motor_5_label')
y_pred5 = model_predictions['y_pred_Gradient_Boosting']

# Motor 6

In [None]:
# Hold-out comparison of all candidate models for motor 6; prints the metrics table.
model_predictions = run_all_motors('data_motor_6_label')

| Model                    | Accuracy | Precision | Recall | F1    |
|--------------------------|----------|-----------|--------|-------|
| Logistic Regression       | 83.50%   | 17.66%   | 96.81%  | 29.87% |
| Decision Tree             | 99.42%   | 86.69%   | 99.29%  | 92.56% |
| Random Forest             | 99.86%   | 97.21%   | 98.94%  | 98.07% |
| Support Vector Machine    | 95.83%   | 46.40%   | 96.10%  | 62.59% |
| Gradient Boosting         | 99.87%   | 96.90%   | 99.65%  | 98.25% |



In [36]:
# Refit on the full training set and keep the Gradient Boosting test predictions for the output CSV.
model_predictions = run_all_motors_validation('data_motor_6_label')
y_pred6 = model_predictions['y_pred_Gradient_Boosting']

In [37]:
# Predictions kept for motors 1-6, in motor order.
motor_predictions = (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6)

# A running row id first, then one label column per motor.
data = {'idx': range(len(y_pred1))}
for motor, predictions in enumerate(motor_predictions, start=1):
    data[f'data_motor_{motor}_label'] = predictions

df = pd.DataFrame(data)

In [39]:
# Write the per-motor predictions to motor_predictions.csv without the DataFrame index.
df.to_csv('motor_predictions.csv', index=False)