In [20]:
import pandas as pd
import numpy as np

## Common functions

In [21]:
def simplify_target(df):
    """
    Collapse the two failure-message indicator columns into one binary target.

    A row gets ``failure = 1`` when ``message0418DAA-1`` or ``message0422DAA-1``
    equals 1, and ``failure = 0`` otherwise (a missing value never equals 1).

    Parameters:
    - df (DataFrame): Input frame containing both message columns. It is left unmodified.

    Returns:
    - DataFrame: A new frame without the two message columns and with the ``failure`` column.
    """
    message_columns = ['message0418DAA-1', 'message0422DAA-1']

    # True for rows where at least one indicator is exactly 1.
    is_failure = df[message_columns].eq(1).any(axis=1)

    # Build the target on the reduced copy so the caller's frame does not
    # silently gain a 'failure' column as a side effect.
    return df.drop(columns=message_columns).assign(failure=np.where(is_failure, 1, 0))

In [22]:
def delete_constant_columns(df):
    """
    Remove, in place, every column that holds a single distinct value.

    NaN counts as a value here: an all-NaN column is treated as constant,
    while a column mixing one value with NaN is kept.

    Parameters:
    - df (DataFrame): Frame to prune. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    single_valued = [
        name for name in df.columns
        if df[name].nunique(dropna=False) == 1
    ]

    # Nothing to remove: hand the frame back untouched.
    if not single_valued:
        return df

    df.drop(columns=single_valued, inplace=True)
    return df

In [23]:
def drop_columns_with_2_unique_values(df):
    """
    Remove, in place, every column with exactly two distinct non-null values.

    The columns 'message0418DAA-1', 'message0422DAA-1', 'ds' and 'failure'
    are never removed, whatever their number of distinct values.

    Parameters:
    - df (DataFrame): Frame to prune. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    protected = ('message0418DAA-1', 'message0422DAA-1', 'ds', 'failure')

    # Decide first, then drop once; nunique() ignores NaN when counting.
    two_valued = [
        name for name in df.columns
        if name not in protected and df[name].nunique() == 2
    ]
    df.drop(columns=two_valued, inplace=True)
    return df

In [25]:
def fill_nan_values(df):
    """
    Forward-fill missing values in place.

    Each missing value is replaced by the last valid value above it in the
    same column. Missing values at the very start of a column have no earlier
    value to copy and therefore stay missing.

    Parameters:
    - df (DataFrame): Frame to fill. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    # DataFrame.ffill is the supported spelling; the `method` argument of
    # fillna is deprecated in recent pandas releases.
    df.ffill(inplace=True)
    return df


In [26]:
from sklearn.preprocessing import MinMaxScaler

def normalize(df):
    """
    Min-max scale, in place, every column except 'recording_time'.

    A fresh MinMaxScaler (default settings) is fitted on the selected columns
    and its output is written back over them. The selection is by name only,
    so any target column present in the frame is scaled as well.

    Parameters:
    - df (DataFrame): Frame to scale. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    columns_to_scale = []
    for name in df.columns:
        if name == 'recording_time':
            continue
        columns_to_scale.append(name)

    scaled_values = MinMaxScaler().fit_transform(df[columns_to_scale])
    df[columns_to_scale] = scaled_values
    return df

In [102]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_pca(data, target_column_name, n_components=10):
    """
    Pick representative features with PCA (Principal Component Analysis).

    Every column except the target is standardized, a PCA is fitted, and for
    each principal component the original feature with the largest absolute
    loading is selected.

    Parameters:
    - data (DataFrame): The input DataFrame containing features and the target variable.
    - target_column_name (str): The name of the target column (excluded from the PCA).
    - n_components (int): The number of principal components to retain (default is 10).

    Returns:
    - top_component_column_names (list): One feature name per principal component,
      ordered by decreasing explained variance. The same feature can appear more
      than once when it dominates several components.
    - explained_variances (ndarray): Explained variance ratio of each principal component.
    """

    # Split the data into features and target
    X = data.drop(columns=[target_column_name])

    # Standardize the features
    X_scaled = StandardScaler().fit_transform(X)

    # Fit PCA on the scaled data; only the fitted model is needed, not the
    # projected samples.
    pca = PCA(n_components=n_components)
    pca.fit(X_scaled)

    # Get the explained variances
    explained_variances = pca.explained_variance_ratio_

    # Name the components after the number actually fitted, so the labels stay
    # correct even when n_components is not a plain integer.
    component_names = [f'PC{i + 1}' for i in range(pca.n_components_)]

    # Feature loadings: rows are features, columns are components.
    loadings_df = pd.DataFrame(data=pca.components_.T, columns=component_names, index=X.columns)

    # Visit components from most to least explained variance (stable for ties).
    top_component_indices = (-explained_variances).argsort(kind='stable')

    # The sign of a principal component is arbitrary, so a feature's influence
    # is measured by the magnitude of its loading rather than its signed value.
    top_component_column_names = [
        loadings_df.iloc[:, i].abs().idxmax() for i in top_component_indices
    ]

    return top_component_column_names, explained_variances

## Time series

In [104]:
import datetime

def create_timestamp(df, start_time=None):
    """
    Add, in place, a synthetic hourly timestamp column named 'ds'.

    Row ``i`` (by position) receives ``start_time + i hours``; the values are
    generated from the row position, not read from the data.

    Parameters:
    - df (DataFrame): Frame to extend. It is modified in place.
    - start_time (datetime.datetime, optional): Timestamp assigned to the first
      row. Defaults to 2023-01-01 00:00.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    if start_time is None:
        start_time = datetime.datetime(2023, 1, 1, 0, 0)

    one_hour = datetime.timedelta(hours=1)
    df['ds'] = [start_time + position * one_hour for position in range(len(df))]
    return df

In [105]:
def create_hour_column(df):
    """
    Add, in place, an 'hour' column with each row's offset from the first row.

    The offset is the row's index label minus the first index label, so a
    frame whose index counts up from any starting value gets 0, 1, 2, ...

    Parameters:
    - df (DataFrame): Frame to extend. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object that was passed in.
    """
    first_label = df.index[0]
    offsets = df.index - first_label
    df['hour'] = offsets
    return df

In [111]:
# Time-series pipeline: load the aggregated data and run the shared cleaning steps.
time_series = pd.read_csv('aggregated_time_series.csv')

# simplify_target returns a new frame. Each later step edits that frame in
# place and returns it, so the names below all refer to one DataFrame object.
simplified_df = simplify_target(time_series)
dropped_constant_columns = delete_constant_columns(simplified_df)
dropped_columns_with_2_unique_values = drop_columns_with_2_unique_values(dropped_constant_columns)
filled = fill_nan_values(dropped_columns_with_2_unique_values)
# normalize scales every column except 'recording_time', 'failure' included.
normalized = normalize(filled)

In [112]:
# Attach the synthetic timestamp ('ds') and the row-offset ('hour') columns.
with_timestamp = create_timestamp(normalized)
with_hour = create_hour_column(with_timestamp)

# PCA input: a separate frame without 'ds'; 'hour' stays among the features.
feature_importance = with_hour.drop(columns=['ds'])
columns, explained_variances = perform_pca(
    data=feature_importance,
    target_column_name='failure',
    n_components=5,
)


In [117]:
# Keep the bookkeeping columns plus the PCA-selected features. The selection is
# de-duplicated (order preserved): perform_pca picks one feature per component
# independently and 'hour' is itself part of the PCA input, so a name can occur
# more than once, and repeated labels would duplicate columns in the CSV.
time_series = with_hour[list(dict.fromkeys(['ds', 'hour', 'failure'] + columns))]
time_series.to_csv('time_series_data.csv', index=False)

## Classifier

In [118]:
from imblearn.over_sampling import SMOTE

def oversample(df):
    """
    Balance the 'failure' classes with SMOTE and report the class counts.

    The frame is split into features (every column except 'failure') and the
    'failure' target, resampled with ``SMOTE(sampling_strategy='auto',
    random_state=42)``, and reassembled. The class distribution before and
    after resampling is printed.

    Parameters:
    - df (DataFrame): Frame containing the features and a 'failure' column.

    Returns:
    - DataFrame: A new frame with the resampled features and 'failure' column.
    """
    target = df['failure']
    features = df.drop(columns='failure')

    sampler = SMOTE(sampling_strategy='auto', random_state=42)
    features_balanced, target_balanced = sampler.fit_resample(features, target)

    # Reassemble features and target into a single frame.
    balanced = pd.DataFrame(features_balanced, columns=features.columns)
    balanced['failure'] = target_balanced

    print("Class Distribution Before Oversampling:")
    print(df['failure'].value_counts())

    print("\nClass Distribution After Oversampling:")
    print(balanced['failure'].value_counts())

    return balanced

In [120]:
# Classifier pipeline: the same cleaning steps as the time-series data, applied
# to the classifier dataset. The intermediate names are reused, so they now
# point at the classifier frame instead of the time-series one.
classifier_data = pd.read_csv('aggregated_classifier.csv')
simplified_df = simplify_target(classifier_data)
dropped_constant_columns = delete_constant_columns(simplified_df)
dropped_columns_with_2_unique_values = drop_columns_with_2_unique_values(dropped_constant_columns)
filled = fill_nan_values(dropped_columns_with_2_unique_values)
# normalize scales every column except 'recording_time', 'failure' included.
normalized = normalize(filled)
# Classes are balanced before feature selection, so the PCA below is fitted on
# the oversampled frame rather than on the original rows only.
oversampled = oversample(normalized)
feature_importance = oversampled.copy()
# Returns one feature name per component and the explained-variance ratios.
columns, explained_variances = perform_pca(feature_importance, 'failure', n_components=5)

Class Distribution Before Oversampling:
0.0    1562
1.0      18
Name: failure, dtype: int64

Class Distribution After Oversampling:
0.0    1562
1.0    1562
Name: failure, dtype: int64


In [122]:
# Keep the target plus the PCA-selected features. The selection is
# de-duplicated (order preserved): perform_pca picks one feature per component
# independently, so a name can occur more than once, and repeated labels would
# duplicate columns in the CSV.
classifier_data = oversampled[list(dict.fromkeys(['failure'] + columns))]
classifier_data.to_csv('classifier_data.csv', index=False)