<center> <h1> Seleção de parquets correspondentes a trecho e anterioridade a falhas e a não falhas</h1> </center>

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import random


def generate_random_data(file_name, total_lines, num_columns, lines_per_file):
    # Create a list of column names
    column_names = []
    for i in range(1, num_columns + 1):
        column_names.append(f"sensor_{i}")
    
    # Initialize a counter for file names
    file_counter = 1
    
    # Generate random lines and save them into separate Parquet files
    current_line = 0
    while current_line < total_lines:
        # Determine how many lines to generate for this file
        lines_to_generate = min(lines_per_file, total_lines - current_line)
        
        # Create a dictionary for data
        data = {}
        for col in column_names:
            data[col] = []
            for _ in range(lines_to_generate):
                data[col].append(random.random())
        
        # Create a dataframe with random data
        df = pd.DataFrame(data)
        
        # Save the dataframe as a Parquet file
        table = pa.Table.from_pandas(df)
        filename = f'{file_name}_{file_counter}.parquet'
        pq.write_table(table, filename)
        
        # Increment the file counter and update the current_line count
        file_counter += 1
        current_line += lines_to_generate
    
    print(f"{file_counter - 1} Parquet files have been generated.")

    return

### Geração de parquets de falhas correspondentes ao período de uma semana

In [2]:
failure_file_name = "falha"
failure_total_lines = 3_000_000 #30_000_000 for all the files that are in our Drive
failure_num_columns = 8
failure_lines_per_file = 1_000_000  # You can adjust this as needed

#generate_random_data(failure_file_name, failure_total_lines, failure_num_columns, failure_lines_per_file)

### Geração de parquets de não falha correspondentes ao período de uma semana

In [3]:
non_failure_file_name = "nao_falha"
non_failure_total_lines = 3_000_000 #30_000_000 for all the files that are in our Drive
non_failure_num_columns = 8
non_failure_lines_per_file = 1_000_000  # You can adjust this as needed

#generate_random_data(non_failure_file_name, non_failure_total_lines, non_failure_num_columns, non_failure_lines_per_file)

### Implementação da função de média móvel

In [4]:
import pandas as pd

def moving_average(df, column_names, window_size):
    """
    Calculate the moving average for specified columns in a DataFrame.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame.
    - column_names: list of str
        The column names for which moving averages will be calculated.
    - window_size: int
        The size of the moving average window.

    Returns:
    - pandas DataFrame
        A new DataFrame with the same columns as the input, including moving average columns.
    """
    result_df =  pd.DataFrame()

    # Calculate moving averages for each specified column
    for column in column_names:
        result_df[f'{column}_moving_avg'] = df[column].rolling(window=window_size).mean()

    result_df.dropna(inplace=True)
    result_df.reset_index(drop=True, inplace=True)

    return result_df

### Verificação de dimensão do dataframe antes da aplicação da média móvel

In [5]:
df_cru_falha = pd.read_parquet("falha_1.parquet")
df_cru_falha.head()

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8
0,0.180437,0.543236,0.058747,0.641238,0.456573,0.152388,0.398071,0.505388
1,0.797652,0.16619,0.183609,0.070591,0.814511,0.439338,0.587446,0.890407
2,0.988906,0.737004,0.44212,0.950468,0.928465,0.945063,0.172777,0.839158
3,0.756042,0.967428,0.315361,0.089875,0.51746,0.68376,0.052969,0.112268
4,0.120998,0.035749,0.973682,0.457801,0.217553,0.741013,0.748934,0.759808


In [6]:
df_cru_falha.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   sensor_1  1000000 non-null  float64
 1   sensor_2  1000000 non-null  float64
 2   sensor_3  1000000 non-null  float64
 3   sensor_4  1000000 non-null  float64
 4   sensor_5  1000000 non-null  float64
 5   sensor_6  1000000 non-null  float64
 6   sensor_7  1000000 non-null  float64
 7   sensor_8  1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


### Função que recebe um conjunto de parquets e retorna um dataframe contendo a média móvel de todos eles concatenados

In [7]:
def dataframe_moving_average(file_names, column_names, window_size):
    
    moving_average_total_df = pd.DataFrame()
    
    for file_name in file_names:
        
        temp_dataframe = pd.read_parquet(file_name)
    
        moving_average_temp_dataframe = moving_average(temp_dataframe, column_names, window_size)
    
        print(file_name)
        print(moving_average_temp_dataframe.head(3))
        print("-------------------------------------------------------------------------------\n")
    
        moving_average_total_df = pd.concat([moving_average_total_df, moving_average_temp_dataframe], ignore_index=True)
    
        return moving_average_total_df

In [8]:
failure_files = ["falha_1.parquet", "falha_2.parquet", "falha_3.parquet"]
failure_column_names = ["sensor_1",	"sensor_2",	"sensor_3",	"sensor_4",	"sensor_5",	"sensor_6",	"sensor_7",	"sensor_8"]
window_size = 500_000

### Aplicação da função de média móvel para todos os parquets de falha

In [9]:
total_failure_moving_average = dataframe_moving_average(failure_files, failure_column_names, window_size)

falha_1.parquet
   sensor_1_moving_avg  sensor_2_moving_avg  sensor_3_moving_avg  \
0             0.499472             0.500254             0.500461   
1             0.499472             0.500255             0.500461   
2             0.499472             0.500256             0.500462   

   sensor_4_moving_avg  sensor_5_moving_avg  sensor_6_moving_avg  \
0             0.499855             0.500409             0.499935   
1             0.499855             0.500408             0.499936   
2             0.499857             0.500408             0.499937   

   sensor_7_moving_avg  sensor_8_moving_avg  
0             0.499887             0.499807  
1             0.499888             0.499807  
2             0.499887             0.499807  
-------------------------------------------------------------------------------



In [10]:
total_failure_moving_average.head(3)

Unnamed: 0,sensor_1_moving_avg,sensor_2_moving_avg,sensor_3_moving_avg,sensor_4_moving_avg,sensor_5_moving_avg,sensor_6_moving_avg,sensor_7_moving_avg,sensor_8_moving_avg
0,0.499472,0.500254,0.500461,0.499855,0.500409,0.499935,0.499887,0.499807
1,0.499472,0.500255,0.500461,0.499855,0.500408,0.499936,0.499888,0.499807
2,0.499472,0.500256,0.500462,0.499857,0.500408,0.499937,0.499887,0.499807


In [11]:
total_failure_moving_average.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500001 entries, 0 to 500000
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   sensor_1_moving_avg  500001 non-null  float64
 1   sensor_2_moving_avg  500001 non-null  float64
 2   sensor_3_moving_avg  500001 non-null  float64
 3   sensor_4_moving_avg  500001 non-null  float64
 4   sensor_5_moving_avg  500001 non-null  float64
 5   sensor_6_moving_avg  500001 non-null  float64
 6   sensor_7_moving_avg  500001 non-null  float64
 7   sensor_8_moving_avg  500001 non-null  float64
dtypes: float64(8)
memory usage: 30.5 MB


### Aplicação da função de média móvel para todos os parquets de não falha

In [12]:
non_failure_files = ["nao_falha_1.parquet", "nao_falha_2.parquet", "nao_falha_3.parquet"]
non_failure_column_names = ["sensor_1",	"sensor_2",	"sensor_3",	"sensor_4",	"sensor_5",	"sensor_6",	"sensor_7",	"sensor_8"]
window_size = 500_000

In [13]:
total_non_failure_moving_average = dataframe_moving_average(non_failure_files, non_failure_column_names, window_size)

nao_falha_1.parquet
   sensor_1_moving_avg  sensor_2_moving_avg  sensor_3_moving_avg  \
0             0.500135             0.499956             0.500434   
1             0.500134             0.499955             0.500433   
2             0.500132             0.499957             0.500433   

   sensor_4_moving_avg  sensor_5_moving_avg  sensor_6_moving_avg  \
0             0.499399             0.500705             0.499995   
1             0.499400             0.500705             0.499994   
2             0.499401             0.500705             0.499996   

   sensor_7_moving_avg  sensor_8_moving_avg  
0             0.499860             0.499485  
1             0.499860             0.499485  
2             0.499859             0.499486  
-------------------------------------------------------------------------------



In [None]:
total_non_failure_moving_average.head(3)

In [None]:
total_non_failure_moving_average.info()

### Identificação dos dataframes com etiquetas de falha (1) e não falha (0)

In [14]:
total_failure_moving_average["label"] = [1]*len(total_non_failure_moving_average)
total_failure_moving_average.head(7)

Unnamed: 0,sensor_1_moving_avg,sensor_2_moving_avg,sensor_3_moving_avg,sensor_4_moving_avg,sensor_5_moving_avg,sensor_6_moving_avg,sensor_7_moving_avg,sensor_8_moving_avg,label
0,0.499472,0.500254,0.500461,0.499855,0.500409,0.499935,0.499887,0.499807,1
1,0.499472,0.500255,0.500461,0.499855,0.500408,0.499936,0.499888,0.499807,1
2,0.499472,0.500256,0.500462,0.499857,0.500408,0.499937,0.499887,0.499807,1
3,0.499471,0.500257,0.500463,0.499855,0.500407,0.499936,0.499887,0.499807,1
4,0.499471,0.500256,0.500464,0.499857,0.500408,0.499936,0.499888,0.499807,1
5,0.499472,0.500257,0.500464,0.499857,0.500409,0.499936,0.499889,0.499808,1
6,0.499472,0.500259,0.500464,0.499857,0.50041,0.499936,0.499889,0.499809,1


In [15]:
total_non_failure_moving_average["label"] = [0]*len(total_non_failure_moving_average)
total_non_failure_moving_average.head(7)

Unnamed: 0,sensor_1_moving_avg,sensor_2_moving_avg,sensor_3_moving_avg,sensor_4_moving_avg,sensor_5_moving_avg,sensor_6_moving_avg,sensor_7_moving_avg,sensor_8_moving_avg,label
0,0.500135,0.499956,0.500434,0.499399,0.500705,0.499995,0.49986,0.499485,0
1,0.500134,0.499955,0.500433,0.4994,0.500705,0.499994,0.49986,0.499485,0
2,0.500132,0.499957,0.500433,0.499401,0.500705,0.499996,0.499859,0.499486,0
3,0.500133,0.499956,0.500433,0.4994,0.500704,0.499996,0.49986,0.499485,0
4,0.500132,0.499956,0.500431,0.4994,0.500703,0.499996,0.499859,0.499486,0
5,0.500131,0.499956,0.500431,0.4994,0.500703,0.499994,0.499859,0.499487,0
6,0.50013,0.499957,0.500433,0.499398,0.500704,0.499995,0.499858,0.499487,0


### Fusão dos dataframes com as médias móveis totais de uma semana de dados coletada uma semana anterior a um evento de falha e o mesmo para não falha

In [16]:
trainning_df = pd.concat([total_failure_moving_average, total_non_failure_moving_average], ignore_index=True)

In [17]:
trainning_df.head(7)

Unnamed: 0,sensor_1_moving_avg,sensor_2_moving_avg,sensor_3_moving_avg,sensor_4_moving_avg,sensor_5_moving_avg,sensor_6_moving_avg,sensor_7_moving_avg,sensor_8_moving_avg,label
0,0.499472,0.500254,0.500461,0.499855,0.500409,0.499935,0.499887,0.499807,1
1,0.499472,0.500255,0.500461,0.499855,0.500408,0.499936,0.499888,0.499807,1
2,0.499472,0.500256,0.500462,0.499857,0.500408,0.499937,0.499887,0.499807,1
3,0.499471,0.500257,0.500463,0.499855,0.500407,0.499936,0.499887,0.499807,1
4,0.499471,0.500256,0.500464,0.499857,0.500408,0.499936,0.499888,0.499807,1
5,0.499472,0.500257,0.500464,0.499857,0.500409,0.499936,0.499889,0.499808,1
6,0.499472,0.500259,0.500464,0.499857,0.50041,0.499936,0.499889,0.499809,1


In [18]:
trainning_df.tail(7)

Unnamed: 0,sensor_1_moving_avg,sensor_2_moving_avg,sensor_3_moving_avg,sensor_4_moving_avg,sensor_5_moving_avg,sensor_6_moving_avg,sensor_7_moving_avg,sensor_8_moving_avg,label
999995,0.500926,0.499974,0.499675,0.500581,0.500241,0.500515,0.499867,0.499606,0
999996,0.500926,0.499975,0.499675,0.500581,0.50024,0.500514,0.499868,0.499606,0
999997,0.500926,0.499975,0.499676,0.50058,0.500241,0.500513,0.499869,0.499607,0
999998,0.500927,0.499975,0.499676,0.50058,0.500242,0.500513,0.499868,0.499606,0
999999,0.500927,0.499974,0.499676,0.500581,0.500242,0.500514,0.499867,0.499606,0
1000000,0.500926,0.499973,0.499675,0.500581,0.500242,0.500513,0.499867,0.499606,0
1000001,0.500927,0.499974,0.499675,0.500581,0.500241,0.500512,0.499866,0.499606,0


### Treinamento do classificador

In [19]:
from pycaret.classification import *

In [20]:
experiment_model_1_week = setup(trainning_df, target='label')

# Compare and evaluate different models
best_model_1_week = compare_models()

Unnamed: 0,Description,Value
0,Session id,3441
1,Target,label
2,Target type,Binary
3,Original data shape,"(1000002, 9)"
4,Transformed data shape,"(1000002, 9)"
5,Transformed train set shape,"(700001, 9)"
6,Transformed test set shape,"(300001, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875
knn,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.797
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.48
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.355
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.406
qda,Quadratic Discriminant Analysis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.892
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,42.0
lda,Linear Discriminant Analysis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.571
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.784
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.968


In [21]:
trainning_results_1_week_df = pull()

In [22]:
final_model_1_week = create_model(best_model_1_week)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
save_model(final_model_1_week, model_name='failure_1_week', model_only=True)

Model Successfully Saved


(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=3441, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'failure_1_week.pkl')