# Criador de derivações

In [79]:
import pandas as pd
# from tqdm import tqdm
from time import time
# from timer import timer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Caminho para os arquivos treino e teste
path = "hai/hai-22.04/"

# Seleção dos arquivos. Atentar caso apenas arquivo de teste
file1 = "train2.csv"
file2 = "test1.csv"

# Variáveis para nome da derivação
hai = "hai22"
tr = "_te2"
te = "_te4"
mod = "_sfm"
# mod = "_ext-sfm"
# mod = "_rfcv10"

# Caractere de separação do csv (HAI-20:';')
sep = ','

# Lista de períodos para geração de features
periodos = [5,10] 

# Parâmentro de paralelização do scikit-learn
n_jobs = 14

# Parâmetro de aleatorização
random_state = 2023

## Funções

### Função para geração de features (Extended)

In [2]:
def featurize(df,periods):
    resultado = pd.DataFrame()
    for column in df.columns:
        if column in ['Attack','attack','attack_P1','attack_P2','attack_P3']:
            continue
        s = df[column]

        for span in periods:
            minv = s.rolling(window=span,min_periods=span).min()
            maxv = s.rolling(window=span,min_periods=span).max()
            smav = s.rolling(window=span,min_periods=span).mean()
            stdv = s.rolling(window=span,min_periods=span).std()
            aux_min = [] 
            aux_max = []
            aux_sma = []
            aux_std = []
            for i in range(span):
                pmin = s.rolling(window=i+1,min_periods=i+1).min()[:i+1]
                aux_min.append(pmin.tail(1).values[0])
                pmax = s.rolling(window=i+1,min_periods=i+1).max()[:i+1]
                aux_max.append(pmax.tail(1).values[0])
                psma = s.rolling(window=i+1,min_periods=i+1).mean()[:i+1]
                aux_sma.append(psma.tail(1).values[0])
                pstd = s.rolling(window=i+1,min_periods=i+1).std()[:i+1]
                aux_std.append(pstd.tail(1).values[0])
            minv[0:span-1] = pd.Series(aux_min)[0:span-1]
            maxv[0:span-1] = pd.Series(aux_max)[0:span-1]
            smav[0:span-1] = pd.Series(aux_sma)[0:span-1]
            stdv[0:span-1] = pd.Series(aux_std)[0:span-1].fillna(0)
            emav = pd.Series(s.ewm(span=span,adjust=False).mean())
            resultado["Min"+str(span)+"_"+column] = minv
            resultado["Max"+str(span)+"_"+column] = maxv
            resultado["SMA"+str(span)+"_"+column] = smav
            resultado["STD"+str(span)+"_"+column] = stdv
            resultado["EMA"+str(span)+"_"+column] = emav
        resultado = resultado.copy()
    return resultado.copy()

### Função para seleção por SelectFromModel

In [3]:
def getSFM(df,y):
    selector_RF = SelectFromModel(estimator=RandomForestClassifier(max_depth=20,n_jobs=n_jobs,random_state=random_state)).fit(df,y)
    data = df.loc[:,selector_RF.get_support()].copy()
    return data

### Função para seleção por Recursive Feature Elimination com Cross Validation de 10-Fold

In [4]:
def rfecv10(df,y):
    estimator = RandomForestClassifier(max_depth=20,random_state=random_state)
    min_features_to_select = 1
    rfecv = RFECV(estimator=estimator,cv=StratifiedKFold(10),n_jobs=n_jobs,scoring="f1",min_features_to_select=min_features_to_select)
    rfecv.fit(df, y)
    data = df.loc[:,rfecv.get_support()].copy()
    return data

## Carga do dataset ou par de dataset
Escolher conforme construção da derivação (train#.csv+test#.csv ou apenas test#.csv)

In [72]:
# PAR
df1 = pd.read_csv(path+file1,sep=sep)
df2 = pd.read_csv(path+file2,sep=sep)
dft = pd.concat([df1,df2],axis=0).reset_index(drop=True)
print(df1.shape)
print(df2.shape)
print(dft.shape)

(201600, 88)
(86400, 88)
(288000, 88)


In [81]:
# SINGLE
dft = pd.read_csv(path+file1,sep=sep)
dft.shape

(129600, 88)

## Checando informações, retirando features invariáveis e timestamps

In [82]:
dft.columns

Index(['timestamp', 'P1_B2004', 'P1_B2016', 'P1_B3004', 'P1_B3005', 'P1_B4002',
       'P1_B4005', 'P1_B400B', 'P1_B4022', 'P1_FCV01D', 'P1_FCV01Z',
       'P1_FCV02D', 'P1_FCV02Z', 'P1_FCV03D', 'P1_FCV03Z', 'P1_FT01',
       'P1_FT01Z', 'P1_FT02', 'P1_FT02Z', 'P1_FT03', 'P1_FT03Z', 'P1_LCV01D',
       'P1_LCV01Z', 'P1_LIT01', 'P1_PCV01D', 'P1_PCV01Z', 'P1_PCV02D',
       'P1_PCV02Z', 'P1_PIT01', 'P1_PIT01_HH', 'P1_PIT02', 'P1_PP01AD',
       'P1_PP01AR', 'P1_PP01BD', 'P1_PP01BR', 'P1_PP02D', 'P1_PP02R',
       'P1_PP04', 'P1_PP04SP', 'P1_SOL01D', 'P1_SOL03D', 'P1_STSP', 'P1_TIT01',
       'P1_TIT02', 'P1_TIT03', 'P2_24Vdc', 'P2_ATSW_Lamp', 'P2_AutoGO',
       'P2_AutoSD', 'P2_Emerg', 'P2_MASW', 'P2_MASW_Lamp', 'P2_ManualGO',
       'P2_ManualSD', 'P2_OnOff', 'P2_RTR', 'P2_SCO', 'P2_SCST', 'P2_SIT01',
       'P2_TripEx', 'P2_VIBTR01', 'P2_VIBTR02', 'P2_VIBTR03', 'P2_VIBTR04',
       'P2_VT01', 'P2_VTR01', 'P2_VTR02', 'P2_VTR03', 'P2_VTR04', 'P3_FIT01',
       'P3_LCP01D', 'P3_LCV01D', 

In [8]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180001 entries, 0 to 180000
Data columns (total 88 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   timestamp     180001 non-null  object 
 1   P1_B2004      180001 non-null  float64
 2   P1_B2016      180001 non-null  float64
 3   P1_B3004      180001 non-null  float64
 4   P1_B3005      180001 non-null  float64
 5   P1_B4002      180001 non-null  float64
 6   P1_B4005      180001 non-null  float64
 7   P1_B400B      180001 non-null  float64
 8   P1_B4022      180001 non-null  float64
 9   P1_FCV01D     180001 non-null  float64
 10  P1_FCV01Z     180001 non-null  float64
 11  P1_FCV02D     180001 non-null  float64
 12  P1_FCV02Z     180001 non-null  float64
 13  P1_FCV03D     180001 non-null  float64
 14  P1_FCV03Z     180001 non-null  float64
 15  P1_FT01       180001 non-null  float64
 16  P1_FT01Z      180001 non-null  float64
 17  P1_FT02       180001 non-null  float64
 18  P1_F

In [83]:
timestamp = 'timestamp' # 'time', conforme versão do HAI
dft.drop(columns=[timestamp],inplace=True)
dft = dft.loc[:, dft.std() != 0.0]
dft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129600 entries, 0 to 129599
Data columns (total 69 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   P1_B2004      129600 non-null  float64
 1   P1_B2016      129600 non-null  float64
 2   P1_B3004      129600 non-null  float64
 3   P1_B3005      129600 non-null  float64
 4   P1_B4002      129600 non-null  float64
 5   P1_B4005      129600 non-null  float64
 6   P1_B400B      129600 non-null  float64
 7   P1_B4022      129600 non-null  float64
 8   P1_FCV01D     129600 non-null  float64
 9   P1_FCV01Z     129600 non-null  float64
 10  P1_FCV02D     129600 non-null  float64
 11  P1_FCV02Z     129600 non-null  float64
 12  P1_FCV03D     129600 non-null  float64
 13  P1_FCV03Z     129600 non-null  float64
 14  P1_FT01       129600 non-null  float64
 15  P1_FT01Z      129600 non-null  float64
 16  P1_FT02       129600 non-null  float64
 17  P1_FT02Z      129600 non-null  float64
 18  P1_F

## Seleção de features
- SelectFromModel (SFM)
- Extended + SelectFromModel (executar Ext e depois SFM)
- RFECV10 (RFCV10)

### SFM

In [84]:
y = dft['Attack'] # 'attack' para HAI-20
#dft.drop(columns=['attack','attack_P1','attack_P2','attack_P3'],inplace=True) # HAI-20
dft.drop(columns=['Attack'],inplace=True) # HAI-22
data = getSFM(dft,y)
data.shape

(129600, 21)

### Extender - Geração de features
Não esquecer de executar o SFM após.

In [50]:
dft = pd.concat([dft,featurize(dft,periodos)],axis=1)
dft.shape

(288000, 727)

### Recursive Feature Elimination CrossValidation 10 fold

In [76]:
#y = dft['attack'] #HAI-20
y = dft['Attack']
#dft.drop(columns=['attack','attack_P1','attack_P2','attack_P3'],inplace=True) # HAI-20
dft.drop(columns=['Attack'],inplace=True)
data = rfecv10(dft,y)
dft.shape

(288000, 66)

## Criação do meta-atributo train
Para identificar registros de origem em arquivos train#.csv

In [77]:
data.loc[:df1.shape[0],'train'] = 1
data.loc[df1.shape[0]:,'train'] = 0
data['train'] = data['train'].astype(int)

## Salvar o dataset derivado

In [85]:
newfilename = hai+tr+te+mod+".csv"
newpath = "./datasets/"
y.name = "Attack"
data = pd.concat([data,y],axis=1)
data.to_csv(newpath+newfilename,sep=',',index=False)