# Importação das bibliotecas

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import RFE, VarianceThreshold
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.ensemble import RandomForestClassifier
import warnings
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pickle
import os
# Hide FutureWarning messages raised from seaborn so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
# Ignore FutureWarning from every module; this broader filter also covers the seaborn case above.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Coleta dos dados

In [3]:
# Raw CSV inputs: the "present year" file is used as the test split and the
# "previous years" file as the training split.
caminho_teste = "dados/air_system_present_year.csv"
caminho_treino = "dados/air_system_previous_years.csv"

teste = pd.read_csv(caminho_teste)
treino = pd.read_csv(caminho_treino)

# Tratando os dados

In [4]:
# Work on copies so the raw frames read from disk stay untouched.
treino_trat, test_trat = treino.copy(), teste.copy()

In [5]:
# Tratamento dos dados 
## Remover colunas em que valores ausentes ou iguais a 0 somam 40% ou mais das linhas
def remove_valores_ausentes_zero(treino, teste, label, limite=0.4, min_preenchidas=50):
    """Drop sparse feature columns, then drop rows with too few filled values.

    A feature column is removed from both frames when, in ``treino``, the
    share of rows that are missing or equal to 0 is at least ``limite``.
    Afterwards only rows holding at least ``min_preenchidas`` non-null values
    (counted over all remaining columns, the label included) are kept.

    Neither input frame is modified; new frames are returned.

    Parameters
    ----------
    treino : pandas.DataFrame
        Training data. The decision about which columns to drop is taken
        from this frame only.
    teste : pandas.DataFrame
        Test data. The columns dropped from ``treino`` are dropped here too.
    label : hashable
        Name of the target column, which is never dropped.
    limite : float, default 0.4
        Minimum fraction of missing-or-zero rows for a column to be dropped.
    min_preenchidas : int, default 50
        Minimum number of non-null values a row needs in order to be kept.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The filtered training and test frames, in that order.

    Raises
    ------
    KeyError
        If a column selected for removal is absent from ``teste``.
    """
    atributos = treino.loc[:, treino.columns != label]

    # Missing values never compare equal to 0, so the two counts cannot overlap.
    qtd_ausente_ou_zero = atributos.isnull().sum() + atributos.eq(0).sum()
    # Dividing a Series by 0 yields NaN instead of raising, and NaN >= limite is
    # False, so an empty training frame simply drops no column.
    proporcao = qtd_ausente_ou_zero / len(treino)
    colunas_removidas = proporcao[proporcao >= limite].index.tolist()

    # drop() without inplace returns new frames, leaving the caller's objects intact.
    treino = treino.drop(columns=colunas_removidas)
    teste = teste.drop(columns=colunas_removidas)

    # Keep rows with enough non-null values; copy() makes the results independent
    # frames rather than slices of the intermediate ones.
    treino = treino.loc[treino.notnull().sum(axis=1) >= min_preenchidas].copy()
    teste = teste.loc[teste.notnull().sum(axis=1) >= min_preenchidas].copy()
    return treino, teste


def colunas_numericas(treino, teste, label):
    """Convert every feature column to a numeric dtype in both frames.

    The columns to convert are all columns of ``treino`` except ``label``.
    Values that cannot be parsed as numbers become NaN
    (``pd.to_numeric(..., errors="coerce")``). The label column is left as is.

    Neither input frame is modified; converted copies are returned.

    Parameters
    ----------
    treino : pandas.DataFrame
        Training data; its column list defines which columns are converted.
    teste : pandas.DataFrame
        Test data; must contain every feature column of ``treino``.
    label : hashable
        Name of the target column, which is skipped.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Converted training and test frames, in that order.

    Raises
    ------
    KeyError
        If a feature column of ``treino`` is absent from ``teste``.
    """
    atributos = [nome for nome in treino.columns if nome != label]

    def _para_numerico(tabela):
        # Convert on a copy so the caller's frame keeps its original values.
        convertida = tabela.copy()
        for nome in atributos:
            convertida[nome] = pd.to_numeric(convertida[nome], errors="coerce")
        return convertida

    return _para_numerico(treino), _para_numerico(teste)

In [37]:
# Start again from fresh copies of the raw frames so this cell can be re-run on its own.
treino_trat, test_trat = treino.copy(), teste.copy()

# Coerce every feature column to a numeric dtype; unparseable values become NaN.
treino_trat, test_trat = colunas_numericas(treino_trat, test_trat, "class")

# Drop feature columns dominated by missing/zero values, then drop rows that
# have too few filled columns (thresholds are the function's defaults).
treino_trat, test_trat = remove_valores_ausentes_zero(treino_trat, test_trat, "class")

# Encode the target as integers: "pos" -> 1, any other value -> 0.
treino_trat["class"] = np.where(treino_trat["class"].eq("pos"), 1, 0)
test_trat["class"] = np.where(test_trat["class"].eq("pos"), 1, 0)

# Split each frame into target (y) and features (x).
y_treino, x_treino = treino_trat["class"], treino_trat.drop(columns="class")
y_teste, x_teste = test_trat["class"], test_trat.drop(columns="class")

# Report the resulting shapes as a quick sanity check.
for rotulo, dados in (
    ("Shape do x_treino:", x_treino),
    ("Shape do y_treino:", y_treino),
    ("Shape do x_teste:", x_teste),
    ("Shape do y_teste:", y_teste),
):
    print(rotulo, dados.shape)

Shape do x_treino: (59606, 99)
Shape do y_treino: (59606,)
Shape do x_teste: (15891, 99)
Shape do y_teste: (15891,)


In [34]:
# Preview the first five rows of the training features.
x_treino.head(5)

Unnamed: 0,aa_000,ac_000,ad_000,ag_004,ag_005,ag_006,ag_007,ah_000,an_000,ao_000,...,ed_000,ee_000,ee_001,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008
0,76698,2130706000.0,280.0,37250.0,1432864.0,3664156.0,1007684.0,2551696.0,4933296.0,3655166.0,...,2712.0,965866.0,1706908.0,1240520.0,493384.0,721044.0,469792.0,339156.0,157956.0,73224.0
1,33058,0.0,,18254.0,653294.0,1720800.0,516724.0,1393352.0,2560898.0,2127150.0,...,2334.0,664504.0,824154.0,421400.0,178064.0,293306.0,245416.0,133654.0,81140.0,97576.0
2,41040,228.0,100.0,1648.0,370592.0,1883374.0,292936.0,1234132.0,2371990.0,2173634.0,...,1020.0,262032.0,453378.0,277378.0,159812.0,423992.0,409564.0,320746.0,158022.0,95128.0
3,12,70.0,66.0,2212.0,3232.0,1872.0,0.0,2668.0,10184.0,7554.0,...,54.0,5670.0,1566.0,240.0,46.0,58.0,44.0,10.0,0.0,0.0
4,60874,1368.0,458.0,43752.0,1966618.0,1800340.0,131646.0,1974038.0,3230626.0,2618878.0,...,1176.0,404740.0,904230.0,622012.0,229790.0,405298.0,347188.0,286954.0,311560.0,433954.0


In [7]:
# Fixed seed for the stochastic steps (oversampling and the random forest used
# by RFE) so that the set of selected features is the same on every run.
SEED = 42

# Preprocessing / feature-selection pipeline. There is deliberately no final
# classifier: the fitted pipeline is used as a transformer that reduces the
# data to the 20 features chosen by RFE.
pipe = Pipeline(
    [
        # Fill remaining NaN values with the per-column median.
        ("imputer", SimpleImputer(strategy="median")),
        # Oversample the minority class up to a 1:1 ratio. In an imblearn
        # Pipeline a sampler acts while fitting only; transform() skips it.
        ("resampled", RandomOverSampler(sampling_strategy=1, random_state=SEED)),
        ("scaler", StandardScaler()),
        # NOTE(review): after StandardScaler every non-constant feature has
        # variance ~1, so this threshold presumably only removes constant
        # columns - confirm whether it was meant to run before scaling.
        ("variance", VarianceThreshold(threshold=0.3)),
        (
            "correlacao",
            # NOTE(review): selection_method="variance" also runs on scaled
            # data, where variances are ~equal - confirm the intended criterion.
            SmartCorrelatedSelection(selection_method="variance", threshold=0.7),
        ),
        (
            "rfe",
            RFE(
                estimator=RandomForestClassifier(random_state=SEED),
                n_features_to_select=20,
            ),
        ),
    ]
)

In [8]:
# Fit every pipeline step on the training split only, so nothing is learned from the test data.
pipe.fit(x_treino, y_treino)

In [17]:
# Apply the fitted pipeline to both splits (test first, then train) and report
# the resulting shapes.
x_teste_transformed, x_treino_transformed = (
    pipe.transform(dados) for dados in (x_teste, x_treino)
)
print("Shape dos dados de teste:", x_teste_transformed.shape)
print("Shape dos dados de treino:", x_treino_transformed.shape)

Shape dos dados de teste: (15891, 20)
Shape dos dados de treino: (59606, 20)


# Salvando os dados

In [21]:
# Bundle the transformed features and their targets into one dictionary and
# persist it with pickle for later use.
dados_tratado = {
    "x_treino": x_treino_transformed,
    "y_treino": y_treino,
    "x_teste": x_teste_transformed,
    "y_teste": y_teste,
}
with open("dados/dados_tratados.pickle", "wb") as arquivo:
    pickle.dump(dados_tratado, arquivo)