In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import optuna
import pickle
from sklearn.feature_selection import VarianceThreshold
from utils import functions

* Loading Train Data

In [2]:
# Read the training CSV; the literal string 'na' is treated as a missing value.
train_data = pd.read_csv('./databases/air_system_previous_years.csv')
train_data = train_data.replace(to_replace='na', value=np.nan)

# Encode the target labels as 0/1, then cast every column (target included) to float.
train_data['class'] = train_data['class'].map({'neg': 0, 'pos': 1})
train_data = train_data.astype(dtype=float)

# Separate the integer target from the feature matrix.
y_train = train_data['class'].astype(dtype=int)
train_data = train_data.drop(columns=['class'])

* Isolation Forest - Train Set

In [3]:
# NOTE: pickle.load can execute arbitrary code; only load artefacts from a trusted source.
with open('./artefatos/IsolationForest/pipeline.pkl', 'rb') as pkl_file:
    if_pipeline = pickle.load(pkl_file)

# Refit the unpickled pipeline on the training features, then store its
# decision-function output as an extra feature column.
if_pipeline = if_pipeline.fit(train_data)
train_anomaly_scores = if_pipeline.decision_function(train_data)
train_data['anomaly_scores'] = train_anomaly_scores

* PCA - Train Set

In [4]:
# NOTE: pickle.load can execute arbitrary code; only load artefacts from a trusted source.
with open('./artefatos/GMM/PCA_pipeline.pkl', 'rb') as pkl_file:
    pca_pipeline = pickle.load(pkl_file)

# Refit on the training features (which already carry 'anomaly_scores') and
# keep the transformed output in its own frame.
pca_pipeline = pca_pipeline.fit(train_data)
train_components = pca_pipeline.transform(train_data)
train_data_decomp = pd.DataFrame(data=train_components)

* GMM - Clustering - Train Set

In [5]:
# NOTE: pickle.load can execute arbitrary code; only load artefacts from a trusted source.
with open('./artefatos/GMM/GMM_model.pkl', 'rb') as pkl_file:
    gmm_model = pickle.load(pkl_file)

# Refit the model on the decomposed training data and attach the predicted
# cluster label to the (non-decomposed) training features.
gmm_model = gmm_model.fit(train_data_decomp)
train_clusters = gmm_model.predict(train_data_decomp)
train_data['cluster'] = train_clusters

* XGBoost - Classification - Train Set

In [6]:
# Unpickle the preprocessing pipeline, its output column names and the tuned
# configuration object.
# NOTE: pickle.load can execute arbitrary code; only load artefacts from a trusted source.
with open('./artefatos/Boosting/pipeline.pkl', 'rb') as pipe_file:
    xgbm_pipe = pickle.load(pipe_file)

with open('./artefatos/Boosting/pipeline_feature_names_out.pkl', 'rb') as names_file:
    xgbm_feature_names = pickle.load(names_file)

with open('./artefatos/XGBM/XGBM_best_config.pkl', 'rb') as config_file:
    xgbm_config = pickle.load(config_file)

# Fit the preprocessing pipeline on the training features and label its output
# with the stored column names.
train_transformed = xgbm_pipe.fit_transform(train_data)
train_data = pd.DataFrame(data=train_transformed, columns=xgbm_feature_names)

# First pruning step: project helper called with a 0.6 threshold. `to_drop` is
# kept so the same columns can be removed from the test set later.
train_data, to_drop = functions.remove_highly_correlated_features(
    df=train_data,
    threshold=0.6,
)

# Second pruning step: columns rejected by VarianceThreshold with its default settings.
var_fs = VarianceThreshold().fit(train_data)
to_drop_variance = train_data.columns[~var_fs.get_support()].to_list()
train_data = train_data.drop(columns=to_drop_variance)

# Train the classifier with the parameters stored on the loaded configuration.
xgbm_model = XGBClassifier(**xgbm_config.params).fit(train_data, y_train)

* Loading Test Data

In [7]:
# Read the test CSV; the literal string 'na' is treated as a missing value.
test_data = pd.read_csv('./databases/air_system_present_year.csv')
test_data = test_data.replace(to_replace='na', value=np.nan)

# Encode the target labels as 0/1, then cast every column (target included) to float.
test_data['class'] = test_data['class'].map({'neg': 0, 'pos': 1})
test_data = test_data.astype(dtype=float)

# Separate the integer target from the feature matrix.
y_test = test_data['class'].astype(dtype=int)
test_data = test_data.drop(columns=['class'])

* Isolation Forest - Test Set

In [8]:
# Score the test rows with the already-fitted pipeline; nothing is refit here.
test_anomaly_scores = if_pipeline.decision_function(test_data)
test_data['anomaly_scores'] = test_anomaly_scores

* PCA - Test Set

In [9]:
# Transform the test features with the already-fitted pipeline; nothing is refit here.
test_components = pca_pipeline.transform(test_data)
test_data_decomp = pd.DataFrame(data=test_components)

* GMM - Clustering - Test Set

In [10]:
# Assign each test row a cluster using the already-fitted model; nothing is refit here.
test_clusters = gmm_model.predict(test_data_decomp)
test_data['cluster'] = test_clusters

* XGBoost - Classification - Test Set

In [11]:
# Run the test features through the fitted preprocessing pipeline (transform
# only) and label the output with the stored column names.
test_transformed = xgbm_pipe.transform(test_data)
test_data = pd.DataFrame(data=test_transformed, columns=xgbm_feature_names)

# Remove the same columns that were pruned from the training set, in the same order.
test_data = test_data.drop(columns=to_drop).drop(columns=to_drop_variance)

y_pred = xgbm_model.predict(test_data)

# The last expression is the cell output: the project loss for the test predictions.
functions.loss_function(y_test, y_pred)

29065