In [799]:
%%bash
set -e

# Install virtualenv through the same interpreter that runs it below, so the
# module is guaranteed to be importable by `python3 -m virtualenv`.
python3 -m pip install virtualenv

# Create virtual environment
python3 -m virtualenv myenv
source ./myenv/bin/activate

# Kernel support plus the analysis dependencies.
# `imbalanced-learn` is the actual distribution that provides `import imblearn`;
# the `imblearn` name on PyPI is only a placeholder package that pulls it in.
./myenv/bin/pip install ipykernel
./myenv/bin/pip install numpy pandas scikit-learn matplotlib seaborn imbalanced-learn

# Snapshot the resolved versions so the environment can be recreated later.
./myenv/bin/pip freeze > requirements.txt

# Register the environment as a Jupyter kernel. `python3` resolves to the
# virtualenv interpreter here because the environment was activated above.
python3 -m ipykernel install --user --name=myenv --display-name "Python3 (myenv)"

created virtual environment CPython3.10.12.final.0-64 in 139ms
  creator CPython3Posix(dest=/home/joao/inatel/P10/C318/Project/myenv, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/home/joao/snap/code/174/.local/share/virtualenv)
    added seed packages: asttokens==2.4.1, comm==0.2.2, contourpy==1.3.1, cycler==0.12.1, debugpy==1.8.8, decorator==5.1.1, distlib==0.3.9, exceptiongroup==1.2.2, executing==2.1.0, filelock==3.16.1, fonttools==4.55.0, imbalanced_learn==0.12.4, imblearn==0.0, ipykernel==6.29.5, ipython==8.29.0, jedi==0.19.2, joblib==1.4.2, jupyter_client==8.6.3, jupyter_core==5.7.2, kiwisolver==1.4.7, matplotlib==3.9.2, matplotlib_inline==0.1.7, nest_asyncio==1.6.0, numpy==2.1.3, packaging==24.2, pandas==2.2.3, parso==0.8.4, pexpect==4.9.0, pillow==11.0.0, pip==24.3.1, platformdirs==4.3.6, prompt_toolkit==3.0.48, psutil==6.1.0, ptyprocess==0.7.0, pure_eval==0.2.3, pygments

In [800]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [801]:
# Custom scoring function (based on classification_report)
def custom_scorer(y_true, y_pred):
    """Return the macro-averaged F1 score of the predictions.

    Labels use the +1 / -1 convention: +1 is a normal sample and -1 is an
    anomaly.

    Args:
        y_true: Ground-truth labels encoded as +1 / -1.
        y_pred: Predicted labels; any value other than 1 is treated as -1.

    Returns:
        The 'f1-score' entry of the 'macro avg' row of classification_report,
        as a float.
    """
    # Keep 1 as normal and map everything else to -1 so the labels are
    # strictly +1 / -1.
    y_pred = np.where(y_pred == 1, 1, -1)

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    # With output_dict=True the report is keyed by class label and by average
    # name; there is no top-level 'f1' key. The macro average is used because
    # it is always present, even when a class is missing from the evaluated
    # samples, and it weights both classes equally.
    return report['macro avg']['f1-score']

def train(X_train, Y_train, X_test, Y_test):
    """Grid-search IsolationForest hyper-parameters and return the best model.

    Args:
        X_train: Features used for the cross-validated search and final refit.
        Y_train: Ground-truth labels for X_train. IsolationForest ignores them
            while fitting, but GridSearchCV forwards them to the scorer, which
            needs them to compute a score.
        X_test: Not used by this function; kept for interface compatibility.
        Y_test: Not used by this function; kept for interface compatibility.

    Returns:
        A tuple (best_model, best_params): the estimator refit with the best
        parameter combination and the dict of those parameters.
    """
    model = IsolationForest(random_state=42)

    param_grid = {
        'n_estimators': [50, 100, 200],  # Number of trees
        'max_samples': ['auto', 0.5, 0.7],  # Fraction of samples to use for fitting each tree
        'contamination': ['auto', 0.01, 0.05, 0.1],  # Fraction of outliers
        'max_features': [1.0, 0.5]  # Fraction of features drawn to train each tree
    }

    scorer = make_scorer(custom_scorer, greater_is_better=True)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1, scoring=scorer)

    # Fit. The labels must be supplied: without them the scorer is called with
    # y_true=None and no candidate can be scored.
    grid_search.fit(X_train, Y_train)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    return best_model, best_params

def evaluate(y_pred, y_test):
    """Print the classification report and the confusion matrix.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels the predictions are compared against.
    """
    # Each section is a heading followed by the metric computed on
    # (truth, prediction), printed in this order.
    sections = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, y_pred))

### Reading both datasets

In [802]:
# Load the night and day feature tables from CSV files in the working directory.
df_night = pd.read_csv('features_night.csv')
df_day = pd.read_csv('features_day.csv')

### Working with the night dataset

In [803]:
# Inspect the available columns of the night dataset.
df_night.columns

Index(['Unnamed: 0', 'state', 'uRms', 'iRms', 'reactivePotency',
       'luminosityLux', 'fail'],
      dtype='object')

In [804]:
# Model inputs for the night data: everything except the 'Unnamed: 0' column
# (presumably the index saved with the CSV - confirm), 'state' and the 'fail' target.
df_features_night = df_night.drop(
    ['Unnamed: 0', 'state', 'fail'],
    axis='columns',
)
df_features_night.columns

Index(['uRms', 'iRms', 'reactivePotency', 'luminosityLux'], dtype='object')

In [805]:
# Take the 'fail' column as the target and check the class balance.
df_target_night = df_night['fail']
df_target_night.value_counts()

fail
False    813
True      43
Name: count, dtype: int64

In [806]:
# Encode the target as -1 for a failure (anomaly) and 1 for a normal sample.
# The encoding is built from the raw 'fail' column instead of from
# df_target_night itself so that re-running this cell is idempotent:
# re-encoding an already encoded series would turn every label into -1,
# because both -1 and 1 are truthy.
df_target_night = pd.Series(np.where(df_night['fail'], -1, 1), index=df_night.index)

In [807]:
# Baseline for the night data: an IsolationForest with default settings and a
# fixed seed, fit and predicted on the same feature table (no labels involved).
model = IsolationForest(random_state=42)
y_pred = model.fit_predict(df_features_night)

In [808]:
# Compare the baseline predictions with the encoded night labels.
evaluate(y_pred, df_target_night)

Classification Report:
              precision    recall  f1-score   support

          -1       0.12      0.26      0.16        43
           1       0.96      0.90      0.93       813

    accuracy                           0.86       856
   macro avg       0.54      0.58      0.54       856
weighted avg       0.92      0.86      0.89       856

Confusion Matrix:
[[ 11  32]
 [ 84 729]]


### Working with the day dataset

In [809]:
# Inspect the available columns of the day dataset.
df_day.columns

Index(['Unnamed: 0', 'state', 'uRms', 'iRms', 'reactivePotency',
       'luminosityLux', 'fail'],
      dtype='object')

In [810]:
# Model inputs for the day data: everything except the 'Unnamed: 0' column
# (presumably the index saved with the CSV - confirm), 'state' and the 'fail' target.
df_features_day = df_day.drop(
    ['Unnamed: 0', 'state', 'fail'],
    axis='columns',
)
df_features_day.columns

Index(['uRms', 'iRms', 'reactivePotency', 'luminosityLux'], dtype='object')

In [811]:
# Take the 'fail' column as the target and check the class balance.
df_target_day = df_day['fail']
df_target_day.value_counts()

fail
False    770
True      38
Name: count, dtype: int64

In [812]:
# Encode the target as -1 for a failure (anomaly) and 1 for a normal sample.
# The encoding is built from the raw 'fail' column instead of from
# df_target_day itself so that re-running this cell is idempotent:
# re-encoding an already encoded series would turn every label into -1,
# because both -1 and 1 are truthy.
df_target_day = pd.Series(np.where(df_day['fail'], -1, 1), index=df_day.index)

In [813]:
# Baseline for the day data: an IsolationForest with default settings and a
# fixed seed, fit and predicted on the same feature table (no labels involved).
# Note: this rebinds `model` and `y_pred`, replacing the night results.
model = IsolationForest(random_state=42)
y_pred = model.fit_predict(df_features_day)

In [814]:
# Compare the baseline predictions with the encoded day labels.
evaluate(y_pred, df_target_day)

Classification Report:
              precision    recall  f1-score   support

          -1       0.03      0.08      0.04        38
           1       0.95      0.86      0.90       770

    accuracy                           0.83       808
   macro avg       0.49      0.47      0.47       808
weighted avg       0.91      0.83      0.86       808

Confusion Matrix:
[[  3  35]
 [106 664]]
