# EDA for energy anomaly detection data
Data sourced from Kaggle competition: https://www.kaggle.com/competitions/energy-anomaly-detection

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn import preprocessing
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


## Data structures

In [None]:
# Show the distinct values present in the `anomaly` column.
# NOTE(review): `df_train` is only assigned in the following cell (the
# read_csv cell), so this cell must be executed after that one.
anomaly_labels = df_train.anomaly.unique()
print(anomaly_labels)

In [None]:
# Locations of the CSV files read by this notebook.
train_features_path = "../DATA/energy-anomaly-detection/train_features.csv"
train_path = "../DATA/energy-anomaly-detection/train.csv"

# Training table used by the model cells in this notebook.
df_train = pd.read_csv(train_features_path)
# NOTE(review): `df_weather` and `df_md` are both loaded from train.csv.
# Judging by their names they were presumably meant to come from separate
# weather / metadata files -- confirm the intended file names.
df_weather = pd.read_csv(train_path)
df_md = pd.read_csv(train_path)

# Inspect the column dtypes of the training table.
print(df_train.dtypes)

## SVM

The first model tested is an SVM, chosen because it is a lightweight, off-the-shelf classifier.

In [None]:
# Fit a polynomial-kernel SVM on the standardized training features.

# Instantiate the classifier object
SVM_classifier = svm.SVC(kernel='poly', degree=5, C=10, coef0=1)
# Instantiate the scaler object
scaler = preprocessing.StandardScaler()

# Normalize data: the label column is removed here, so X_train holds features
# only.
# NOTE(review): assumes every remaining column is numeric and free of missing
# values -- TODO confirm against the dtypes printed above.
X_train = scaler.fit_transform(df_train.drop('anomaly', axis=1))

# Train the classifier.
# X_train is passed as-is: `fit_transform` returns a NumPy array by default
# (which has no `.drop`), and the 'anomaly' column was already removed before
# scaling, so there is nothing left to drop.
SVM_classifier.fit(X_train, df_train['anomaly'])

## XGBoost

In [None]:
# Two-step pipeline: target-encode the features, then classify with XGBoost.
encoder_step = ('encoder', TargetEncoder())
# can customize objective function with the objective parameter
classifier_step = ('clf', XGBClassifier(random_state=8))

estimators = [encoder_step, classifier_step]
pipe = Pipeline(steps=estimators)
# Leave the pipeline as the cell's last expression so the notebook renders it.
pipe

In [None]:
# Hyperparameter search space for the 'clf' (XGBClassifier) pipeline step.
# Keys use the `<step>__<param>` convention so they are routed to that step.
search_space = {
    'clf__max_depth': Integer(2, 8),
    'clf__learning_rate': Real(0.001, 1.0, prior='log-uniform'),
}

# Row / column sampling ratios all share the same [0.5, 1.0] range; each key
# gets its own Real instance.
sampling_params = (
    'subsample',
    'colsample_bytree',
    'colsample_bylevel',
    'colsample_bynode',
)
search_space.update(
    {f'clf__{param}': Real(0.5, 1.0) for param in sampling_params}
)

# Regularization terms all share the same [0.0, 10.0] range.
regularization_params = ('reg_alpha', 'reg_lambda', 'gamma')
search_space.update(
    {f'clf__{param}': Real(0.0, 10.0) for param in regularization_params}
)

# Bayesian search over the pipeline, scored by ROC AUC.
# in reality, you may consider setting cv and n_iter to higher values
opt = BayesSearchCV(
    pipe,
    search_space,
    cv=3,
    n_iter=10,
    scoring='roc_auc',
    random_state=8,
)

In [None]:
# Run the hyperparameter search: every column except 'anomaly' is a feature,
# and 'anomaly' itself is the target.
features = df_train.drop('anomaly', axis=1)
labels = df_train['anomaly']
opt.fit(features, labels)

In [None]:
from xgboost import plot_importance

# Take the second step of the best pipeline found by the search; each entry of
# `steps` is a (name, estimator) pair, and the estimator is the fitted model.
best_pipeline = opt.best_estimator_
xgboost_step = best_pipeline.steps[1]
_step_name, xgboost_model = xgboost_step

# Plot the ten most important features of the fitted model.
plot_importance(xgboost_model, max_num_features=10)

## LSTM