In [3]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

The choice of a classification model for a dataset with datetime objects depends on the specific characteristics 
of the data and the task at hand. Here are a few options to consider:

Time Series Classification: 
If the dataset contains a time series of datetime objects and the task is to predict 
a categorical variable at each time step, then time series classification models like Recurrent Neural Networks (RNN), 
Long Short-Term Memory (LSTM), or Gated Recurrent Unit (GRU) can be used.

Feature-based Classification: 
If the dataset has datetime objects as one of several features used to predict a categorical 
target variable, then feature-based classification models like Decision Trees, Random Forest, or Gradient Boosting can be used.

Distance-based Classification: 
If the dataset has datetime objects that need to be compared or clustered based on their similarity, then distance-based 
classification models like k-Nearest Neighbors (k-NN) can be used.

Ensemble-based Classification: 
If the dataset contains datetime objects along with other types of features, then ensemble-based classification models like 
AdaBoost or XGBoost can be used to combine the strengths of multiple models.

In summary, the choice of a classification model for a dataset with datetime objects depends on the specific characteristics 
of the data, the task at hand, and the types of other features available in the dataset.

In [21]:
df = pd.read_csv("data/manipulated_data.csv", delimiter=";", header=3)

In [22]:
df["Date/Time"] = pd.to_datetime(df["Date/Time"])

In [77]:
df.drop(["Manual Outlier"], axis=1)
df

Unnamed: 0,Date/Time,"Water level, Nap (cm)",Manual Outlier,Manual outlier
0,2022-03-08 00:00:04,24,-1,-1
1,2022-03-08 00:30:00,24,-1,-1
2,2022-03-08 01:00:00,24,-1,-1
3,2022-03-08 01:30:00,20,-1,-1
4,2022-03-08 02:00:00,20,-1,-1
...,...,...,...,...
17518,2023-03-07 23:00:00,26,-1,-1
17519,2023-03-07 23:30:00,25,-1,-1
17520,2023-03-08 00:00:04,25,-1,-1
17521,2023-03-08 00:30:00,25,-1,-1


In [78]:
df["Manual outlier"] = -1

df.loc[25:30]

Unnamed: 0,Date/Time,"Water level, Nap (cm)",Manual Outlier,Manual outlier
25,2022-03-08 12:30:00,26,-1,-1
26,2022-03-08 13:00:00,26,1,-1
27,2022-03-08 13:30:04,26,-1,-1
28,2022-03-08 14:00:00,26,-1,-1
29,2022-03-08 14:30:00,24,-1,-1
30,2022-03-08 15:00:00,23,-1,-1


In [79]:
df["Manual outlier"][26] = 1
df["Manual outlier"][83] = 1
df["Manual outlier"][122] = 1
df.loc[25:30]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Manual outlier"][26] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Manual outlier"][83] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Manual outlier"][122] = 1


Unnamed: 0,Date/Time,"Water level, Nap (cm)",Manual Outlier,Manual outlier
25,2022-03-08 12:30:00,26,-1,-1
26,2022-03-08 13:00:00,26,1,1
27,2022-03-08 13:30:04,26,-1,-1
28,2022-03-08 14:00:00,26,-1,-1
29,2022-03-08 14:30:00,24,-1,-1
30,2022-03-08 15:00:00,23,-1,-1


In [80]:
X = df["Water level, Nap (cm)"]
y = df["Manual outlier"]

In [81]:
print(X.shape)
print(y.shape)

(17523,)
(17523,)


In [82]:
X = X.values
X = X.reshape((-1, 1))
print(X.shape)

(17523, 1)


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

In [84]:
# Do I need zero_division parameter?
pred = clf.predict(X_test)

acc = accuracy_score(pred, y_test)
recall = recall_score(pred, y_test, average="weighted")
precision = precision_score(pred, y_test, average="weighted")
f1 = f1_score(pred, y_test, average="weighted")

In [85]:
print(f"Accuracy: {round(acc, 3)}, F1: {round(f1, 3)}, Recall: {round(recall, 3)}, Precision: {round(precision, 3)}")

Accuracy: 1.0, F1: 1.0, Recall: 1.0, Precision: 1.0


In [86]:
trials_dictionaries = []

In [87]:
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    n_neighb = trial.suggest_int("n_neighb", 1, 10)

    clf = KNeighborsClassifier(n_neighbors=n_neighb)
    clf.fit(X_train, y_train)

    return clf.score(X_test, y_test)

In [88]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[32m[I 2023-04-05 14:57:32,041][0m A new study created in memory with name: no-name-824d322b-899d-4c3c-b487-138219a52ac3[0m
[32m[I 2023-04-05 14:57:32,265][0m Trial 0 finished with value: 1.0 and parameters: {'n_neighb': 10}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:32,431][0m Trial 1 finished with value: 1.0 and parameters: {'n_neighb': 10}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:32,581][0m Trial 2 finished with value: 1.0 and parameters: {'n_neighb': 9}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:32,733][0m Trial 3 finished with value: 1.0 and parameters: {'n_neighb': 7}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:32,880][0m Trial 4 finished with value: 1.0 and parameters: {'n_neighb': 10}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:33,021][0m Trial 5 finished with value: 1.0 and parameters: {'n_neighb': 6}. Best is trial 0 with value: 1.0.[0m
[32m[I 2023-04-05 14:57:33,16

In [89]:
def detailed_objective(trial):
    # Use same code objective to reproduce the best model

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    n_neighb = trial.suggest_int("n_neighb", 1, 10)

    clf = KNeighborsClassifier(n_neighbors=n_neighb)
    clf.fit(X_train, y_train)

    # calculate more evaluation metrics
    pred = clf.predict(X_test)

    acc = accuracy_score(pred, y_test)
    recall = recall_score(pred, y_test, average="weighted")
    precision = precision_score(pred, y_test, average="weighted")
    f1 = f1_score(pred, y_test, average="weighted")

    return acc, f1, recall, precision

In [90]:
eval_metrics = detailed_objective(study.best_trial)

In [91]:
trials_dict = {
    "X": "Date/Time", 
    "y": "Water level, Nap (cm)", 
    "Best score": round(study.best_value, 3), 
    "F1": eval_metrics[1],
    "Recall": eval_metrics[2],
    "Precision": eval_metrics[3],
    "Best parameters": study.best_params,
    }

trials_dictionaries.append(trials_dict)
print("X, y")
print(f"Score: {round(study.best_value, 3)}")
print(f"Best parameters: {study.best_params}")
print(f"F1: {round(eval_metrics[1], 3)}")
print(f"Recall: {round(eval_metrics[2], 3)}")
print(f"Precision: {round(eval_metrics[3], 3)}")

X, y
Score: 1.0
Best parameters: {'n_neighb': 10}
F1: 1.0
Recall: 1.0
Precision: 1.0
