In [None]:
# Load the third-party nb_black extension (formats code cells with Black);
# it must be installed in the kernel's environment for this cell to succeed.
%load_ext nb_black

In [None]:
import csv
import itertools
import os
from dataclasses import dataclass
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from tqdm.notebook import tqdm

In [None]:
# Capture the paths of every *TRAIN.tsv under the archive directory via a shell
# pipeline: `ls -al` each match, sort numerically starting at field 5, collapse
# runs of spaces into tabs, and keep field 9.
# NOTE(review): this relies on the usual 9-column `ls -al` layout (field 5 =
# size, field 9 = path) and on paths containing no spaces — confirm on the
# platform where the notebook runs.
files = !(find ../../UCRArchive_2018/ -type f -name "*TRAIN.tsv" -exec ls -al {} \; | sort -k 5 -n | sed 's/ \+/\t/g' | cut -f 9)
files

In [None]:
@dataclass
class FileNames:
    """Paths of all files that belong to one dataset.

    Every field is a plain string. The notebook fills the path fields by
    string replacement on the training TSV path.
    """

    # Dataset name: the training file's base name without "_TRAIN.tsv".
    name: str

    # Raw tab-separated train / test files.
    train_file: str
    test_file: str

    # Comma-separated matrices derived from the training file path, one per
    # distance method (dtw, fastdtw, sakoechiba).
    train_dtw: str
    train_fastdtw: str
    train_sakoechiba: str

    # Same three methods, derived from the test file path.
    test_dtw: str
    test_fastdtw: str
    test_sakoechiba: str

In [None]:
# Build `sort_files`: one `[n_rows, n_cols, FileNames]` entry per dataset for
# which every file the experiment later reads is present on disk.
sort_files = []

for file_name in tqdm(files):
    name = os.path.basename(file_name).replace("_TRAIN.tsv", "")
    test_file = file_name.replace("TRAIN.tsv", "TEST.tsv")

    train_dtw = file_name.replace(".tsv", "_train_dtw.csv")
    train_fastdtw = file_name.replace(".tsv", "_train_fastdtw.csv")
    train_sakoechiba = file_name.replace(".tsv", "_train_sakoechibadtw.csv")

    # The test-side matrices also carry a "_train_" suffix in their names.
    # NOTE(review): presumably they hold test-vs-train distances — confirm
    # against the script that produced them.
    test_dtw = test_file.replace(".tsv", "_train_dtw.csv")
    test_fastdtw = test_file.replace(".tsv", "_train_fastdtw.csv")
    test_sakoechiba = test_file.replace(".tsv", "_train_sakoechibadtw.csv")

    # The raw test TSV is read by the experiment loop too, so it is checked
    # here alongside the matrices; otherwise a dataset without it would pass
    # this filter and fail later, in the middle of the run.
    required_paths = (
        test_file,
        train_dtw,
        train_fastdtw,
        train_sakoechiba,
        test_dtw,
        test_fastdtw,
        test_sakoechiba,
    )
    if not all(os.path.exists(path) for path in required_paths):
        continue

    fl = FileNames(
        name=name,
        train_file=file_name,
        test_file=test_file,
        train_dtw=train_dtw,
        train_fastdtw=train_fastdtw,
        train_sakoechiba=train_sakoechiba,
        test_dtw=test_dtw,
        test_fastdtw=test_fastdtw,
        test_sakoechiba=test_sakoechiba,
    )

    # Only the shape of the raw training file is kept; it drives the ordering
    # of the datasets in the next cell.
    frame = pd.read_csv(file_name, delimiter="\t", header=None)
    sort_files.append([*frame.shape, fl])

In [None]:
def _dataset_size_key(entry):
    """Return rows * cols**2 for a `[n_rows, n_cols, FileNames]` entry."""
    n_rows, n_cols, _ = entry
    return n_rows * n_cols ** 2


# Ascending order: datasets with the smallest key come first.
sort_files = sorted(sort_files, key=_dataset_size_key)

In [None]:
# Score every (distance method, classifier) pair on each dataset and write one
# CSV row per dataset holding the mean accuracy of each pair.
np.random.seed(42)

# newline="" is what the csv module documents for files handed to csv.writer;
# without it, platforms that translate "\n" end up with "\r\r\n" row endings.
with open(
    f"../logs/classification-random-results-{datetime.now().isoformat()}.csv",
    "w",
    newline="",
) as out_file:
    writer = csv.writer(out_file, delimiter=",")
    # Header columns follow the same method-major, classifier-minor order in
    # which the nested loops below append accuracies to each row.
    writer.writerow(
        [
            "dataset",
            *[
                "_".join(x)
                for x in itertools.product(
                    ("dtw", "fastdtw", "sakoechiba"),
                    ("SvmLinear", "SvmPoly", "SvmRbf", "RandomForest"),
                )
            ],
        ]
    )
    for n_samples, n_len, file_name in tqdm(sort_files):

        name = file_name.name

        row = [name]

        train_frame = pd.read_csv(file_name.train_file, delimiter="\t", header=None)
        test_frame = pd.read_csv(file_name.test_file, delimiter="\t", header=None)

        # Column 0 of each raw TSV is used as the label vector.
        classes = train_frame[0].values
        y_true = test_frame[0].values

        train_dtw = pd.read_csv(file_name.train_dtw, delimiter=",", header=None)
        train_fastdtw = pd.read_csv(file_name.train_fastdtw, delimiter=",", header=None)
        train_sakoechiba = pd.read_csv(
            file_name.train_sakoechiba, delimiter=",", header=None
        )

        test_dtw = pd.read_csv(file_name.test_dtw, delimiter=",", header=None)
        test_fastdtw = pd.read_csv(file_name.test_fastdtw, delimiter=",", header=None)
        test_sakoechiba = pd.read_csv(
            file_name.test_sakoechiba, delimiter=",", header=None
        )

        for method, X_train, X_test in tqdm(
            [
                ("dtw", train_dtw, test_dtw),
                ("fastdtw", train_fastdtw, test_fastdtw),
                ("sakoechiba", train_sakoechiba, test_sakoechiba),
            ],
            leave=False,
            desc=f"{name} shape {(n_samples, n_len)}",
        ):

            # Drop every row that contains a NaN; the masks are reused below to
            # keep the label vectors aligned with the surviving rows.
            train_nan = np.any(np.isnan(X_train), axis=1)
            test_nan = np.any(np.isnan(X_test), axis=1)

            X_train = X_train.loc[~train_nan]
            X_test = X_test.loc[~test_nan]

            # Nothing usable left on one side: record 0 for each of the four
            # classifiers so the row stays aligned with the header.
            if not np.all([*X_train.shape, *X_test.shape]):
                row += [0] * 4
                continue

            # 30% of the remaining training rows, but never zero: int()
            # truncates to 0 below four rows, which would hand the models an
            # empty feature matrix.
            n_chosen = max(1, int(0.3 * X_train.shape[0]))

            for c_name, model in tqdm(
                [
                    ("SvmLinear", SVC(kernel="linear", random_state=42)),
                    ("SvmPoly", SVC(kernel="poly", random_state=42)),
                    ("SvmRbf", SVC(kernel="rbf", random_state=42)),
                    (
                        "RandomForest",
                        RandomForestClassifier(n_estimators=100, random_state=42),
                    ),
                ],
                leave=False,
                desc=f"{method}",
            ):

                accs = []
                # Re-seed per classifier so every model is scored on the same
                # sequence of random subsets.
                np.random.seed(42)

                for _ in range(10):
                    chosen = np.random.choice(
                        X_train.shape[0],
                        n_chosen,
                        replace=False,
                    )

                    # Indexing a DataFrame with an integer array selects
                    # COLUMNS by label, so both frames are reduced to the same
                    # random column subset while all rows are kept.
                    # NOTE(review): presumably each column is the distance to
                    # one training series, and the matrices have at least as
                    # many columns as X_train has rows — confirm.
                    model.fit(X=X_train[chosen].values, y=classes[~train_nan])
                    predicted = model.predict(X_test[chosen].values)
                    accs.append(
                        accuracy_score(y_true=y_true[~test_nan], y_pred=predicted)
                    )

                row.append(np.mean(accs))

        writer.writerow(row)
        # Flush after each dataset so partial results survive an interrupted run.
        out_file.flush()