In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import pandas as pd
import numpy as np
import json
import os

from util import Constants

In [10]:
datasets = []

# --------------------- > covariance from raw data  imagery and shapes
ddict = {
    "dataset_url": "./cov-raw/covariance_imagery_and_shapes_1750.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_imagery_and_shapes_1750.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from raw data only imagery
ddict = {
    "dataset_url": "./cov-raw/covariance_only_imagery_1250.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_only_imagery_1250.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from raw data only shapes
ddict = {
    "dataset_url": "./cov-raw/covariance_only_shapes_500.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_only_shapes_500.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from frequencies data imagery and shapes
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_imagery_and_shapes_1750.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_frequency_imagery_and_shapes_1750.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from frequencies data only imagery
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_only_imagery_1250.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_frequency_only_imagery_1250.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)


# --------------------- > covariance from frequencies data only shapes
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_only_shapes_500.csv",
    "result_dataset_url": "./results/DTC/dtc_covariance_frequency_only_shapes_500.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)

In [11]:
def read_dataset(dataset_url):
    df = pd.read_csv(dataset_url)
    df.reset_index(drop=True, inplace=True)
    df.drop(columns=df.columns[0], axis=1, inplace=True)
    df.head()
    return df


In [12]:
def convert_string_data_to_values(value_string):
  str_list = np.array(json.loads(value_string))
  return str_list

def convert_dataset(dataset,cols):
  df = dataset.copy()
  for col in cols:
    df[col] = df[col].apply(convert_string_data_to_values)
  return df

In [13]:
def split_dataset(dataset, subject,label):
    test_dataset = dataset.copy()
    test_dataset = test_dataset[test_dataset['subject'] == subject]
    test_dataset['label'] = test_dataset['label'].apply(lambda x: label if x == label else 'Imagery')
    return test_dataset

In [14]:
def feature_extract(X,y,k):
    # k is number of top features to select
    selector = SelectKBest(score_func=f_classif, k=k)
    return selector.fit_transform(X, y)

In [15]:
def calc_accuracy(dataset, data_cols, k=20):
    df = dataset.copy()
    le = preprocessing.LabelEncoder()  # Generates a look-up table
    le.fit(df.loc[:, "label"])
    df["label"] = le.transform(df["label"])

    X = []
    for channels in df[data_cols].values:
        scaled_channels = []
        for c in channels:
            scaled_channels.append(c.reshape(-1, 1))
        X.append(scaled_channels)

    X = np.asarray(X)
    X = X.reshape(X.shape[0], -1)
    y = [i for i in df["label"]]

    X = feature_extract(X, y, k)

    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    DTC = tree.DecisionTreeClassifier(criterion="entropy")
    DTC.fit(X_train, y_train)

    y_pred = DTC.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [16]:
for dataset in datasets:
    df = read_dataset(dataset_url=dataset["dataset_url"])
    df = convert_dataset(dataset=df, cols=dataset["data_cols"])

    subjects = df["subject"].unique()
    labels = df["label"].unique()
    max_avg_total = 0
    final_accuracies = {}
    for feature_extract_k in dataset["feature_extract_arg_k"]:
        accuracies = {}
        avg_total = 0
        labels_accuracies = {}
        for subject in subjects:
            accuracies[subject] = {}
            avg = 0
            for label in labels:
                if not label in labels_accuracies:
                    labels_accuracies[label] = 0

                # main part is here other is just for computing accuracies
                temp_dataset = split_dataset(df, subject, label)
                acc = calc_accuracy(
                    temp_dataset,
                    dataset["data_cols"],
                    k=feature_extract_k,
                )

                accuracies[subject][label] = acc
                labels_accuracies[label] += acc
                avg += acc
                avg_total += acc

            avg = avg / len(labels)
            accuracies[subject]["avg"] = avg * 100

        avg_total = avg_total / (len(subjects) * len(labels))
        if max_avg_total < avg_total:
            max_avg_total = avg_total
            labels_accuracies = {
                key: value / len(subjects) for key, value in labels_accuracies.items()
            }

            print(
                f"Total Average for { dataset['dataset_url'] } with k:{feature_extract_k} is {avg_total*100}"
            )
            print()

            accuracies["avg_total"] = {}
            accuracies["k"] = {"avg": feature_extract_k}
            for key, lAcc in labels_accuracies.items():
                accuracies["avg_total"][key] = lAcc
            accuracies["avg_total"]["avg"] = avg_total * 100

            final_accuracies = accuracies

    os.makedirs(os.path.dirname(dataset["result_dataset_url"]), exist_ok=True)
    pd.DataFrame(final_accuracies).to_csv(dataset["result_dataset_url"])

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:5 is 75.73232531879151

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:50 is 75.87381916329286

Total Average for ./cov-raw/covariance_only_imagery_1250.csv with k:5 is 75.90962295849512

Total Average for ./cov-raw/covariance_only_shapes_500.csv with k:5 is 74.75631511345797

Total Average for ./cov-raw/covariance_only_shapes_500.csv with k:10 is 76.34508348794063

Total Average for ./cov-freq/covariance_frequency_imagery_and_shapes_1750.csv with k:5 is 75.20890137431492

Total Average for ./cov-freq/covariance_frequency_imagery_and_shapes_1750.csv with k:10 is 76.17133493073341

Total Average for ./cov-freq/covariance_frequency_imagery_and_shapes_1750.csv with k:20 is 78.1822000055083

Total Average for ./cov-freq/covariance_frequency_only_imagery_1250.csv with k:5 is 75.18328789005481

Total Average for ./cov-freq/covariance_frequency_only_imagery_1250.csv with k:10 is 76.07975983915