In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import pandas as pd
import numpy as np
import json
import os

from util import Constants

In [18]:
datasets = []

# --------------------- > covariance from raw data  imagery and shapes
ddict = {
    "dataset_url": "./cov-raw/covariance_imagery_and_shapes_1750.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_imagery_and_shapes_1750.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
    ],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from raw data only imagery
ddict = {
    "dataset_url": "./cov-raw/covariance_only_imagery_1250.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_only_imagery_1250.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
    ],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from raw data only shapes
ddict = {
    "dataset_url": "./cov-raw/covariance_only_shapes_500.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_only_shapes_500.csv",
    "feature_extract_arg_k": [5, 10, 20, 50],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
    ],
    "data_cols": Constants.data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from frequencies data imagery and shapes
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_imagery_and_shapes_1750.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_frequency_imagery_and_shapes_1750.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
        400,
        500,
        600,
    ],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)

# --------------------- > covariance from frequencies data only imagery
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_only_imagery_1250.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_frequency_only_imagery_1250.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
        400,
        500,
        600,
    ],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)


# --------------------- > covariance from frequencies data only shapes
ddict = {
    "dataset_url": "./cov-freq/covariance_frequency_only_shapes_500.csv",
    "result_dataset_url": "./results/randomForest/rf_covariance_frequency_only_shapes_500.csv",
    "feature_extract_arg_k": [5, 10, 20, 50, 100, 150, 180, 200, 250, 300, 500],
    "random_forest_estimators": [
        5,
        10,
        15,
        20,
        25,
        30,
        35,
        40,
        50,
        55,
        80,
        100,
        150,
        200,
        300,
        400,
        500,
        600,
    ],
    "data_cols": Constants.frequency_data_columns,
}
datasets.append(ddict)

In [19]:
def read_dataset(dataset_url):
    df = pd.read_csv(dataset_url)
    df.reset_index(drop=True, inplace=True)
    df.drop(columns=df.columns[0], axis=1, inplace=True)
    df.head()
    return df


In [20]:
def convert_string_data_to_values(value_string):
  str_list = np.array(json.loads(value_string))
  return str_list

def convert_dataset(dataset,cols):
  df = dataset.copy()
  for col in cols:
    df[col] = df[col].apply(convert_string_data_to_values)
  return df

In [21]:
def split_dataset(dataset, subject,label):
    test_dataset = dataset.copy()
    test_dataset = test_dataset[test_dataset['subject'] == subject]
    test_dataset['label'] = test_dataset['label'].apply(lambda x: label if x == label else 'Imagery')
    return test_dataset

In [22]:
def feature_extract(X,y,k):
    # k is number of top features to select
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    return X_new

In [23]:
def calc_accuracy(dataset, data_cols,n_estimators = 30, k=20):
    df = dataset.copy()
    le = preprocessing.LabelEncoder()  # Generates a look-up table
    le.fit(df.loc[:, "label"])
    df["label"] = le.transform(df["label"])

    X = []
    for channels in df[data_cols].values:
        scaled_channels = []
        for c in channels:
            scaled_channels.append(c.reshape(-1, 1))
        X.append(scaled_channels)

    X = np.asarray(X)
    X = X.reshape(X.shape[0], -1)
    y = [i for i in df["label"]]

    X = feature_extract(X, y, k)

    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    RFC = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    RFC.fit(X_train, y_train)

    y_pred = RFC.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [24]:
for dataset in datasets:
    df = read_dataset(dataset_url=dataset["dataset_url"])
    df = convert_dataset(dataset=df, cols=dataset["data_cols"])

    subjects = df["subject"].unique()
    labels = df["label"].unique()
    max_avg_total = 0
    final_accuracies = {}
    for feature_extract_k in dataset["feature_extract_arg_k"]:
        for estimators in dataset["random_forest_estimators"]:
            accuracies = {}
            avg_total = 0
            labels_accuracies = {}
            for subject in subjects:
                accuracies[subject] = {}
                avg = 0
                for label in labels:
                    if not label in labels_accuracies:
                        labels_accuracies[label] = 0

                    # main part is here other is just for computing accuracies
                    temp_dataset = split_dataset(df, subject, label)
                    acc = calc_accuracy(
                        temp_dataset,
                        dataset["data_cols"],
                        n_estimators=estimators,
                        k=feature_extract_k,
                    )
                    acc = calc_accuracy(
                        temp_dataset,
                        dataset["data_cols"],
                        k=feature_extract_k,
                    )

                    accuracies[subject][label] = acc
                    labels_accuracies[label] += acc
                    avg += acc
                    avg_total += acc

                avg = avg / len(labels)
                accuracies[subject]["avg"] = avg * 100

            avg_total = avg_total / (len(subjects) * len(labels))
            if max_avg_total < avg_total:
                max_avg_total = avg_total
                labels_accuracies = {
                    key: value / len(subjects) for key, value in labels_accuracies.items()
                }

                print(
                    f"Total Average for { dataset['dataset_url'] } with k:{feature_extract_k} and estimators: {estimators} is {avg_total*100}"
                )
                print()

                accuracies["avg_total"] = {}
                accuracies["k"] = {"avg": feature_extract_k}
                accuracies["estimators"] = {"avg": estimators}
                for key, lAcc in labels_accuracies.items():
                    accuracies["avg_total"][key] = lAcc
                accuracies["avg_total"]["avg"] = avg_total * 100

                final_accuracies = accuracies

    os.makedirs(os.path.dirname(dataset["result_dataset_url"]), exist_ok=True)
    pd.DataFrame(final_accuracies).to_csv(dataset["result_dataset_url"])

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:5 and estimators: 5 is 83.01564350436533

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:10 and estimators: 5 is 83.14935690875542

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:20 and estimators: 5 is 83.8452587512738

Total Average for ./cov-raw/covariance_imagery_and_shapes_1750.csv with k:50 and estimators: 5 is 84.24281858492388

Total Average for ./cov-raw/covariance_only_imagery_1250.csv with k:5 and estimators: 5 is 83.14116334793029

Total Average for ./cov-raw/covariance_only_imagery_1250.csv with k:10 and estimators: 5 is 83.44418739155583

Total Average for ./cov-raw/covariance_only_imagery_1250.csv with k:20 and estimators: 5 is 84.27118620727644

Total Average for ./cov-raw/covariance_only_shapes_500.csv with k:5 and estimators: 5 is 82.61952333380903

Total Average for ./cov-raw/covariance_only_shapes_500.csv with k:10 and estimators: 5 is 83.7