In [8]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so that the np.random calls made later
# in the notebook (the random train/validation permutations) are repeatable.
np.random.seed(100)

In [9]:
def augment_data(input_data, spiral_radius):
    """Add normalised, polar-coordinate and rate-of-change columns to pen samples.

    Parameters
    ----------
    input_data : pd.DataFrame
        Must contain ``x_pos``, ``y_pos`` and ``time`` columns. Any index is
        accepted, including a filtered slice whose labels do not start at 0.
        The frame is not modified.
    spiral_radius : float
        Subtracted from, and then used to divide, both position columns, so a
        position equal to ``spiral_radius`` maps to 0.

    Returns
    -------
    pd.DataFrame
        A new frame with ``x_pos``, ``y_pos`` and ``time`` rewritten and the
        columns ``magnitude``, ``theta``, ``distance``, ``angular_velocity``
        and ``turns`` appended (in that order). An empty input gives an empty
        frame with the same columns.
    """
    # Centre and scale the positions.
    data_aug = input_data.assign(
        x_pos=(input_data["x_pos"] - spiral_radius) / spiral_radius,
        y_pos=(input_data["y_pos"] - spiral_radius) / spiral_radius,
    )

    # Take the first sample *by position*: a label lookup such as .loc[0]
    # raises KeyError when the frame is a slice that does not contain label 0.
    time_origin = data_aug["time"].iloc[0] if len(data_aug) else 0

    data_aug = data_aug.assign(
        # Time relative to the first sample, divided by 1000
        # (presumably milliseconds -> seconds; confirm against the data spec).
        time=(data_aug["time"] - time_origin) / 1000,
        magnitude=np.linalg.norm(data_aug[["x_pos", "y_pos"]], axis=1),
        theta=np.arctan2(data_aug["y_pos"], data_aug["x_pos"]),
    )

    data_aug = data_aug.assign(
        # Sample-to-sample change in magnitude (radial change, not path length).
        distance=data_aug["magnitude"].diff(),
        angular_velocity=data_aug["theta"].diff() / data_aug["time"].diff(),
    )

    # Running count of samples whose angular velocity exceeds 2*pi. The first
    # row is NaN, which compares False, so it never increments the counter.
    # Kept as a float ndarray so the assignments below stay positional.
    # NOTE(review): the counter is triggered by a *positive* jump in theta and
    # 2*pi is then *added*; if this is meant to unwrap the (-pi, pi] branch cut
    # of arctan2, the direction should be double-checked (np.unwrap may be what
    # is intended). Behaviour is deliberately left unchanged here.
    turns = (data_aug["angular_velocity"] > 2 * np.pi).cumsum().to_numpy(dtype=float)

    data_aug = data_aug.assign(
        turns=turns,
        theta=data_aug["theta"] + turns * 2 * np.pi,
    )

    # Recompute the angular velocity from the adjusted theta.
    data_aug = data_aug.assign(
        angular_velocity=data_aug["theta"].diff() / data_aug["time"].diff()
    )
    return data_aug

def create_feature(spiral_data):
    """Collapse a recording into one flat vector of per-column statistics.

    ``spiral_data`` is expected to be a ``pd.DataFrame``. For every column the
    mean, the sum and the root-mean-square are computed down the rows using
    pandas reductions (which skip NaN entries by default).

    Returns a 1-D ``np.ndarray`` laid out as ``[means..., sums..., rms...]``,
    each group following the frame's column order.
    """
    squared = spiral_data ** 2
    per_column_stats = (
        spiral_data.mean(axis=0),
        spiral_data.sum(axis=0),
        np.sqrt(squared.mean(axis=0)),
    )
    return np.concatenate([stat.values for stat in per_column_stats])

In [10]:
# Run the feature pipeline on a single control recording.
file_path = "hw_dataset/control/C_0001.txt"
# Semicolon-separated fields, in file order:
# X ; Y; Z; Pressure; GripAngle; Timestamp; Test ID
test_data = pd.read_csv(
    file_path,
    sep=";",
    names=["x_pos", "y_pos", "z_pos", "pressure", "grip_angle", "time", "test_id"],
)
# Separate the samples by test id (0 -> "static", 1 -> "dynamic" in this notebook's naming).
static_data = test_data[test_data["test_id"].eq(0)]
dynamic_data = test_data[test_data["test_id"].eq(1)]

# Augment the static-test samples, then reduce them to one feature vector.
static_data_aug = augment_data(static_data, spiral_radius=200)
spiral_feature = create_feature(static_data_aug)

# Load spiral dataset

In [11]:
import os
# Column names passed to read_csv via names=, so every line of a recording is
# parsed as data.
col_names = ["x_pos", "y_pos", "z_pos", "pressure", "grip_angle", "time", "test_id"]

base_path = "hw_dataset"

# Collect one row per recording and build the frame once at the end: growing a
# DataFrame row by row is quadratic and starts from object-dtype columns.
feature_rows = []
for dataset in ("control", "parkinson"):
    # Class label: 1 for the parkinson folder, 0 for control.
    label = 1 if dataset == "parkinson" else 0

    dataset_dir = os.path.join(base_path, dataset)
    # os.listdir returns entries in arbitrary order; sort them so the set_id
    # given to each file (and hence the seeded split) is the same on every run.
    file_names = sorted(os.listdir(dataset_dir))
    file_paths = [
        os.path.join(dataset_dir, file_name)
        for file_name in file_names
        if file_name.endswith(".txt")
    ]

    for idx, file_path in enumerate(file_paths):
        test_data = pd.read_csv(file_path, sep=";", names=col_names)
        # Only the test_id == 0 samples are used for the features.
        static_data = test_data.loc[test_data["test_id"] == 0]
        static_data_aug = augment_data(static_data, spiral_radius=200)
        spiral_feature = create_feature(static_data_aug)

        # Row layout: [label, set_id, predictor_0, predictor_1, ...].
        # set_id is the file's position within its own dataset folder.
        feature_rows.append(np.append([label, idx], spiral_feature))

# Derive the predictor count from the data; fall back to 36 when no recordings
# were found so the empty frame keeps the previously used column layout.
predictor_count = len(feature_rows[0]) - 2 if feature_rows else 36
feature_df = pd.DataFrame(
    data=feature_rows,
    columns=["label", "set_id"] + [f"predictor_{idx}" for idx in range(predictor_count)],
)
        

# Split into training / validation datasets

In [12]:
def partition_data(df, train_split):
    """Randomly split ``df`` into training and validation frames, per label.

    ``df`` needs a ``label`` column (0 = control, 1 = the other class) and a
    ``set_id`` column. For each label, ids ``0 .. count-1`` are shuffled with
    NumPy's global random state; the first ``int(count * train_split)`` ids go
    to training and the remaining ids go to validation. Rows are then selected
    by matching their ``set_id`` against those ids, so rows whose ``set_id``
    lies outside ``0 .. count-1`` end up in neither frame.

    Returns the tuple ``(train_df, val_df)``.
    """
    is_control = df["label"] == 0
    is_nd = df["label"] == 1

    def _draw_ids(group_size):
        # One permutation per group; ids not drawn for training form the
        # validation ids (kept in ascending order).
        shuffled = np.random.permutation(range(group_size))
        train_ids = shuffled[:int(group_size * train_split)]
        candidate_ids = np.arange(group_size)
        val_ids = candidate_ids[~np.isin(candidate_ids, train_ids)]
        return train_ids, val_ids

    # Control ids are drawn first, then the label-1 ids, so the random stream
    # is consumed in a fixed order.
    control_train_ids, control_val_ids = _draw_ids(int(is_control.sum()))
    nd_train_ids, nd_val_ids = _draw_ids(int(is_nd.sum()))

    def _select(control_ids, nd_ids):
        # A row is kept when its set_id is among the ids drawn for its label.
        control_rows = is_control & np.isin(df["set_id"], control_ids)
        nd_rows = is_nd & np.isin(df["set_id"], nd_ids)
        return df.loc[control_rows | nd_rows]

    train_df = _select(control_train_ids, nd_train_ids)
    val_df = _select(control_val_ids, nd_val_ids)
    return train_df, val_df

In [17]:
from sklearn.linear_model import LinearRegression

# Repeated random hold-out evaluation: draw a fresh 80/20 split each round,
# fit a linear regression on the 0/1 labels and score it on the held-out rows.
accuracies = []
for fold in range(1000):
    train_set, val_set = partition_data(feature_df, 0.8)

    # The predictors are every column except the label and the per-class file id.
    train_data = train_set.drop(columns=["label", "set_id"])
    val_data = val_set.drop(columns=["label", "set_id"])

    model = LinearRegression()
    model.fit(train_data, train_set["label"])

    # Threshold the regression output at 0.5 to obtain a binary class.
    predictions = model.predict(val_data)
    classification = [score > 0.5 for score in predictions]

    # Fraction of validation rows whose thresholded prediction equals the label.
    accuracy = np.mean(np.equal(classification, val_set["label"]))
    accuracies.append(accuracy)

In [18]:
# Report the mean validation accuracy over all rounds as a percentage.
# A single print call is used: print() returns None, so nesting it inside
# another print would emit an extra "None" line.
print(f"Accuracy: {np.mean(accuracies)*100}%")

Accuracy: 69.4375%
None
