In [1]:
import warnings
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from itertools import product
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from imblearn.metrics import geometric_mean_score
warnings.filterwarnings("ignore")

# Specify of hyperparameters

In [2]:
model_parameter_rules = {
    SVC: [
        (
            {},
            {
                "C": [0.5, 0.75, 1, 1.5],
                "kernel": ["linear"],
                "class_weight": ["balanced"],
            },
        ),
        ({"kernel":{"poly"}}, {"degree": [2, 3, 4]}), 
        ({"kernel": {"rbf", "sigmoid"}}, {"gamma": ["auto"]}),
    ],
    KNeighborsClassifier: [
        (
            {},
            {
                "n_neighbors": [3, 5, 7, 9, 11, 15],
                "weights": ["uniform", "distance"],
                "metric": [
                    "cityblock",
                    "cosine",
                    "l1",
                    "l2",
                    "nan_euclidean",
                ],
            },
        ),
        ({"metric": ["minkowski"]}, {"p": [1, 2, 3, 4]}),
    ],
    RandomForestClassifier: [
        (
            {},
            {
                "n_estimators": [50, 100, 200, 400, 800, 1600],
                "criterion": ["gini", "entropy", "log_loss"],
                "max_features": ["sqrt", "log2", None],
            },
        )
    ],
    XGBClassifier: [
        (
            {},
            {
                "n_estimators": [25, 50, 100, 200, 400, 800],
                "grow_policy": ["depthwise", "lossguide"],
                "learning_rate": [0.01, 0.1, 1],
            },
        )
    ]
}

# Extracted code from PADDEL library

In [3]:
class HashableDict(dict):
    def __hash__(self):
        return hash(tuple(sorted(self.items())))


def expand_rules(parameter_rules: list) -> dict:
    """Converts list of conditions and options to a dictionary that is easier to
    parse.

    Args:
        parameter_rules (list): Rules to expand.

    Returns:
        dict: Expanded rules.
    """
    expanded_rules = {}

    for conditions, parameters in parameter_rules:
        for values in product(*conditions.values()):
            simple_conditions = HashableDict(
                zip(conditions.keys(), [tuple([v]) for v in values])
            )

            if simple_conditions not in expanded_rules:
                expanded_rules[simple_conditions] = {}

            expanded_rules[simple_conditions].update(parameters)

    return expanded_rules


def matches(conditions: dict, other_conditions: dict) -> bool:
    """Determines if a condition dictionary matches within another.

    Args:
        conditions (dict): Conditions to match.
        other_conditions (dict): Conditions to match with.

    Returns:
        bool: If condition matches.
    """
    return conditions.items() > other_conditions.items()


def merge_parameters(one: dict, other: dict) -> dict:
    """Merge parameters from one dict to another.

    Args:
        one (dict): Dict to merge into.
        other (dict): Dict to merge from.

    Returns:
        dict: Merged dictionary.
    """
    one = one.copy()
    for key in other:
        if key in one:
            one[key] += other[key]
        else:
            one[key] = other[key]

    return one


def parse_hyper_parameters(parameter_rules: list, prefix="") -> list:
    """Parses custom formatted parameter rules to sklearn compatible parameter grid.

    Args:
        parameter_rules (list): Parameter rules.
        prefix (str, optional): Prefix to be used when naming the parameters. Useful when working with pipelines. Defaults to "".

    Returns:
        list: Parameter grid.
    """
    parameter_rules = expand_rules(parameter_rules)

    param_grid = []

    for conditions in parameter_rules:
        parameters = parameter_rules[conditions].copy()
        for other_conditions in parameter_rules:
            if matches(conditions, other_conditions):
                parameters = merge_parameters(
                    parameters, parameter_rules[other_conditions]
                )

        parameters.update({k: list(v) for k, v in conditions.items()})

        param_grid.append(parameters)

    # Rename to match model if using pipeline
    for params in param_grid:
        for key in list(params):
            params[f"{prefix}{key}"] = params.pop(key)

    return param_grid

In [4]:
def load_data():
    """
    Load and concatenate multiple CSV files into a single DataFrame.

    Returns:
    data (pandas.DataFrame): The concatenated DataFrame containing data from misc_df, classic_df, and fresh_df.
    """
    misc_df = pd.read_csv('misc_df.csv')
    classic_df = pd.read_csv('classic_df.csv')
    fresh_df = pd.read_csv('fresh_df.csv')
    data = pd.concat([misc_df, classic_df, fresh_df], axis=1)
    return data

data = load_data()
data.head()

Unnamed: 0,sample_name,date,hand,gender,age,handedness,video_path,detection_time,lent,amp,...,angle__fourier_entropy__bins_5,angle__fourier_entropy__bins_10,angle__fourier_entropy__bins_100,angle__permutation_entropy__dimension_3__tau_1,angle__permutation_entropy__dimension_4__tau_1,angle__permutation_entropy__dimension_5__tau_1,angle__permutation_entropy__dimension_6__tau_1,angle__permutation_entropy__dimension_7__tau_1,angle__query_similarity_count__query_None__threshold_0.0,angle__mean_n_absolute_max__number_of_maxima_7
0,CONTROL01,15-12-2021,DCHA,M,71,D,C:\Users\Usuario\Downloads\VideosParkinson\Vid...,20.139855,0,0,...,0.090729,0.136002,0.451339,1.663059,2.762289,3.90349,4.760591,5.200384,,0.667853
1,CONTROL01,15-12-2021,IZDA,M,71,D,C:\Users\Usuario\Downloads\VideosParkinson\Vid...,20.342861,1,0,...,0.136002,0.136002,0.380783,1.419083,2.237231,3.019232,3.724934,4.384708,,1.506679
2,CONTROL02,03-02-2022,DCHA,M,73,D,C:\Users\Usuario\Downloads\VideosParkinson\Vid...,19.95158,0,0,...,0.090729,0.136002,0.510201,1.354659,2.135135,2.891662,3.517762,4.090366,,0.423826
3,CONTROL02,03-02-2022,IZDA,M,73,D,C:\Users\Usuario\Downloads\VideosParkinson\Vid...,20.414286,0,0,...,0.170467,0.249958,0.890642,1.681355,2.850379,4.076822,5.025139,5.576625,,0.398549
4,CONTROL03,03-02-2022,DCHA,H,71,D,C:\Users\Usuario\Downloads\VideosParkinson\Vid...,7.859725,1,0,...,0.136002,0.181214,0.65537,1.659209,2.757113,3.899873,4.803792,5.43416,,0.422913


In [5]:


import matplotlib.pyplot as plt

def print_value_counts():
    """
    Prints the value counts for the 'lent' and 'amp' columns in the 'data' DataFrame.
    """
    value_counts_lent = data['lent'].value_counts()
    value_counts_amp = data['amp'].value_counts()

    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    ax[0].bar(value_counts_lent.index, value_counts_lent.values)
    ax[0].set_xlabel('Class')
    ax[0].set_ylabel('Instances')
    ax[0].set_title('Number of instances per class in "lent"')

    ax[1].bar(value_counts_amp.index, value_counts_amp.values)
    ax[1].set_xlabel('Class')
    ax[1].set_ylabel('Instances')
    ax[1].set_title('Number of instances per class in "amp"')

    plt.tight_layout()
    plt.show()

print_value_counts()

In [6]:
def preprocess_data(data):
    """
    Preprocesses the given data by removing rows and columns based on certain conditions.

    Args:
        data (pandas.DataFrame): The input data to be preprocessed.

    Returns:
        pandas.DataFrame: The preprocessed data.
    """
    index_to_remove = data[(data['lent'] == 4) | (data['amp'] == 4) | (data['age'] == 'XX')].index
    data = data.drop(index_to_remove)
    data = data[data['detection_time'] >= 15]
    columns_to_remove = ['sample_name', 'date', 'video_path','detection_time']
    data = data.drop(columns=columns_to_remove)
    return data
data=preprocess_data(data)

In [7]:
index_to_remove=data[(data['lent']==3)].index
data=data.drop(index_to_remove)

In [8]:
print_value_counts()

In [9]:
def encode_data(data):
    """
    Performs label encoding and data type conversion.

    Args:
        data (pandas.DataFrame): The input data to be preprocessed.

    Returns:
        pandas.DataFrame: The preprocessed data.
    """
    label_encoder = LabelEncoder()
    data['hand'] = label_encoder.fit_transform(data['hand'])
    data['gender'] = label_encoder.fit_transform(data['gender'])
    data['handedness'] = label_encoder.fit_transform(data['handedness'])
    data = data.drop(columns="angle__query_similarity_count__query_None__threshold_0.0")
    data['age'] = data['age'].astype(int)
    return data


In [10]:
data = encode_data(data)

In [11]:
data.head()

Unnamed: 0,hand,gender,age,handedness,lent,amp,angle__mean_speed,angle__frequency_of_maximums,angle__frequency_of_minimums,angle__average_of_maximums,...,angle__fourier_entropy__bins_3,angle__fourier_entropy__bins_5,angle__fourier_entropy__bins_10,angle__fourier_entropy__bins_100,angle__permutation_entropy__dimension_3__tau_1,angle__permutation_entropy__dimension_4__tau_1,angle__permutation_entropy__dimension_5__tau_1,angle__permutation_entropy__dimension_6__tau_1,angle__permutation_entropy__dimension_7__tau_1,angle__mean_n_absolute_max__number_of_maxima_7
0,0,1,71,0,0,0,0.842585,0.640393,0.591132,0.629751,...,0.090729,0.090729,0.136002,0.451339,1.663059,2.762289,3.90349,4.760591,5.200384,0.667853
1,1,1,71,0,1,0,5.537545,2.044839,2.135721,1.393629,...,0.090729,0.136002,0.136002,0.380783,1.419083,2.237231,3.019232,3.724934,4.384708,1.506679
2,0,1,73,0,0,0,1.251043,1.744251,1.695799,0.376918,...,0.090729,0.090729,0.136002,0.510201,1.354659,2.135135,2.891662,3.517762,4.090366,0.423826
3,1,1,73,0,0,0,0.771708,0.940298,0.940298,0.286869,...,0.079983,0.170467,0.249958,0.890642,1.681355,2.850379,4.076822,5.025139,5.576625,0.398549
5,1,0,71,0,1,0,0.820628,0.552925,0.552925,0.638055,...,0.045395,0.125256,0.125256,0.260704,1.598268,2.589804,3.590988,4.342525,4.828759,0.698242


# Model selection

## Slowness

In [None]:
import pickle as pkl
n_folds=5
random_state=42
k_neighbors=3
lent_data=data.drop(columns=['amp'])
X=lent_data.drop(columns=['lent'])
y=lent_data['lent']
all_results_smote_lent=[]
all_results_ros_lent=[]
models = {
    "svm_linear": SVC,
    "xgboost":XGBClassifier,
    "knn": KNeighborsClassifier,
    "rf": RandomForestClassifier
}
smote = SMOTE(random_state=random_state,k_neighbors=k_neighbors)
ros = RandomOverSampler(random_state=random_state)
skf=StratifiedKFold(n_splits=n_folds)
accuracy_scorer = make_scorer(accuracy_score)
f1_scorer = make_scorer(f1_score, average="weighted")
gmean_scorer = make_scorer(geometric_mean_score, average='weighted')
index_names=X.columns[SelectKBest(k=320).fit(X, y).get_support(indices=True)]
X=SelectKBest(k=320).fit_transform(X, y)
X=pd.DataFrame(X,columns=index_names)
i=0
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_res, y_res = ros.fit_resample(X_train, y_train)
    X_res_s, y_res_s = smote.fit_resample(X_train, y_train)
    for model_name, model in models.items():
        model_param_grid = parse_hyper_parameters(model_parameter_rules[model])
        grid = GridSearchCV(
            estimator=model(),
            param_grid=model_param_grid,
            scoring={
                "f1":f1_scorer,
            },
            refit="f1",
            n_jobs=10,
            cv=StratifiedKFold(n_splits=2),
            verbose=0
        )
        print(f"Doing dataset: full, model: {model_name}, features: 320")
        grid.fit(X_res, y_res)
        best_estimator = grid.best_estimator_
        with open(f"{model_name}_ros_lent_{i}.pkl", "wb") as f:
            pkl.dump(best_estimator, f)
        accuracy = accuracy_scorer(best_estimator,X_test, y_test)
        f1 = f1_scorer(best_estimator,X_test, y_test)
        gmean = gmean_scorer(best_estimator, X_test, y_test)
        
        all_results_ros_lent.append({
            "dataset": "full",
            "model": model_name,
            "features": 320,
            "accuracy": accuracy,
            "f1": f1,
            "gmean": gmean,
            "parameters": grid.best_params_
        })
        grid.fit(X_res_s, y_res_s)
        best_estimator = grid.best_estimator_
        with open(f"{model_name}_smote_lent_{i}.pkl", "wb") as f:
            pkl.dump(best_estimator, f)
        accuracy = accuracy_scorer(best_estimator,X_test, y_test)
        f1 = f1_scorer(best_estimator,X_test, y_test)
        gmean = gmean_scorer(best_estimator, X_test, y_test)
        all_results_smote_lent.append({
            "dataset": "full",
            "model": model_name,
            "features": 320,
            "accuracy": accuracy,
            "f1": f1,
            "gmean": gmean,
            "parameters": grid.best_params_
        })
    i+=1

[  2   3   4   5   6   8  10  17  24  29  34  35  43  47  49  50  54  55
  57  58  59  78  82  96 104 105 106 107 108 112 115 117 118 119 120 122
 123 124 126 127 129 130 131 133 134 135 137 138 139 141 152 156 162 166
 170 182 186 190 194 206 207 209 211 212 213 216 218 225 227 229 230 231
 233 234 235 241 243 245 246 247 249 250 251 253 255 257 259 261 263 264
 272 273 275 277 280 281 284 285 286 288 290 294 297 308 310 312 316 318
 319 322 325 326 327 328 329 334 337 339 342 343 345 346 348 354 356 362
 365 366 369 370 372 373 377 378 379 380 381 382 383 384 386 387 388 390
 392 393 394 397 398 399 402 403 407 408 409 413 414 415 416 417 420 424
 425 427 428 429 430 431 432 433 434 435 436 438 440 444 445 447 448 450
 453 454 459 460 461 467 472 473 475 477 478 481 482 484 486 487 488 491
 493 497 499 500 501 502 506 512 514 515 517 518 525 526 527 528 529 530
 532 534 535 539 543 544 545 546 547 550 551 553 554 570 577 580 582 585
 586 590 591 596 597 602 606 608 610 612 613 615 62

In [15]:
df = pd.DataFrame(all_results_smote_lent)
df.to_csv("./all_results_smote_lent.csv", index=False)
df = pd.DataFrame(all_results_ros_lent)
df.to_csv("./all_results_ros_lent.csv", index=False)

## Amplitude

In [18]:
n_folds=5
random_state=42
k_neighbors=3
amp_data=data.drop(columns=['lent'])
X=amp_data.drop(columns=['amp'])
y=amp_data['amp']
all_results_smote_amp=[]
all_results_ros_amp=[]
models = {
    "svm_linear": SVC,
    "xgboost":XGBClassifier,
    "knn": KNeighborsClassifier,
    "rf": RandomForestClassifier
}
smote = SMOTE(random_state=random_state,k_neighbors=k_neighbors)
ros = RandomOverSampler(random_state=random_state)
skf=StratifiedKFold(n_splits=n_folds)
accuracy_scorer = make_scorer(accuracy_score)
f1_scorer = make_scorer(f1_score, average="weighted")
gmean_scorer = make_scorer(geometric_mean_score, average='weighted')
index_names=X.columns[SelectKBest(k=320).fit(X, y).get_support(indices=True)]
X=SelectKBest(k=320).fit_transform(X, y)
X=pd.DataFrame(X,columns=index_names)
i=0
for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_res, y_res = ros.fit_resample(X_train, y_train)
    X_res_s, y_res_s = smote.fit_resample(X_train, y_train)
    for model_name, model in models.items():
        model_param_grid = parse_hyper_parameters(model_parameter_rules[model])
        grid = GridSearchCV(
            estimator=model(),
            param_grid=model_param_grid,
            scoring={
                "f1":f1_scorer,
            },
            refit="f1",
            n_jobs=10,
            cv=StratifiedKFold(n_splits=2),
            verbose=3
        )
        print(f"Doing dataset: full, model: {model_name}, features: 320")
        grid.fit(X_res, y_res)
        best_estimator = grid.best_estimator_        
        with open(f"{model_name}_ros_amp_{i}.pkl", "wb") as f:
            pkl.dump(best_estimator, f)

        accuracy = accuracy_scorer(best_estimator,X_test, y_test)
        f1 = f1_scorer(best_estimator,X_test, y_test)
        gmean = gmean_scorer(best_estimator, X_test, y_test)
        
        all_results_ros_amp.append({
            "dataset": "full",
            "model": model_name,
            "features": 320,
            "accuracy": accuracy,
            "f1": f1,
            "gmean": gmean,
            "parameters": grid.best_params_
        })
        grid.fit(X_res_s, y_res_s)
        best_estimator = grid.best_estimator_
        with open(f"{model_name}_smote_amp_{i}.pkl", "wb") as f:
            pkl.dump(best_estimator, f)
        accuracy = accuracy_scorer(best_estimator,X_test, y_test)
        f1 = f1_scorer(best_estimator,X_test, y_test)
        gmean = gmean_scorer(best_estimator, X_test, y_test)
        all_results_smote_amp.append({
            "dataset": "full",
            "model": model_name,
            "features": 320,
            "accuracy": accuracy,
            "f1": f1,
            "gmean": gmean,
            "parameters": grid.best_params_
        })
    i+=1

[  1   3  10  18  24  34  35  36  37  48  59  60  78  79  81  95  96  97
 110 111 112 121 122 123 132 134 135 136 137 139 140 143 144 146 147 148
 150 151 152 154 155 156 158 159 160 162 163 166 167 170 174 178 179 180
 182 183 184 186 187 188 190 191 192 195 196 199 200 204 209 210 211 213
 216 218 219 221 222 223 231 232 235 238 239 247 277 279 282 287 289 290
 294 295 297 298 300 301 307 308 309 310 312 314 316 318 319 321 322 324
 327 328 329 331 332 333 335 337 339 340 341 342 343 344 347 348 349 353
 354 355 356 357 362 363 365 366 369 371 372 373 375 378 380 381 383 385
 387 388 394 398 400 401 402 404 405 407 409 410 411 412 414 415 416 417
 418 419 420 422 424 425 426 427 430 437 438 439 440 441 442 443 444 445
 446 448 449 450 451 452 453 455 459 461 462 463 465 467 468 469 471 472
 473 475 497 498 500 501 508 509 510 512 518 519 520 521 522 525 527 528
 529 530 532 540 541 542 544 545 546 554 555 560 563 564 565 566 571 572
 574 577 578 581 585 586 588 589 591 593 595 597 59

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(all_results_smote_amp)
df.to_csv("./all_results_smote_amp.csv", index=False)
df = pd.DataFrame(all_results_ros_amp)
df.to_csv("./all_results_ros_amp.csv", index=False)