In [30]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report, 
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

In [2]:
# Set the known parent package
parent_dir = str(Path().resolve().parents[0])
sys.path.insert(0, parent_dir)

from data_loader import (
    data_loader,
    binned_distr,
    dev_mad_var,
    features_accelerometer,
    features_cosine,
    features_freq,
    features_temporal,
    vector_magnitude
)


In [None]:
data_processor = data_loader.TimeWindowSegmenter(
    df_path="../data_loader/real_world_2016.parquet",
    window_size=10,
    step_size=4,
    source_sampling_rate=50,
    time_column="timestamp",
    id_column='person_id',
    activity_column='activity_label',
    clean_columns=False,
    fix_timestamps=False,
    acc_columns=('acc_x', 'acc_y', 'acc_z'),
    gyr_columns=('gyr_x', 'gyr_y', 'gyr_z')
)
data_processor.resample_to(25)

Resampling: 100%|██████████| 71/71 [00:03<00:00, 22.07it/s]


In [10]:
def extract_features_from_window(window, fs=20, axes = ['ac_x', 'ac_y', 'ac_z', 'g_x', 'g_y', 'g_z']):
    features = []
    feature_names = []
    data_freq_dict = {}
    
    freq_funcs = [
        ('dom_freq', features_freq.dominant_frequency, [fs]),    
        ('entropy', features_freq.spectral_entropy, [fs]),       
        ('energy', features_freq.spectral_energy, []),           
        ('centroid', features_freq.spectral_centroid, [fs]),     
        ('bandwidth', features_freq.spectral_bandwidth, [fs]),   
        ('flatness', features_freq.spectral_flatness, [fs]),     
        ('slope', features_freq.spectral_slope, [fs]),           
        ('rolloff', features_freq.spectral_rolloff, [fs]),       
        ('band_ratio', features_freq.band_energy_ratio, [fs])    
    ]
    
    for axis in axes:
        signal = window[axis].astype(float).values
        for fname, func, extra_args in freq_funcs:
            data_freq_dict[f"{axis}_{fname}"] = func(signal, *extra_args)
            # features.append(func(signal, *extra_args))
            # feature_names.append(f"{axis}_{fname}")

    # rozkład wartości względnie dla okna
    data_binned_all_dict = binned_distr.calculate_binned_distribution_multi_axis(
        window=window, 
        bins=10,
        axes=axes
    )

    data_binned_sep_dict = {f"{key}_bin{bin_id}":data for key, items in data_binned_all_dict.items() for bin_id, data in enumerate(items)}

    dev_mad_var_dict = dev_mad_var.calculate_statistics_multi_axis(
        window=window,
        axes=axes
    )
    
    acc_features_dict = features_accelerometer.extract_acc_features(
        window=window,
        axes=axes
    )

    cosine_features_dict = features_cosine.extract_cosine_distances(
        window=window,
        axes=axes
    )

    temporal_features_dict = features_temporal.extract_temporal_features(
        window=window,
        axes=axes[3:]
    )



    # features.extend([val for axis in axes for _, val in enumerate(data_binned_distr[axis])])
    # feature_names.extend([f"{axis}_bin{i}" for axis in axes for i in range(len(data_binned_distr[axis]))])


    
    return {**data_freq_dict, **data_binned_sep_dict, **dev_mad_var_dict, **acc_features_dict, **cosine_features_dict, **temporal_features_dict}

In [None]:
X = []
Y =[]
feature_names = None
for window in data_processor.segment():
        results = extract_features_from_window(
                window,
                fs=50,
                axes=['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']
        )
        label = window['activity_label'].mode()[0]
        X.append(results)
        Y.append(label)

Segmenting:  99%|█████████▊| 71/72 [00:21<00:00,  3.34it/s]


In [13]:
X_df = pd.DataFrame(X)
X_df

Unnamed: 0,acc_x_dom_freq,acc_x_entropy,acc_x_energy,acc_x_centroid,acc_x_bandwidth,acc_x_flatness,acc_x_slope,acc_x_rolloff,acc_x_band_ratio,acc_y_dom_freq,...,gyr_y_range,gyr_y_autocorr_lag1,gyr_y_autocorr_lag5,gyr_z_zero_crossings,gyr_z_mean_crossings,gyr_z_num_peaks,gyr_z_range,gyr_z_autocorr_lag1,gyr_z_autocorr_lag5,sma
0,3.3,0.715926,52948.063488,5.311276,3.981928,0.138254,-0.977033,9.9,5.854645,1.6,...,4.126660,0.936481,0.136693,43,43,49,6.477590,0.956728,0.305866,2.637439
1,3.5,0.588386,52555.151968,4.574068,2.805937,0.080343,-0.826095,6.8,18.146014,1.8,...,4.005430,0.927641,0.041774,37,39,41,6.451955,0.958108,0.311447,2.589056
2,3.3,0.656491,52690.550445,4.624613,3.428784,0.131551,-0.777125,6.9,14.700711,1.7,...,4.062843,0.927724,0.100439,39,41,41,6.325915,0.955537,0.285241,2.574295
3,3.4,0.684678,54617.639974,5.287656,4.279494,0.188019,-0.647978,8.8,6.811432,1.9,...,4.626005,0.939725,0.177693,37,37,37,9.329717,0.959250,0.296961,2.996269
4,3.0,0.503886,52631.762008,4.171667,2.653944,0.059164,-0.945920,6.0,23.455669,4.5,...,4.526140,0.929380,0.037207,44,44,44,9.329717,0.953096,0.277598,2.591898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3164,3.4,0.501046,51513.340738,4.531235,3.033318,0.085431,-0.788842,6.7,9.885792,6.7,...,3.695947,0.811790,-0.051389,70,67,56,4.414525,0.900158,-0.008161,1.782481
3165,3.3,0.580947,49383.512812,4.081853,2.858195,0.091712,-0.962691,5.7,18.146395,0.3,...,4.838297,0.884328,0.142806,57,59,46,4.682080,0.907147,-0.012444,1.922017
3166,3.3,0.554928,47442.570667,3.754458,2.657276,0.072481,-0.982696,4.0,18.783516,1.6,...,4.491955,0.912068,0.202670,48,48,42,4.682080,0.915524,-0.012925,1.731141
3167,3.3,0.477110,47579.849495,3.888039,2.976070,0.061780,-0.779773,4.4,14.165261,0.1,...,4.231875,0.880443,0.087692,49,49,42,4.571535,0.926252,0.055473,1.787290


In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_df, Y,             # data frame razem z cechami i etykietami
    test_size=0.2,       # 20% walidacja
    random_state=42,     # dla powtarzalności
    stratify=Y           # zachowanie proporcji klas
)

# utworzenie i trenowanie modelu
model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train, Y_train)
Y_pred = model.predict(X_val)
# Obliczenie dokładności
accuracy = accuracy_score(Y_val, Y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Wyświetlenie macierzy konfuzji
conf_matrix = confusion_matrix(Y_val, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Wyświetlenie szczegółowego raportu klasyfikacji
class_report = classification_report(Y_val, Y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.77
Confusion Matrix:
[[  5   0  10  13  63]
 [  1 119   5  16   3]
 [  0   0 131   2   0]
 [  0   0  26 105   0]
 [  0   0   1   3 131]]
Classification Report:
              precision    recall  f1-score   support

  climbingup       0.83      0.05      0.10        91
     running       1.00      0.83      0.90       144
     sitting       0.76      0.98      0.86       133
    standing       0.76      0.80      0.78       131
     walking       0.66      0.97      0.79       135

    accuracy                           0.77       634
   macro avg       0.80      0.73      0.69       634
weighted avg       0.80      0.77      0.73       634



In [33]:
def evaluate_feature_importance_cv(X_df, Y, step=5, min_features=5, cv=5):
    """
    Testuje wpływ cech na klasyfikację za pomocą RFE i walidacji krzyżowej.
    
    Argumenty:
    - X_df: DataFrame z cechami
    - Y: etykiety
    - step: liczba cech usuwanych w jednej iteracji RFE
    - min_features: minimalna liczba cech do rozważenia
    - cv: liczba foldów walidacji krzyżowej
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_df)
    model = SVC(kernel='linear', C=1.0, gamma='scale')

    results = []
    feature_names = np.array(X_df.columns)

    print("Testowanie różnych podzbiorów cech z użyciem cross-validation...\n")

    for n_features in tqdm(range(X_scaled.shape[1], min_features - 1, -step)):
        selector = RFE(estimator=model, n_features_to_select=n_features, step=step)
        X_selected = selector.fit_transform(X_scaled, Y)

        scores = cross_val_score(model, X_selected, Y, cv=StratifiedKFold(n_splits=cv), scoring='accuracy')
        mean_acc = scores.mean()

        selected_features = feature_names[selector.support_].tolist()

        results.append({
            'num_features': n_features,
            'accuracy': mean_acc,
            'selected_features': selected_features
        })

    results_sorted = sorted(results, key=lambda x: x['accuracy'], reverse=True)

    print("\nTop wyniki:")
    for res in results_sorted[:3]:
        print(f"\nCech: {res['num_features']}, Accuracy: {res['accuracy']:.4f}")
        print("Wybrane cechy:", res['selected_features'])

    return results_sorted

In [34]:
results = evaluate_feature_importance_cv(X_df, Y, step=3, min_features=30, cv=5)

Testowanie różnych podzbiorów cech z użyciem cross-validation...



100%|██████████| 61/61 [06:16<00:00,  6.18s/it]


Top wyniki:

Cech: 49, Accuracy: 0.8637
Wybrane cechy: ['acc_x_energy', 'acc_x_centroid', 'acc_x_bandwidth', 'acc_x_flatness', 'acc_y_energy', 'acc_z_energy', 'acc_z_centroid', 'acc_z_bandwidth', 'acc_z_flatness', 'acc_z_rolloff', 'gyr_x_energy', 'gyr_z_energy', 'gyr_z_bandwidth', 'gyr_z_flatness', 'binned_acc_y_bin0', 'abs_acc_x', 'var_acc_x', 'abs_acc_y', 'var_acc_y', 'abs_gyr_x', 'var_gyr_x', 'abs_gyr_z', 'var_gyr_z', 'acc_x_abs_sum', 'acc_x_jerk_mean', 'acc_x_jerk_std', 'acc_x_jerk_max', 'acc_y_abs_sum', 'acc_y_jerk_mean', 'acc_y_jerk_std', 'acc_y_jerk_max', 'acc_z_mean', 'acc_z_rms', 'acc_z_jerk_mean', 'acc_z_jerk_std', 'gyr_x_abs_sum', 'gyr_y_jerk_std', 'gyr_z_abs_sum', 'gyr_z_jerk_mean', 'gyr_z_jerk_std', 'gyr_z_jerk_max', 'gyr_x_zero_crossings', 'gyr_x_num_peaks', 'gyr_y_zero_crossings', 'gyr_y_num_peaks', 'gyr_y_autocorr_lag1', 'gyr_z_zero_crossings', 'gyr_z_num_peaks', 'sma']

Cech: 40, Accuracy: 0.8621
Wybrane cechy: ['acc_x_energy', 'acc_x_bandwidth', 'acc_x_flatness', 'ac


