In [1]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    classification_report, 
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

In [2]:
# Set the known parent package
parent_dir = str(Path().resolve().parents[0])
sys.path.insert(0, parent_dir)

from data_loader import (
    data_loader,
    binned_distr,
    dev_mad_var,
    features_accelerometer,
    features_cosine,
    features_freq,
    features_temporal,
    vector_magnitude,
    peak_features,
)


In [3]:
data_processor = data_loader.TimeWindowSegmenter(
    df_path="../data_loader/real_world_2016.parquet",
    window_size=10,
    step_size=4,
    source_sampling_rate=50,
    time_column="timestamp",
    id_column='person_id',
    activity_column='activity_label',
    clean_columns=False,
    fix_timestamps=False,
    acc_columns=('acc_x', 'acc_y', 'acc_z'),
    gyr_columns=('gyr_x', 'gyr_y', 'gyr_z')
)
data_processor.resample_to(25)

Resampling: 100%|██████████| 71/71 [00:03<00:00, 21.74it/s]


In [6]:
def extract_features_from_window(window, fs=20, axes = ['ac_x', 'ac_y', 'ac_z', 'g_x', 'g_y', 'g_z']):
    features = []
    feature_names = []
    data_freq_dict = {}
    
    freq_funcs = [
        ('dom_freq', features_freq.dominant_frequency, [fs]),    
        ('entropy', features_freq.spectral_entropy, [fs]),       
        ('energy', features_freq.spectral_energy, []),           
        ('centroid', features_freq.spectral_centroid, [fs]),     
        ('bandwidth', features_freq.spectral_bandwidth, [fs]),   
        ('flatness', features_freq.spectral_flatness, [fs]),     
        ('slope', features_freq.spectral_slope, [fs]),           
        ('rolloff', features_freq.spectral_rolloff, [fs]),       
        ('band_ratio', features_freq.band_energy_ratio, [fs])    
    ]
    
    for axis in axes:
        signal = window[axis].astype(float).values
        for fname, func, extra_args in freq_funcs:
            data_freq_dict[f"{axis}_{fname}"] = func(signal, *extra_args)
            # features.append(func(signal, *extra_args))
            # feature_names.append(f"{axis}_{fname}")

    # rozkład wartości względnie dla okna
    data_binned_all_dict = binned_distr.calculate_binned_distribution_multi_axis(
        window=window, 
        bins=10,
        axes=axes
    )

    data_binned_sep_dict = {f"{key}_bin{bin_id}":data for key, items in data_binned_all_dict.items() for bin_id, data in enumerate(items)}

    dev_mad_var_dict = dev_mad_var.calculate_statistics_multi_axis(
        window=window,
        axes=axes
    )
    
    acc_features_dict = features_accelerometer.extract_acc_features(
        window=window,
        axes=axes
    )

    cosine_features_dict = features_cosine.extract_cosine_distances(
        window=window,
        axes=axes
    )

    temporal_features_dict = features_temporal.extract_temporal_features(
        window=window,
        axes=axes[3:]
    )

    vector_magnitude_dict = {}
    vector_magnitude_dict["vector_acc_mag"] = vector_magnitude.calculate_accelerometer_magnitude(
        window=window,
        axes=axes[:3]
    )
    vector_magnitude_dict["vector_gyr_mag"] = vector_magnitude.calculate_gyroscope_magnitude(
        window=window,
        axes=axes[3:]
    )

    peak_features_dict = peak_features.extract_peak_features(
        window_df=window,
        sampling_rate=fs,
        axes=axes
    )

    
    return {**data_freq_dict, **data_binned_sep_dict, **dev_mad_var_dict, **acc_features_dict, **cosine_features_dict, **temporal_features_dict, **vector_magnitude_dict, **peak_features_dict}

In [10]:
X = []
Y =[]
feature_names = None
for window in data_processor.segment():
        label = window['activity_label'].mode()[0]
        if label == "climbingup":
                continue
        results = extract_features_from_window(
                window,
                fs=50,
                axes=['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']
        )
        
        X.append(results)
        Y.append(label)

Segmenting:  99%|█████████▊| 71/72 [00:34<00:00,  2.05it/s]


In [11]:
X_df = pd.DataFrame(X)
X_df

Unnamed: 0,acc_x_dom_freq,acc_x_entropy,acc_x_energy,acc_x_centroid,acc_x_bandwidth,acc_x_flatness,acc_x_slope,acc_x_rolloff,acc_x_band_ratio,acc_y_dom_freq,...,peak_count_acc_z,peak_avg_time_diff_gyr_x,peak_std_time_diff_gyr_x,peak_count_gyr_x,peak_avg_time_diff_gyr_y,peak_std_time_diff_gyr_y,peak_count_gyr_y,peak_avg_time_diff_gyr_z,peak_std_time_diff_gyr_z,peak_count_gyr_z
0,2.5,0.301576,14221.114947,2.996298,2.263453,0.037393,-0.593347,2.6,33.136844,5.0,...,85,0.085345,0.027210,117,0.154687,0.060853,65,0.161695,0.089313,60
1,2.5,0.298869,10621.561723,3.041746,2.468296,0.046304,-0.499171,2.5,45.799751,4.9,...,73,0.088036,0.028089,113,0.145970,0.055181,68,0.181111,0.108007,55
2,2.4,0.285303,10504.366442,3.064647,2.450723,0.030720,-0.553173,2.5,35.521586,4.8,...,67,0.090741,0.030963,109,0.141714,0.055317,71,0.189231,0.111938,53
3,2.4,0.307100,10653.840249,3.113361,2.593034,0.031651,-0.507251,2.5,33.738230,4.8,...,57,0.084444,0.030282,118,0.143768,0.055979,70,0.189804,0.110214,52
4,2.4,0.335691,12467.593422,3.054048,2.496021,0.042499,-0.750923,2.5,40.742747,4.7,...,62,0.081833,0.029552,121,0.145970,0.055397,68,0.205957,0.117919,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4748,3.3,0.562343,47774.734831,3.520897,2.827273,0.076215,-0.922476,3.9,19.308484,1.6,...,60,0.172364,0.237670,56,0.186275,0.226360,52,0.217209,0.253004,44
4749,3.3,0.580853,47585.537627,2.808030,2.950003,0.063462,-0.951745,3.5,23.797326,1.7,...,62,0.170690,0.233540,59,0.179630,0.221393,55,0.235610,0.255229,42
4750,3.4,0.393745,50074.012927,4.273206,2.927802,0.050638,-0.613791,5.1,11.163659,0.1,...,85,0.125333,0.076548,76,0.145000,0.061135,65,0.205000,0.080128,45
4751,3.4,0.567407,49134.988011,4.759545,3.359068,0.115590,-0.755875,8.2,7.585431,0.2,...,101,0.100000,0.103055,99,0.127273,0.114774,78,0.175636,0.139906,56


In [12]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_df, Y,             # data frame razem z cechami i etykietami
    test_size=0.2,       # 20% walidacja
    random_state=42,     # dla powtarzalności
    stratify=Y           # zachowanie proporcji klas
)

# utworzenie i trenowanie modelu
model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train, Y_train)
Y_pred = model.predict(X_val)
# Obliczenie dokładności
accuracy = accuracy_score(Y_val, Y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Wyświetlenie macierzy konfuzji
conf_matrix = confusion_matrix(Y_val, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Wyświetlenie szczegółowego raportu klasyfikacji
class_report = classification_report(Y_val, Y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.90
Confusion Matrix:
[[212  12  22   5]
 [  0 228   5   0]
 [  0  34 195   2]
 [  0   7   7 222]]
Classification Report:
              precision    recall  f1-score   support

     running       1.00      0.84      0.92       251
     sitting       0.81      0.98      0.89       233
    standing       0.85      0.84      0.85       231
     walking       0.97      0.94      0.95       236

    accuracy                           0.90       951
   macro avg       0.91      0.90      0.90       951
weighted avg       0.91      0.90      0.90       951



In [14]:
import joblib

# Zapis modelu do pliku
joblib.dump(model, 'svc_model.pkl')

['svc_model.pkl']

In [13]:
def evaluate_feature_importance_cv(X_df, Y, step=5, min_features=5, cv=5):
    """
    Testuje wpływ cech na klasyfikację za pomocą RFE i walidacji krzyżowej.
    
    Argumenty:
    - X_df: DataFrame z cechami
    - Y: etykiety
    - step: liczba cech usuwanych w jednej iteracji RFE
    - min_features: minimalna liczba cech do rozważenia
    - cv: liczba foldów walidacji krzyżowej
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_df)
    model = SVC(kernel='linear', C=1.0, gamma='scale')

    results = []
    feature_names = np.array(X_df.columns)

    print("Testowanie różnych podzbiorów cech z użyciem cross-validation...\n")

    for n_features in tqdm(range(X_scaled.shape[1], min_features - 1, -step)):
        selector = RFE(estimator=model, n_features_to_select=n_features, step=step)
        X_selected = selector.fit_transform(X_scaled, Y)

        scores = cross_val_score(model, X_selected, Y, cv=StratifiedKFold(n_splits=cv), scoring='accuracy')
        mean_acc = scores.mean()

        selected_features = feature_names[selector.support_].tolist()

        results.append({
            'num_features': n_features,
            'accuracy': mean_acc,
            'selected_features': selected_features
        })

    results_sorted = sorted(results, key=lambda x: x['accuracy'], reverse=True)

    print("\nTop wyniki:")
    for res in results_sorted[:3]:
        print(f"\nCech: {res['num_features']}, Accuracy: {res['accuracy']:.4f}")
        print("Wybrane cechy:", res['selected_features'])

    return results_sorted, results

In [34]:
results = evaluate_feature_importance_cv(X_df, Y, step=3, min_features=30, cv=5)

Testowanie różnych podzbiorów cech z użyciem cross-validation...



100%|██████████| 61/61 [06:16<00:00,  6.18s/it]


Top wyniki:

Cech: 49, Accuracy: 0.8637
Wybrane cechy: ['acc_x_energy', 'acc_x_centroid', 'acc_x_bandwidth', 'acc_x_flatness', 'acc_y_energy', 'acc_z_energy', 'acc_z_centroid', 'acc_z_bandwidth', 'acc_z_flatness', 'acc_z_rolloff', 'gyr_x_energy', 'gyr_z_energy', 'gyr_z_bandwidth', 'gyr_z_flatness', 'binned_acc_y_bin0', 'abs_acc_x', 'var_acc_x', 'abs_acc_y', 'var_acc_y', 'abs_gyr_x', 'var_gyr_x', 'abs_gyr_z', 'var_gyr_z', 'acc_x_abs_sum', 'acc_x_jerk_mean', 'acc_x_jerk_std', 'acc_x_jerk_max', 'acc_y_abs_sum', 'acc_y_jerk_mean', 'acc_y_jerk_std', 'acc_y_jerk_max', 'acc_z_mean', 'acc_z_rms', 'acc_z_jerk_mean', 'acc_z_jerk_std', 'gyr_x_abs_sum', 'gyr_y_jerk_std', 'gyr_z_abs_sum', 'gyr_z_jerk_mean', 'gyr_z_jerk_std', 'gyr_z_jerk_max', 'gyr_x_zero_crossings', 'gyr_x_num_peaks', 'gyr_y_zero_crossings', 'gyr_y_num_peaks', 'gyr_y_autocorr_lag1', 'gyr_z_zero_crossings', 'gyr_z_num_peaks', 'sma']

Cech: 40, Accuracy: 0.8621
Wybrane cechy: ['acc_x_energy', 'acc_x_bandwidth', 'acc_x_flatness', 'ac




In [39]:
from collections import Counter
def get_top_features(results, top_n=5, top_k_features=20):
    sorted_results = sorted(results, key=lambda x: x['accuracy'], reverse=True)

    all_features = []
    for entry in sorted_results[:top_n]:
        all_features.extend(entry['selected_features'])

    feature_counts = Counter(all_features)

    most_common = feature_counts.most_common(top_k_features)

    return most_common



top_features = get_top_features(results, top_n=20, top_k_features=50)
print("Najczęściej występujące cechy w top wynikach:")
for feature, count in top_features:
    print(f"{feature}: {count} razy")

Najczęściej występujące cechy w top wynikach:
acc_x_energy: 20 razy
acc_x_bandwidth: 20 razy
acc_x_flatness: 20 razy
acc_y_energy: 20 razy
acc_z_energy: 20 razy
gyr_z_energy: 20 razy
gyr_z_bandwidth: 20 razy
gyr_z_flatness: 20 razy
var_acc_y: 20 razy
abs_gyr_x: 20 razy
var_gyr_x: 20 razy
var_gyr_z: 20 razy
acc_x_abs_sum: 20 razy
acc_x_jerk_mean: 20 razy
acc_x_jerk_std: 20 razy
acc_x_jerk_max: 20 razy
acc_y_abs_sum: 20 razy
acc_y_jerk_mean: 20 razy
acc_y_jerk_std: 20 razy
acc_z_mean: 20 razy
acc_z_rms: 20 razy
acc_z_jerk_mean: 20 razy
acc_z_jerk_std: 20 razy
gyr_x_abs_sum: 20 razy
gyr_y_jerk_std: 20 razy
gyr_z_abs_sum: 20 razy
gyr_z_jerk_mean: 20 razy
gyr_z_jerk_std: 20 razy
gyr_x_num_peaks: 20 razy
gyr_y_zero_crossings: 20 razy
sma: 20 razy
acc_z_centroid: 19 razy
abs_acc_y: 19 razy
abs_gyr_z: 19 razy
acc_z_flatness: 18 razy
gyr_z_jerk_max: 18 razy
gyr_y_autocorr_lag1: 18 razy
acc_z_rolloff: 17 razy
gyr_y_num_peaks: 17 razy
gyr_z_zero_crossings: 17 razy
acc_z_bandwidth: 16 razy
var_acc