# SVC

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy import sparse

import time


In [2]:
def load_data(file_path):
    """Read the Parquet file at ``file_path`` and return it as a DataFrame."""
    return pd.read_parquet(file_path)

In [3]:
# Optional decoding of the demographic codes (currently disabled):
#SEX_MAP = {0: 'M', 1: 'F'}
#AGE_MAP = {0: '18-29', 1: '30-37', 2: '38-52', 3: '53+'}

#df["age_group"] = df["age_group"].map(AGE_MAP)
#df["sex"] = df["sex"].map(SEX_MAP)

# Load the participants table, discard the demographic columns and replace
# every missing value with 0 in a single chained expression.
df = (
    load_data('Participants_all.parquet')
    .drop(columns=["age_group", "sex"])
    .fillna(0)
)


In [4]:
# Preview the first rows of the cleaned table.
df.head()

Unnamed: 0,pid,window_start,window_end,n_samples,duration_seconds,label:Walmsley2020,label:Walmsley2020_enc,label:WillettsSpecific2018,label:WillettsSpecific2018_enc,label:WillettsMET2018,...,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,P001,2016-11-13 02:18:00,2016-11-13 02:18:05,500,4.99,sleep,3,sleep,5,sleep,...,0.432345,0.940507,0.969787,-0.14848,-0.077644,0.275487,0.0,0.0,0.566406,0.824126
1,P001,2016-11-13 02:18:05,2016-11-13 02:18:10,500,4.99,sleep,3,sleep,5,sleep,...,0.432585,0.942055,0.970582,-0.108382,-0.028882,0.137541,0.0,0.0,0.566706,0.82392
2,P001,2016-11-13 02:18:10,2016-11-13 02:18:15,500,4.99,sleep,3,sleep,5,sleep,...,0.432665,0.942753,0.970944,-0.260468,-0.079268,0.204062,0.0,0.0,0.567005,0.823714
3,P001,2016-11-13 02:18:15,2016-11-13 02:18:20,500,4.99,sleep,3,sleep,5,sleep,...,0.432626,0.941857,0.970483,-0.243211,-0.092415,0.223157,0.0,0.0,0.567305,0.823508
4,P001,2016-11-13 02:18:20,2016-11-13 02:18:25,500,4.99,sleep,3,sleep,5,sleep,...,0.433269,0.941597,0.97035,-0.225457,-0.07925,0.230302,0.0,0.0,0.567604,0.823302


In [5]:
# Name of the column used as the classification target.
target = "label:WillettsMET2018_enc"

In [6]:
def target_selection(df, target):
    """Return the column of ``df`` named ``target`` (the label vector)."""
    return df[target]

In [7]:
# Extract the label vector and show how many rows fall in each class.
y = target_selection(df, target)
y.value_counts()

label:WillettsMET2018_enc
5     680172
3     534573
2     297630
9     111731
10     80563
8      69478
4      37521
0      17933
6       7251
1       3714
7       2985
Name: count, dtype: int64

In [8]:
def feature_dropper(df, features_to_drop, target):
    """Build the feature matrix by removing unwanted columns from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    features_to_drop : list of str
        Column names to remove.
    target : str
        Name of the target column, removed as well.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the listed columns and without ``target``.
    """
    columns_to_remove = features_to_drop + [target]
    return df.drop(columns=columns_to_remove)

In [9]:
# Identifier, window-metadata and label columns that must not be used as
# model inputs.
features_to_drop = [
    'pid',
    'window_start',
    'window_end',
    'n_samples',
    'duration_seconds',
    'label:Walmsley2020',
    'label:Walmsley2020_enc',
    'label:WillettsSpecific2018',
    'label:WillettsSpecific2018_enc',
    'label:WillettsMET2018',
]

In [10]:
# Feature matrix: everything except the dropped columns and the target.
X = feature_dropper(df, features_to_drop=features_to_drop, target=target)

In [11]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,...,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
0,-0.468161,0.004565,-0.482334,-0.46669,-0.537512,0.006892,-0.548902,-0.533341,0.657518,0.00396,...,0.432345,0.940507,0.969787,-0.14848,-0.077644,0.275487,0.0,0.0,0.566406,0.824126
1,-0.470069,0.006437,-0.482334,-0.46669,-0.537045,0.006771,-0.548902,-0.51778,0.657702,0.003627,...,0.432585,0.942055,0.970582,-0.108382,-0.028882,0.137541,0.0,0.0,0.566706,0.82392
2,-0.469694,0.006162,-0.482334,-0.46669,-0.537947,0.007104,-0.548902,-0.533341,0.657764,0.003369,...,0.432665,0.942753,0.970944,-0.260468,-0.079268,0.204062,0.0,0.0,0.567005,0.823714
3,-0.469287,0.005821,-0.482334,-0.46669,-0.537512,0.006962,-0.548902,-0.51778,0.657733,0.003567,...,0.432626,0.941857,0.970483,-0.243211,-0.092415,0.223157,0.0,0.0,0.567305,0.823508
4,-0.47082,0.006896,-0.482334,-0.46669,-0.535333,0.005291,-0.548902,-0.51778,0.658226,0.002743,...,0.433269,0.941597,0.97035,-0.225457,-0.07925,0.230302,0.0,0.0,0.567604,0.823302


In [12]:
# (rows, columns) of the feature matrix.
X.shape

(1843551, 24)

In [13]:
# Preview the last rows of the feature matrix.
X.tail()

Unnamed: 0,x_mean,x_std,x_min,x_max,y_mean,y_std,y_min,y_max,z_mean,z_std,...,energy_z,energy_total,magnitude_mean,corr_xy,corr_xz,corr_yz,fft_dom_freq,fft_peak_power,hour_sin,hour_cos
1843546,0.29197,0.002057,0.276779,0.292248,-0.470383,0.007831,-0.478936,-0.463214,0.833149,0.007519,...,0.694194,1.000766,1.000355,-0.003141,-0.028912,-0.063735,0.0,0.0,0.996714,-0.080996
1843547,0.292124,0.001378,0.276779,0.292248,-0.470383,0.007831,-0.478936,-0.463214,0.834082,0.007094,...,0.695744,1.002404,1.001177,-0.082219,0.04808,0.02558,0.0,0.0,0.996685,-0.081359
1843548,0.292001,0.001941,0.276779,0.292248,-0.470509,0.007841,-0.478936,-0.463214,0.833825,0.007226,...,0.695316,1.002025,1.000987,0.009205,0.02698,-0.008575,0.0,0.0,0.996655,-0.081721
1843549,0.292155,0.001195,0.276779,0.292248,-0.470824,0.007857,-0.478936,-0.463214,0.833117,0.007532,...,0.69414,1.001233,1.000589,-0.075245,0.056886,-0.029143,0.0,0.0,0.996625,-0.082083
1843550,0.292093,0.001824,0.276779,0.307717,-0.470729,0.007853,-0.478936,-0.463214,0.833406,0.007416,...,0.694621,1.001591,1.000769,-0.047205,0.054113,-0.001164,0.0,0.0,0.996596,-0.082446


### Train/Test Split

In [14]:
def split(X, y, method="Holdout"):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Labels aligned with ``X``; also used to stratify the split.
    method : str, default "Holdout"
        Splitting strategy. Only ``"Holdout"`` (stratified 70/30 random
        split with ``random_state=42``) is implemented.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported strategy.
    """
    if method != "Holdout":
        # Previously an unknown method fell through to the return statement
        # and failed with an opaque UnboundLocalError.
        # TODO: implement participant-based splitting ("Participant").
        raise ValueError(
            f"Unsupported split method: {method!r}. Supported methods: 'Holdout'."
        )

    # NOTE(review): a row-level random split can place windows of the same
    # participant in both partitions — confirm this is acceptable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )

    return X_train, X_test, y_train, y_test

In [15]:
# Default ("Holdout") split of the feature matrix and labels.
X_train, X_test, y_train, y_test = split(X, y)

### Preprocessing Pipeline

In [16]:
def preprocessing_pipeline(X_train):
    """Fit the preprocessing transformer on the training features.

    Only the numeric columns of ``X_train`` are handed to the transformer,
    where they are scaled with ``StandardScaler(with_mean=False)``. The
    categorical (one-hot) branch is currently disabled.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. The frame is only read, never modified.

    Returns
    -------
    preprocessor : ColumnTransformer
        Transformer fitted on ``X_train``.
    feature_names : list of str
        Output column names, in the order produced by ``preprocessor``.
    """
    # select_dtypes() and fit() only read the frame, so the former defensive
    # X_train.copy() (a full duplicate of the training set) is not needed.
    numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
    #categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        # with_mean=False was chosen so the output could stay sparse once the
        # one-hot branch is enabled; that branch is disabled for now.
        ('scaler', StandardScaler(with_mean=False))
    ])

    #categorical_transformer = Pipeline(steps=[
    #    ('onehot', OneHotEncoder(
    #        handle_unknown='ignore',
    #        sparse_output=True
    #    ))
    #])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)
    #       ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Fit on the training data only.
    preprocessor.fit(X_train)

    # Numeric output columns keep their original names.
    num_cols_out = numeric_features

    # Names of the one-hot dummies (none while the categorical branch is off).
    #if categorical_features:
    #    ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
    #    cat_cols_out = ohe.get_feature_names_out(categorical_features).tolist()
    #else:
    cat_cols_out = []

    feature_names = num_cols_out + cat_cols_out

    return preprocessor, feature_names

In [17]:
# Fit the preprocessor on the training partition only.
preprocessor, feature_names = preprocessing_pipeline(X_train)

In [18]:
def apply_preprocessor(preprocessor, X, feature_names, as_dataframe=False):
    """Transform ``X`` with an already fitted ``preprocessor``.

    Parameters
    ----------
    preprocessor : object
        Fitted transformer exposing ``transform``.
    X : pandas.DataFrame
        Data to transform; its index is reused when a DataFrame is returned.
    feature_names : list of str
        Column names for the DataFrame output.
    as_dataframe : bool, default False
        When False, return the raw transform output (sparse matrix or
        ndarray), which is the lighter option. When True, densify if needed
        and wrap in a DataFrame — careful with ~1M rows.

    Returns
    -------
    Sparse matrix, ndarray or pandas.DataFrame, depending on ``as_dataframe``.
    """
    transformed = preprocessor.transform(X)

    if as_dataframe:
        # A sparse result has to be densified before building the DataFrame.
        dense = transformed.toarray() if sparse.issparse(transformed) else transformed
        return pd.DataFrame(dense, columns=feature_names, index=X.index)

    return transformed

In [19]:
# Transform both partitions with the preprocessor fitted on the training data;
# the raw transform output is kept (no DataFrame conversion).
X_train_pre = apply_preprocessor(preprocessor, X_train, feature_names, as_dataframe=False)
X_test_pre  = apply_preprocessor(preprocessor, X_test, feature_names, as_dataframe=False)

### Feature Selection

In [20]:
def feature_selection(X_train_pre, y_train, feature_names=None,
                      threshold="median", C=0.5):
    """Select features with an L1-penalised LogisticRegression + SelectFromModel.

    Parameters
    ----------
    X_train_pre : sparse matrix, ndarray or pandas.DataFrame
        Already preprocessed training matrix.
    y_train : array-like
        Training target.
    feature_names : list of str, optional
        Column names in the same order as ``X_train_pre``. When None and
        ``X_train_pre`` has a ``columns`` attribute, those names are used.
    threshold : str or float, default "median"
        Passed to ``SelectFromModel`` ("median", "mean" or a number).
    C : float, default 0.5
        Regularisation strength of the logistic regression
        (smaller C -> more sparsity).

    Returns
    -------
    selector : SelectFromModel
        The fitted selector.
    selected_features : list of str or None
        Names of the retained features, or None when no names are available.
    """
    # NOTE(review): recent scikit-learn releases appear to deprecate the
    # liblinear solver for multiclass targets — confirm against the installed
    # version before re-enabling this step.
    selector = SelectFromModel(
        estimator=LogisticRegression(
            penalty="l1",
            solver="liblinear",
            C=C,
            max_iter=1000
        ),
        threshold=threshold,
        prefit=False
    )

    start = time.time()
    print("Starting feature selection...")
    selector.fit(X_train_pre, y_train)
    print("Finished in", time.time() - start, "seconds")

    support_mask = selector.get_support()

    # Resolve the feature names: explicit argument first, DataFrame columns next.
    names = feature_names
    if names is None and hasattr(X_train_pre, "columns"):
        names = X_train_pre.columns.tolist()

    if names is None:
        # Without names only the selector itself can be returned.
        return selector, None

    return selector, np.array(names)[support_mask].tolist()


In [21]:
#print("Running selector...") 
#selector, selected_features = feature_selection(
#    X_train_pre, y_train,
#    feature_names=feature_names,
#    threshold="median",
#    C=0.5
#)

#print("Selector finished")
#X_train_sel = selector.transform(X_train_pre)
#X_test_sel  = selector.transform(X_test_pre)

### Model Structure

In [22]:
def sampling(X_train_pre, y_train, n_samples=5000, random_state=None):
    """Randomly sample rows (without replacement) from preprocessed training data.

    Works with numpy arrays, pandas DataFrames/Series and scipy sparse matrices.

    Parameters
    ----------
    X_train_pre : ndarray, pandas.DataFrame or sparse matrix
        Preprocessed training features.
    y_train : array-like or pandas.Series
        Labels aligned positionally with ``X_train_pre``.
    n_samples : int, default 5000
        Number of rows to draw; capped at the number of available rows.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed or generator that makes the draw reproducible. ``None`` keeps the
        previous behaviour and draws from numpy's global random state.

    Returns
    -------
    tuple
        ``(X_train_sampled, y_train_sampled)`` holding the same sampled rows.

    Raises
    ------
    ValueError
        If ``X_train_pre`` and ``y_train`` do not have the same number of rows.
    """
    n_total = X_train_pre.shape[0]

    if len(y_train) != n_total:
        raise ValueError(
            f"X and y have different numbers of rows: {n_total} vs {len(y_train)}"
        )

    n_samples = min(n_samples, n_total)

    # Choose the row positions to keep.
    if random_state is None:
        # Global numpy state: honours an earlier np.random.seed(...) call.
        sampled_indices = np.random.choice(n_total, n_samples, replace=False)
    else:
        rng = (
            random_state
            if hasattr(random_state, "choice")
            else np.random.RandomState(random_state)
        )
        sampled_indices = rng.choice(n_total, n_samples, replace=False)

    # Extract the feature rows; pandas objects need positional .iloc indexing.
    if sparse.issparse(X_train_pre):
        X_train_sampled = X_train_pre[sampled_indices]
    elif hasattr(X_train_pre, "iloc"):
        X_train_sampled = X_train_pre.iloc[sampled_indices]
    else:
        X_train_sampled = X_train_pre[sampled_indices]

    # Extract the matching labels.
    if hasattr(y_train, "iloc"):
        y_train_sampled = y_train.iloc[sampled_indices]
    else:
        y_train_sampled = y_train[sampled_indices]

    print(f"Sampled: {n_samples:,} rows (from {n_total:,})")
    print("Sampled X shape:", X_train_sampled.shape)

    return X_train_sampled, y_train_sampled

In [23]:
# Draw a 300k-row subsample of the training set.
# NOTE(review): no seed is set in the visible cells before this call, so the
# subsample presumably differs between runs — confirm.
X_train_sampled, y_train_sampled = sampling(X_train_pre = X_train_pre, y_train = y_train, n_samples=300000)

Sampled: 300,000 rows (from 1,290,485)
Sampled X shape: (300000, 24)


In [24]:
def training_svc(X_train_sampled, y_train, X_test_pre, y_test,
                 C=1.0, kernel='rbf', gamma='scale'):
    """Train a class-weighted SVC and print its evaluation on the test set.

    Parameters
    ----------
    X_train_sampled : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training labels aligned with ``X_train_sampled``.
    X_test_pre : array-like or sparse matrix
        Test features, preprocessed the same way as the training features.
    y_test : array-like
        True test labels.
    C, kernel, gamma
        Forwarded to ``SVC``.

    Returns
    -------
    SVC
        The fitted classifier.
    """
    svc = SVC(C=C, kernel=kernel, gamma=gamma, class_weight="balanced")

    print("Training SVC model...")
    # Fit on the labels passed in as ``y_train``. The previous version read the
    # notebook-global ``y_train_sampled`` here, silently ignoring the argument.
    svc.fit(X_train_sampled, y_train)
    print("Done!")

    y_pred = svc.predict(X_test_pre)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return svc

In [None]:
# Train on the sampled training rows and evaluate on the full test partition.
svc = training_svc(X_train_sampled, y_train_sampled, X_test_pre, y_test)

Training SVC model...
Done!

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.80      0.54      5380
           1       0.05      0.51      0.09      1114
           2       0.51      0.22      0.31     89289
           3       0.78      0.39      0.52    160372
           4       0.14      0.66      0.23     11256
           5       0.98      0.96      0.97    204052
           6       0.14      0.67      0.24      2175
           7       0.01      0.58      0.03       896
           8       0.24      0.40      0.30     20844
           9       0.63      0.40      0.49     33519
          10       0.20      0.44      0.28     24169

    accuracy                           0.59    553066
   macro avg       0.37      0.55      0.36    553066
weighted avg       0.73      0.59      0.62    553066


Confusion Matrix:
[[  4293     75    148    100     44      7     88     28    195     77
     325]
 [    32    567     49     55     