In [16]:
from sklearn.model_selection import train_test_split
import random
import numpy as np
import pandas as pd
import openml
from generating_dataset import generate_dataset
from preprocessing import split, check_feature_sample_ratio, delete_corr_columns, fill_na, assign_target_column, add_permuted_columns, drop_categorical

In [17]:
"""
Those are helper functions and not final tests
"""
from CCD_implementation import logisitic_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


def fit_our_model(X_train, y_train, X_test, y_test):  
    model_own = logisitic_regression()
    # model_own.fit(X_train, y_train,a=1.0, weights=False, user_lambda=None)
    model_own.fit(X_train, y_train,a=1.0, weights=False, user_lambda=0)

    y_pred_own = model_own.predict(X_test)
    accuracy_own = accuracy_score(y_test, y_pred_own)
    f1_own = f1_score(y_test, y_pred_own)
    print(f"Test accuracy for our model lambda 0: {accuracy_own:.4f}, F1-score: {f1_own:.4f}")
    return y_pred_own

def fit_our_model_l1(X_train, y_train, X_test, y_test):  
    model_own = logisitic_regression()
    model_own.fit(X_train, y_train,a=1.0, weights=False, user_lambda=0.05)
    # model_own.fit(X_train, y_train,a=1.0, weights=False, user_lambda=[0])

    y_pred_own = model_own.predict(X_test)
    accuracy_own = accuracy_score(y_test, y_pred_own)
    f1_own = f1_score(y_test, y_pred_own)
    print(f"Test accuracy for our model l1: {accuracy_own:.4f}, F1-score: {f1_own:.4f}")
    return y_pred_own

def fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=100):
    model_sklearn = LogisticRegression(penalty='l1',solver='liblinear', C=1.0, max_iter=max_iter, random_state=42)
    model_sklearn.fit(X_train, y_train)
    y_pred_sklearn = model_sklearn.predict(X_test)
    accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
    f1_sklearn = f1_score(y_test, y_pred_sklearn)
    print(f"Test accuracy for sklearn model l1: {accuracy_sklearn:.4f}, F1-score: {f1_sklearn:.4f}")
    return y_pred_sklearn
    
def fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=100):
    model_sklearn = LogisticRegression(penalty=None, C=1.0, max_iter=max_iter, random_state=42)
    model_sklearn.fit(X_train, y_train)
    y_pred_sklearn = model_sklearn.predict(X_test)
    accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
    f1_sklearn = f1_score(y_test, y_pred_sklearn)
    print(f"Test accuracy for sklearn model no penalty: {accuracy_sklearn:.4f}, F1-score: {f1_sklearn:.4f}")
    return y_pred_sklearn

# Arythmia

In [18]:
col_names = [f"attr{i}" for i in range(1, 280)]
df_arythmia = pd.read_csv("./data/arythmia/arrhythmia.data", header=None, names=col_names, na_values="?")
print(df_arythmia.head())
df_arythmia = df_arythmia.reset_index()
df_arythmia.columns = df_arythmia.columns.astype(str)


    attr1  attr2  attr3  attr4  attr5  attr6  attr7  attr8  attr9  attr10  \
75      0    190     80     91    193    371    174    121    -16    13.0   
56      1    165     64     81    174    401    149     39     25    37.0   
54      0    172     95    138    163    386    185    102     96    34.0   
55      0    175     94    100    202    380    179    143     28    11.0   
75      0    190     80     88    181    360    177    103    -16    13.0   

    attr11  attr12  attr13  attr14  attr15  attr16  attr17  attr18  attr19  \
75    64.0    -2.0     NaN    63.0       0      52      44       0       0   
56   -17.0    31.0     NaN    53.0       0      48       0       0       0   
54    70.0    66.0    23.0    75.0       0      40      80       0       0   
55    -5.0    20.0     NaN    71.0       0      72      20       0       0   
75    61.0     3.0     NaN     NaN       0      48      40       0       0   

    attr20  attr21  attr22  attr23  attr24  attr25  attr26  attr27  

class 0 is normal, class 1 are different types of arythmia

In [19]:
assign_target_column(df_arythmia, "attr279", [i for i in range(2,16)])
fill_na(df_arythmia)
delete_corr_columns(df_arythmia)
check_feature_sample_ratio(df_arythmia)
X, y, X_train,X_valid, X_test, y_train, y_valid, y_test = split(df_arythmia)

Unique values of label column [ 8  6 10  1  7 14  3 16  2  4  5  9 15]
target
0    267
1    185
Name: count, dtype: int64
attr10 8 null values
attr11 22 null values
attr12 1 null values
attr13 376 null values
attr14 1 null values
deleted features: ['attr160', 'attr163', 'attr170', 'attr184', 'attr192', 'attr193', 'attr194', 'attr203', 'attr207', 'attr211', 'attr214', 'attr217', 'attr224', 'attr234', 'attr243', 'attr254', 'attr263', 'attr273']
Shape after deletion (452, 262)
The number of features is enough: True, number of features: 262


In [20]:
y_no_penalty = fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=100)
y_l1 = fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=100)
y_our = fit_our_model(X_train, y_train, X_test, y_test)
y_our_l1 = fit_our_model_l1(X_train, y_train, X_test, y_test)

Test accuracy for sklearn model no penalty: 0.6324, F1-score: 0.5763
Test accuracy for sklearn model l1: 0.6250, F1-score: 0.5641
Test accuracy for our model lambda 0: 0.7206, F1-score: 0.5957
Test accuracy for our model l1: 0.6691, F1-score: 0.4444


# Secom

In [21]:
df_data = pd.read_csv("./data/secom/secom.data", sep=' ', header=None, na_values="NaN")  
df_label = pd.read_csv("./data/secom/secom_labels.data",sep=' ', header=None, na_values="NaN")

df_label.columns = ["label", "timestamp"]
df_secom = pd.concat([df_data, df_label["label"]], axis=1)
print("Shape:", df_secom.shape)
df_secom.columns = df_secom.columns.astype(str)


Shape: (1567, 591)


In [22]:
assign_target_column(df_secom, "label", [1])
fill_na(df_secom)
delete_corr_columns(df_secom)
df_secom = add_permuted_columns(df_secom)
delete_corr_columns(df_secom)
check_feature_sample_ratio(df_secom)
X, y, X_train,X_valid, X_test, y_train, y_valid, y_test = split(df_secom)

Unique values of label column [-1  1]
target
0    1463
1     104
Name: count, dtype: int64
0 6 null values
1 7 null values
2 14 null values
3 14 null values
4 14 null values
5 14 null values
6 14 null values
7 9 null values
8 2 null values
9 2 null values
10 2 null values
11 2 null values
12 2 null values
13 3 null values
14 3 null values
15 3 null values
16 3 null values
17 3 null values
18 3 null values
19 10 null values
21 2 null values
22 2 null values
23 2 null values
24 2 null values
25 2 null values
26 2 null values
27 2 null values
28 2 null values
29 2 null values
30 2 null values
31 2 null values
32 1 null values
33 1 null values
34 1 null values
35 1 null values
36 1 null values
37 1 null values
38 1 null values
39 1 null values
40 24 null values
41 24 null values
42 1 null values
43 1 null values
44 1 null values
45 1 null values
46 1 null values
47 1 null values
48 1 null values
49 1 null values
50 1 null values
51 1 null values
52 1 null values
53 4 null values
54 4 null 

In [23]:
y_no_penalty = fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=2000)
y_l1 = fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=100)
y_our = fit_our_model(X_train, y_train, X_test, y_test)
y_our_l1 = fit_our_model_l1(X_train, y_train, X_test, y_test)

Test accuracy for sklearn model no penalty: 0.8493, F1-score: 0.1647
Test accuracy for sklearn model l1: 0.8684, F1-score: 0.1389
Test accuracy for our model lambda 0: 0.9342, F1-score: 0.0000
Test accuracy for our model l1: 0.9342, F1-score: 0.0000


# braidflow

In [24]:
pd.set_option('display.max_columns', None)

dataset = openml.datasets.get_dataset(45966)

df_braidflow, y, categorical, numeric = dataset.get_data(dataset_format='dataframe', target=dataset.default_target_attribute)

df_braidflow["label"] = y
print("Shape:", df_braidflow.shape)
print(df_braidflow.head(1))
df_braidflow["label"].value_counts()

Shape: (72, 124)
   UID  task_id  action_absorption  action_accord  action_fluidity  care  \
0    1        5                  3              7                6     3   

   challenge_match  clear_mindedness  control  demand  \
0                4                 5        6       4   

   factor_absorption_by_activity  factor_fluency_of_performance  \
0                            3.5                       6.333333   

   factor_perceived_fit_of_skill_and_task_demands  \
0                                             5.0   

   factor_subjective_value_of_activity  focus  forward_thinking  fss_score  \
0                             2.333333      7                 7        5.2   

   loss_of_time_perception  lost_in_thought  perceived_difficulty  relevance  \
0                        6                1                     4          3   

   self_evaluation  worry task_braid  task_braid_revolutions  \
0                7      1     VBRAID                       6   

   task_braid_strands  bra

label
neutral    37
fall       18
flow       17
Name: count, dtype: int64

In [25]:
assign_target_column(df_braidflow, "label", ["flow"])
df_braidflow = drop_categorical(df_braidflow)
fill_na(df_braidflow)
delete_corr_columns(df_braidflow)
check_feature_sample_ratio(df_braidflow)
X, y, X_train,X_valid, X_test, y_train, y_valid, y_test = split(df_braidflow)

Unique values of label column ['neutral' 'flow' 'fall']
target
0    55
1    17
Name: count, dtype: int64
Number of columns to keep 118, to delete 6
disk_acc_mean_lateral 6 null values
disk_acc_mean_medial 6 null values
disk_acc_mean_overall 6 null values
disk_acc_peaks_lateral 6 null values
disk_acc_peaks_medial 6 null values
disk_acc_peaks_overall 6 null values
disk_acc_std_lateral 6 null values
disk_acc_std_medial 6 null values
disk_acc_std_overall 6 null values
disk_speed_mean_lateral 6 null values
disk_speed_mean_medial 6 null values
disk_speed_mean_overall 6 null values
disk_flips 21 null values
disk_heading_movements 6 null values
disk_heading_rhythm_5 6 null values
disk_heading_rhythm_10 6 null values
disk_heading_rhythm_15 6 null values
disk_heading_rhythm_20 6 null values
disk_heading_rhythm_25 6 null values
disk_heading_rhythm_30 6 null values
disk_heading_rhythm_35 6 null values
disk_heading_rhythm_40 6 null values
disk_heading_rhythm_45 6 null values
disk_heading_rhythm_res

deleted features: ['fss_score', 'task_braid_strands', 'braiding_statistics_revolutions', 'braiding_statistics_revwork', 'disk_acc_mean_overall', 'disk_acc_std_lateral', 'disk_acc_std_medial', 'disk_acc_std_overall', 'disk_speed_mean_overall', 'disk_heading_rhythm_10', 'disk_heading_rhythm_15', 'disk_heading_rhythm_20', 'disk_heading_rhythm_25', 'disk_heading_rhythm_30', 'disk_heading_rhythm_35', 'disk_heading_rhythm_40', 'disk_heading_rhythm_45', 'disk_heading_rhythm_resonant_score', 'wristband_acc_mean_overall', 'wristband_acc_std_overall', 'wristband_speed_mean_overall', 'heartrate_below_baseline', 'heartrate_range', 'heartrate_std', 'skintemperature_below_baseline', 'skintemperature_range', 'skintemperature_std', 'eda_phasic_scr_amplitude_range', 'eda_phasic_scr_amplitude_std', 'eda_phasic_scr_recoverytime_std', 'eda_tonic_mean', 'eda_tonic_std', 'emg_scr_amplitude_range', 'emg_scr_amplitude_std', 'emg_scr_recoverytime_std', 'emg_scr_risetime_std', 'emg_tonic_mean', 'emg_tonic_std',

In [26]:
y_no_penalty = fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=100)
y_l1 = fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=100)
y_our = fit_our_model(X_train, y_train, X_test, y_test)
y_our_l1 = fit_our_model_l1(X_train, y_train, X_test, y_test)

Test accuracy for sklearn model no penalty: 0.7727, F1-score: 0.0000
Test accuracy for sklearn model l1: 0.6818, F1-score: 0.3636


Test accuracy for our model lambda 0: 0.8182, F1-score: 0.5000
Test accuracy for our model l1: 0.8636, F1-score: 0.5714


# lsvt

speech signal processing

In [27]:
dataset = openml.datasets.get_dataset(1484)
df_lsvt, y, categorical, numeric = dataset.get_data(dataset_format='dataframe', target=dataset.default_target_attribute)

df_lsvt["label"] = y
print("Shape:", df_lsvt.shape)
print(df_lsvt.head(1))
df_lsvt["label"].value_counts()

Shape: (126, 311)
         V1        V2       V3        V4        V5        V6        V7  \
0  0.088112  0.041697  0.00048 -0.000004  0.000422  2.458381  0.000001   

          V8          V9       V10       V11       V12       V13       V14  \
0  47.021079  1366.43039 -7.103323 -2.687924 -0.035674  2.849068  0.042287   

        V15       V16      V17       V18       V19      V20       V21  \
0  9.116401  0.000002  0.04192  0.000482  0.000004  0.00044  0.000055   

        V22  V23       V24  V25  V26  V27  V28       V29       V30       V31  \
0  0.000001  0.0  0.000001  0.0    0    0    0  0.042287  0.000204  0.069838   

         V32      V33       V34      V35      V36       V37      V38  \
0  11.566415  0.07716 -0.000064  0.08188  0.09207 -0.000057  0.08188   

        V39       V40      V41       V42       V43       V44       V45  \
0  0.100744  0.000057  0.08188  0.059512  1.012151  0.021821  0.076036   

        V46      V47       V48       V49       V50       V51        V52  \

label
2    84
1    42
Name: count, dtype: int64

In [28]:
assign_target_column(df_lsvt, "label", ['1'])
fill_na(df_lsvt)
delete_corr_columns(df_lsvt)
check_feature_sample_ratio(df_lsvt)
X, y, X_train,X_valid, X_test, y_train, y_valid, y_test = split(df_lsvt)

Unique values of label column ['1', '2']
Categories (2, object): ['1' < '2']
target
0    84
1    42
Name: count, dtype: int64
deleted features: ['V2', 'V3', 'V5', 'V7', 'V10', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V20', 'V21', 'V22', 'V29', 'V30', 'V32', 'V33', 'V35', 'V36', 'V38', 'V39', 'V41', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V50', 'V55', 'V61', 'V63', 'V70', 'V72', 'V73', 'V76', 'V79', 'V82', 'V126', 'V127', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V151', 'V152', 'V153', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V

In [29]:
y_no_penalty = fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=1000)
y_l1 = fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=1000)
y_our = fit_our_model(X_train, y_train, X_test, y_test)
y_our_l1 = fit_our_model_l1(X_train, y_train, X_test, y_test)

Test accuracy for sklearn model no penalty: 0.6316, F1-score: 0.5882
Test accuracy for sklearn model l1: 0.4211, F1-score: 0.4211
Test accuracy for our model lambda 0: 0.6842, F1-score: 0.6667
Test accuracy for our model l1: 0.8158, F1-score: 0.7742


# Generated

In [None]:
X, y = generate_dataset(0.3, 100, 5, 0.8, seed=42)

In [32]:
df = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
df["target"] = y
X, y, X_train,X_valid, X_test, y_train, y_valid, y_test = split(df_lsvt)

In [33]:
y_no_penalty = fit_sklearn_no_penalty(X_train, y_train, X_test, y_test, max_iter=1000)
y_l1 = fit_sklearn_l1(X_train, y_train, X_test, y_test, max_iter=1000)
y_our = fit_our_model(X_train, y_train, X_test, y_test)
y_our_l1 = fit_our_model_l1(X_train, y_train, X_test, y_test)

Test accuracy for sklearn model no penalty: 0.6316, F1-score: 0.5882
Test accuracy for sklearn model l1: 0.4211, F1-score: 0.4211
Test accuracy for our model lambda 0: 0.6842, F1-score: 0.6667
Test accuracy for our model l1: 0.8158, F1-score: 0.7742
