# Feature Selection based on ReliefF from skrebate

In [None]:
%pip install skrebate

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from skrebate import ReliefF
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

In [3]:
# Locations of the four feature tables (absolute paths on the author's machine).
path_static_features = '/Users/gioelepozzi/Desktop/data/features_thesis/static_features.csv'
path_static_features_EDA = '/Users/gioelepozzi/Desktop/data/features_thesis/static_features_EDA.csv'
path_dynamic_features = '/Users/gioelepozzi/Desktop/data/features_thesis/dynamic_features.csv'
path_dynamic_features_EDA = '/Users/gioelepozzi/Desktop/data/features_thesis/dynamic_features_EDA.csv'

# For every table: read the CSV, keep the identifier columns aside in a
# separate "*_header*" frame (they are joined back by index after feature
# selection), and drop them from the frame that holds the features.

# Static features: identified by music_ID.
static_features = pd.read_csv(path_static_features)
static_header = static_features[['music_ID']]
static_features = static_features.drop(columns=['music_ID'])

# Static EDA features: identified by music_ID and subject_ID.
static_features_EDA = pd.read_csv(path_static_features_EDA)
static_header_EDA = static_features_EDA[['music_ID','subject_ID']]
static_features_EDA = static_features_EDA.drop(columns=['music_ID', 'subject_ID'])

# Dynamic features: identified by music_ID and frame.
dynamic_features = pd.read_csv(path_dynamic_features)
dynamic_header = dynamic_features[['music_ID', 'frame']]
dynamic_features = dynamic_features.drop(columns=['music_ID', 'frame'])

# Dynamic EDA features: identified by music_ID, subject_ID and frame.
dynamic_features_EDA = pd.read_csv(path_dynamic_features_EDA)
dynamic_header_EDA = dynamic_features_EDA[['music_ID', 'subject_ID', 'frame']]
dynamic_features_EDA = dynamic_features_EDA.drop(columns=['music_ID', 'subject_ID', 'frame'])

In [4]:
# function to add to a dataframe the 'class' column for the ReliefF algorithm

def class_f(df, n_negative=10):
    """Return a copy of *df* with a binary 'class' column appended.

    The first ``len(df) - n_negative`` rows (by position) are labelled 1.0
    and the last ``n_negative`` rows are labelled 0.0. Rows that contain
    any NaN value are then dropped.

    Labels are assigned by row position, so the result does not depend on
    the index of *df* (an index-based join would leave every label NaN for
    a frame whose index is not 0..n-1, and ``dropna`` would then discard
    all rows).

    Args:
        df: Feature table; it is not modified.
        n_negative: Number of trailing rows labelled 0.0.

    Returns:
        A new DataFrame with the original columns followed by 'class',
        keeping the original index of the surviving rows.

    Raises:
        ValueError: If *df* already has a 'class' column, or if
            ``n_negative`` is negative or larger than the number of rows.
    """
    if 'class' in df.columns:
        raise ValueError("columns overlap: 'class' is already present")

    n_rows = df.shape[0]
    if not 0 <= n_negative <= n_rows:
        raise ValueError(
            f"n_negative must be between 0 and the number of rows "
            f"({n_rows}), got {n_negative}"
        )

    labels = np.concatenate((np.ones(n_rows - n_negative), np.zeros(n_negative)))
    # 'class' is a Python keyword, hence the dict unpacking.
    return df.assign(**{'class': labels}).dropna()

In [5]:
# Append the 'class' label column to each of the four feature tables.
(
    static_features,
    static_features_EDA,
    dynamic_features,
    dynamic_features_EDA,
) = map(
    class_f,
    (static_features, static_features_EDA, dynamic_features, dynamic_features_EDA),
)

In [6]:
# function that, given a df, returns the array of feature indices ordered by relevance

def feature_selection(df, random_state=None):
    """Rank the feature columns of *df* with ReliefF.

    Every column except 'class' is treated as a feature and 'class' as the
    label. The data is split with ``train_test_split`` and a default
    ``ReliefF`` estimator is fitted on the training part only.

    Args:
        df: Table containing the feature columns plus a 'class' column.
        random_state: Forwarded to ``train_test_split``. The default
            ``None`` keeps the split (and therefore the ranking)
            non-deterministic between calls; pass an int for a
            reproducible ranking.

    Returns:
        The fitted estimator's ``top_features_`` array. Its entries are
        positions within the columns of *df* with 'class' removed.
    """
    features = df.drop('class', axis=1).values
    labels = df['class'].values

    # Only the training portion is used for fitting; the held-out part of
    # the split is discarded.
    X_train, _, y_train, _ = train_test_split(
        features, labels, random_state=random_state
    )

    fs = ReliefF()
    fs.fit(X_train, y_train)

    return fs.top_features_

In [None]:
# Compute the ReliefF feature ranking of each table, in the same order as
# the tables were loaded.
(
    best_static_features,
    best_static_features_EDA,
    best_dynamic_features,
    best_dynamic_features_EDA,
) = [
    feature_selection(table)
    for table in (
        static_features,
        static_features_EDA,
        dynamic_features,
        dynamic_features_EDA,
    )
]

In [None]:
# function that returns the dataframes with the best 'max_features'-features

def best_df(df, features_array, max_features):
    """Return the columns of *df* at the first *max_features* ranked positions.

    Args:
        df: Table to select columns from.
        features_array: Column positions, in ranking order.
        max_features: How many leading entries of *features_array* to keep.

    Returns:
        A DataFrame holding the selected columns, in ranking order.
    """
    return df.iloc[:, features_array[:max_features]]

In [None]:
# Keep the 10 top-ranked feature columns of every table.
best_static_df = best_df(static_features, best_static_features, 10)
best_static_EDA_df = best_df(static_features_EDA, best_static_features_EDA, 10)
best_dynamic_df = best_df(dynamic_features, best_dynamic_features, 10)
best_dynamic_EDA_df = best_df(dynamic_features_EDA, best_dynamic_features_EDA, 10)

In [None]:
# Put the identifier columns back in front of the selected features. The
# join aligns on the index; header rows without a matching feature row end
# up with NaN values and are removed by dropna.
best_static_df = static_header.join(best_static_df, how='left').dropna(how='any')
best_static_EDA_df = static_header_EDA.join(best_static_EDA_df, how='left').dropna(how='any')
best_dynamic_df = dynamic_header.join(best_dynamic_df, how='left').dropna(how='any')
best_dynamic_EDA_df = dynamic_header_EDA.join(best_dynamic_EDA_df, how='left').dropna(how='any')

In [None]:
# Write each reduced table to its own CSV file in the current working
# directory, without the index column.
for frame, filename in (
    (best_static_df, 'static_features_RreliefF.csv'),
    (best_static_EDA_df, 'static_features_EDA_RreliefF.csv'),
    (best_dynamic_df, 'dynamic_features_RreliefF.csv'),
    (best_dynamic_EDA_df, 'dynamic_features_EDA_RreliefF.csv'),
):
    frame.to_csv(filename, index=False)