In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from collections import Counter
import copy

from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# tuning for optimal number of features

In [None]:
def optimal_num_features(X, y, num_features=range(100,350,50), step=range(20,35,5), n_jobs=1, random_state=1, cv=StratifiedKFold()):
    """Tune the RFE feature count and elimination step with a randomized search.

    The features are cube-root transformed, then a
    ``StandardScaler -> SMOTE -> RFE(RandomForestClassifier)`` pipeline is
    tuned with ``RandomizedSearchCV`` using balanced accuracy as the score.
    The best score and best parameters are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (features_opensmile); must be joinable to ``y`` on 'file'.
    y : pandas.DataFrame
        Must provide at least 'file' and 'outcome'.
    num_features : iterable of int
        Candidate values for RFE's ``n_features_to_select``.
    step : iterable of int
        Candidate values for RFE's ``step``.
    n_jobs : int
        Forwarded to the random forest, to SMOTE and to the search itself.
    random_state : int
        Seed forwarded to the random forest, to SMOTE and to the search.
    cv : cross-validation splitter
        Passed unchanged to ``RandomizedSearchCV``.

    Returns
    -------
    tuple
        ``(best_num, best_step)``: the selected ``n_features_to_select`` and
        ``step`` values.
    """
    random_grid = {'clf__n_features_to_select': num_features,
                   'clf__step': step}

    # The merge key identifies a recording; it must never be fed to the model.
    # When 'file' is an index level rather than a column this filter is a no-op.
    feature_cols = [col for col in X.columns if col != 'file']

    merged = X.merge(y, on='file')
    X_ = np.cbrt(merged.loc[:, feature_cols].values)
    y_ = merged['outcome'].values

    estimator = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs, class_weight='balanced')
    # The pipeline step is called "clf" although it holds the RFE wrapper; the
    # grid keys above rely on that name.
    clf = RFE(estimator=estimator)
    pipeline = Pipeline([('scale', StandardScaler()),
                         ("smote", SMOTE(random_state=random_state, n_jobs=n_jobs)),
                         ("clf", clf)])
    search = RandomizedSearchCV(pipeline, random_grid, scoring="balanced_accuracy", n_iter=100,
                                random_state=random_state, n_jobs=n_jobs, cv=cv, return_train_score=True)
    search.fit(X_, y_)

    best_score = search.best_score_
    print(f"Best Tuning balanced accuracy: {best_score}")

    # Strip the pipeline prefix so the keys match RFE's own argument names.
    best_params = {
        key.replace("clf__", ""): value for key, value in search.best_params_.items()
    }
    print(best_params)

    best_num = best_params['n_features_to_select']
    best_step = best_params['step']  # was `beat_params` -> NameError after the whole search had run

    return best_num, best_step

In [None]:
# Shuffled, stratified 5-fold splitter. The seed is given literally (42, the
# same value passed to the search below) because no module-level
# `random_state` is defined in this notebook.
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
best_num, best_step = optimal_num_features(features_opensmile, clinic, n_jobs=14, random_state=42, cv=kf)

# select features using optimal number of features and steps

In [None]:
def features_rfe(X, y, num_features=100, step=10, n_jobs=1, random_state=1, cv=StratifiedKFold()):
    """Select features with RFE in each CV fold and return the best fold's set.

    For every fold produced by ``cv`` the training part is standardised,
    oversampled with SMOTE and used to fit
    ``RFE(RandomForestClassifier)``. The fitted selector is scored on the
    held-out part with balanced accuracy, and the feature names chosen in the
    highest-scoring fold are returned. The fold number and its score are
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (features_opensmile); must be joinable to ``y`` on 'file'.
    y : pandas.DataFrame
        Must provide at least 'file' and 'outcome'.
    num_features : int
        Passed to RFE as ``n_features_to_select``.
    step : int
        Passed to RFE as ``step``.
    n_jobs : int
        Forwarded to SMOTE and to the random forest.
    random_state : int
        Seed forwarded to SMOTE and to the random forest.
    cv : cross-validation splitter
        Object whose ``split(X, y)`` yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    numpy.ndarray
        Names of the features selected in the fold with the highest test
        balanced accuracy (empty if ``cv`` yields no folds).
    """
    # The merge key identifies a recording; it must never be fed to the model.
    # When 'file' is an index level rather than a column this filter is a no-op.
    feature_cols = [col for col in X.columns if col != 'file']

    merged = X.merge(y, on='file')
    X_ = np.cbrt(merged.loc[:, feature_cols].values)
    y_ = merged['outcome'].values

    # Names aligned with the columns of X_, so rfe.support_ can index them
    # directly (previously an undefined global `variables` was used here).
    feature_names = np.asarray(feature_cols, dtype=object)

    best_score = None
    feature_set = feature_names[:0]

    for f, (train_index, test_index) in enumerate(cv.split(X_, y_), start=1):
        print('fold_'+str(f))

        # split data
        X_train, y_train = X_[train_index], y_[train_index]
        X_test, y_test = X_[test_index], y_[test_index]

        # scaling: fit on the training part only, then apply to both parts
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # oversampling training dataset (the test part is left untouched)
        smote = SMOTE(random_state=random_state, n_jobs=n_jobs)
        X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

        estimator = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs, class_weight='balanced')
        rfe = RFE(estimator=estimator, n_features_to_select=num_features, step=step)
        # Fit on the oversampled data, matching the scale -> SMOTE -> RFE
        # pipeline used for tuning; the resampled arrays used to be ignored.
        rfe.fit(X_train_smote, y_train_smote)

        y_pred = rfe.predict(X_test_scaled)
        score_test = metrics.balanced_accuracy_score(y_test, y_pred)
        print(f"Best estimator balanced accuracy in test dataset: {score_test}")

        # Strict '>' keeps the earliest fold on ties; the None check makes the
        # first fold win even when its score is 0.
        if best_score is None or score_test > best_score:
            best_score = score_test
            feature_set = feature_names[rfe.support_]

    return feature_set

In [None]:
# Re-run RFE with the tuned settings; names corrected to match the definitions
# above (`features_rfe`, `features_opensmile`, keyword `num_features`).
feature_set = features_rfe(features_opensmile, clinic, num_features=best_num, step=best_step, n_jobs=14, random_state=42, cv=kf)