In [4]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

import numpy as np
from RENT import RENT

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef


from sklearn.datasets import load_breast_cancer

wisconsin = load_breast_cancer()
data = pd.DataFrame(wisconsin.data)
target = wisconsin.target

from sklearn.model_selection import train_test_split
original_train_data, original_test_data, original_train_labels, original_test_labels = \
train_test_split(data, target, random_state=0, shuffle=True)

In [5]:
run = True 
counter = 0
max_counts = 1
rs = 0
n_folds = 5

my_data = original_train_data.copy()
my_target = pd.DataFrame(original_train_labels.copy()) # must be DataFrame

In [6]:
# run RENT until counter reaches max_counts

while run==True:
    counter += 1
    # k-fold splits
    skf = StratifiedKFold(n_splits=n_folds, random_state = rs, shuffle=True)
    
    #for next RENT-run (if counter != max_count, use another random state to split the data)
    rs += 1
    
    # C and l1 and testsize-range parameters to try 
    my_C_params = [0.1, 1, 10]
    my_l1_params = [0.1, 0.5, 0.9]
    testsize_range = (0.25, 0.25)
    
    # store prediction scores of train and test data for each split in dataframe (end with a list of dataframes)
    fs_scores = list()
    for s in range(skf.get_n_splits()):
        fs_scores.append(pd.DataFrame(index=my_l1_params, columns=my_C_params))
    scores_pred = list()
    for s in range(skf.get_n_splits()):
        scores_pred.append(pd.DataFrame(index=my_l1_params, columns=my_C_params))
    
    # define all dicts where we want to store results
    incorrect_labels_dict = {}
    incorr_avg = {}
    incorr_max = {}
    incorr_min = {}
    count_features = {}
    union_features = {}
    intersection_features = {}
    score_prediction = {}
    models = {}
    
    # now we perform RENT feature selection
    for C in my_C_params:
        for l1 in my_l1_params:
            i = 0 # counter 
            
            incorr_labels, mod, scores, features = [], [], [], []
            df = pd.DataFrame(index=my_data.index)
            
            for train, test in skf.split(my_data, my_target):
                train_data = my_data.iloc[train,:]
                train_target = my_target.iloc[train,:]
                
                analysis = RENT.RENT_Classification(data=train_data,
                                                       target=train_target.iloc[:,0],
                                                       feat_names=train_data.columns,
                                                       C=[C],
                                                       l1_ratios=[l1],
                                                       parameter_selection=False,
                                                       poly='OFF',
                                                       testsize_range=testsize_range,
                                                       K=100,
                                                       method='logreg',
                                                       verbose=1)
                analysis.train()
                
                # perform feature selection
                sel_features = analysis.selectFeatures(tau_1=0.9, tau_2=0.9, tau_3=0.975)
                
                # store incorrect predictions
                incorrectness = analysis.summary_objects()
                
                # predict on test set to get prediction scores thereof
                sc = StandardScaler()
                train_sample = sc.fit_transform(train_data.iloc[:,sel_features])
                test_sample = sc.transform(my_data.iloc[test,sel_features])
                
                model = LR(penalty='none', max_iter=8000, solver='saga').fit(train_sample, train_target)
                score = matthews_corrcoef(my_target.iloc[test,:], model.predict(test_sample))
                
                # store results
                scores.append(score)
                mod.append(model)
                features.append(sel_features)
                incorr_labels.append(incorrectness.iloc[:,-1])
                
                df = pd.concat([df,incorrectness.iloc[:,-1]], axis=1)
                
                score_mat = analysis.get_enetParam_matrices()[0]
                
                fs_scores[i].loc[l1,C] = score_mat.values[0]
                scores_pred[i].loc[l1,C] = score
                
                i += 1
                
            # fill the dictionaries
            
            df.columns = ['fold{0}'.format(x+1) for x in range(skf.get_n_splits())]
            incorrect_labels_dict[(l1,C)] = df
            score_prediction[(l1,C)] = scores
            models[(l1,C)] = mod
            
            features_set = []
            for f in features:
                features_set.append(set(f))
                
            #union and intersection of all features across the k folds
            features_union = list(set.union(*features_set))
            features_intersection = list(set.intersection(*features_set)) 
            
            # count how often each feature was selected across the k folds (0 to k)
            features_count = pd.Series(0, index=features_union)
            for f in features:
                features_count.loc[f] += 1
            
            union_features[(l1,C)] = features_union
            intersection_features[(l1,C)] = features_intersection
            count_features[(l1,C)] = features_count
            
            # store average, maximal and minimal  incor. predicitons
            incorr_labels_avg = pd.DataFrame(np.round(list(np.apply_along_axis(np.nanmean,1,df))),\
                         index=my_data.index)
            incorr_labels_avg.columns = ["nr incorr"]

            incorr_labels_max = pd.DataFrame(list(np.apply_along_axis(np.nanmax,1,df)),\
                         index=my_data.index)
            incorr_labels_max.columns = ["nr incorr"]
            
            incorr_labels_min = pd.DataFrame(list(np.apply_along_axis(np.nanmin,1,df)),\
                         index=my_data.index)
            incorr_labels_min.columns = ["nr incorr"]
            
            
            incorr_avg[(l1,C)] = incorr_labels_avg
            incorr_max[(l1,C)] = incorr_labels_max
            incorr_min[(l1,C)] = incorr_labels_min

            
            
    # now find the best l1 and C
    mat_pred = pd.DataFrame(0, index=my_l1_params, columns=my_C_params)
    for s in scores_pred:
        mat_pred = mat_pred + s
    mat_pred = mat_pred/len(scores_pred)

    mat_train = pd.DataFrame(0, index=my_l1_params, columns=my_C_params)
    for s in fs_scores:
        mat_train = mat_train + s
    mat_train = mat_train/len(fs_scores)

    best_row, best_col  =np.where(mat_pred == np.max(mat_pred.values))
    best_l1 = mat_pred.index[np.max(best_row)]
    best_C = mat_pred.columns[np.min(best_col)]     
   
    run=False # we stop after one run... additional code could be added to run multiple times. Then we would need to 
    # store the reduced data matrix and rerun it
    # fs_data = my_data.loc[:, union_features[(best_l1,best_C)]]
        

TypeError: __init__() got an unexpected keyword argument 'parameter_selection'

In [None]:
best_l1

In [None]:
best_C

In [None]:
# for the best parameter combination: incorrect prediction % across all folds
incorrect_labels_dict[(best_l1, best_C)]

In [None]:
# average prediction error over all folds
incorr_avg[(best_l1, best_C)]

In [None]:
# count how often a feature was selected through the k folds (left column:feature, right column:count)
count_features[(best_l1, best_C)]

In [None]:
# features that were selected in at least one fold (all with count >=1)
union_features[(best_l1, best_C)]

In [None]:
# intersection of selected features ( all features with count k - selected in each fold)
intersection_features[(best_l1, best_C)]