In [1]:
import re

def extract_number_of_features_from_file(filename):
    """Return the first ``number_of_features: <int>`` value found in a text file.

    The file is scanned line by line; the digits following the first
    ``number_of_features:`` marker are returned as an ``int``.
    Returns ``None`` when no line contains the marker.
    """
    marker = re.compile(r"number_of_features:\s*(\d+)")
    with open(filename, 'r') as log_file:
        # Lazily search each line and stop at the first hit.
        first_hit = next(
            (hit for hit in map(marker.search, log_file) if hit),
            None,
        )
    if first_hit is None:
        return None
    return int(first_hit.group(1))

# Read the feature count recorded in the main log. `number_of_features` is
# None when the log has no "number_of_features:" line; it is reused further
# down to lay out the x-axis ticks of the score-vs-features charts.
filename = "Results/main_log.txt"
number_of_features = extract_number_of_features_from_file(filename)
print(number_of_features)

188


In [2]:
### Generating Mean and Max Results Tables
import pandas as pd
import numpy as np
import os

# Output folder for the final tables and figures. `exist_ok=True` creates it
# on first use and is a no-op afterwards, without a separate isdir() check.
path = "FinalResults"
os.makedirs(path, exist_ok=True)

def round_2(number):
    """Round a scalar or array-like to two decimal places with NumPy."""
    return np.round(number, decimals=2)

# Seeds of the repeated runs; each one has its own results folder.
random_state_numbers = [7, 42, 75, 101, 216]
categorization_rules = "R2B2"
preprocessing_methods = ["MinMax", "Standard"]
target_columns = ["r_dicho", "b_dicho"]
# Root folder that holds <target>/<seed>_<scaling>/results_df.csv.
results_path = 'Results/3_Algorithms_Results'
# One best-parameter column per cross-validation fold (folds 1..5).
unWanted_columns = [f"best_param_fold_{fold}" for fold in range(1, 6)]
# Header applied, by position, to the exported mean/max tables.
desired_columns_order = [
    "Scaling",
    "Alg",
    "Acc Cv",
    "Acc Te",
    "Sp",
    "Se",
    "MCC",
    "F1",
    "AUC-ROC",
    "AUPR",
]
new_algorithms_names = ["DT", "LR", "RF", "SVM", "GB"]

# For every target column, pool the results of all seeds and export two tables:
#   <target>_mean.csv - "mean ± std" of every *_mean metric per algorithm
#   <target>_max.csv  - (best score, feature count of the run that reached it)
for target_column in target_columns:
    mean_tables = []
    max_tables = []
    for preprocessing_method in preprocessing_methods:
        # Read every seed first and concatenate once; growing a frame inside
        # the loop copies it on every pass and starts from an empty frame.
        per_seed_results = []
        for random_state in random_state_numbers:
            tmp_path = f'{results_path}/{target_column}/{random_state}_{preprocessing_method}'
            per_seed_results.append(pd.read_csv(f'{tmp_path}/results_df.csv'))
        # ignore_index gives unique row labels, which the idxmax lookup below relies on.
        final_df = pd.concat(per_seed_results, ignore_index=True)

        by_algorithm = final_df.groupby(['Algorithm'])
        # numeric_only is used for all three aggregates so they expose the same columns.
        tmp_mean_df = by_algorithm.mean(numeric_only=True).drop(columns='Number_of_Features').reset_index()
        tmp_std_df = by_algorithm.std(numeric_only=True).drop(columns='Number_of_Features').reset_index()
        tmp_max_df = by_algorithm.max(numeric_only=True).drop(columns='Number_of_Features').reset_index()

        metric_columns = [column for column in tmp_std_df.columns if "_mean" in column]

        # "mean ± std" table
        final_mean_df = pd.DataFrame({'Algorithm': tmp_mean_df['Algorithm'].values.tolist()})
        for column in metric_columns:
            final_mean_df[column] = [
                f'{round_2(mean_value)} \u00B1 {round_2(std_value)}'
                for mean_value, std_value in zip(tmp_mean_df[column], tmp_std_df[column])
            ]
        final_mean_df.insert(0, "preprocessing_method", preprocessing_method)

        # Best-score table
        final_max_df = pd.DataFrame({'Algorithm': tmp_max_df['Algorithm'].values.tolist()})
        for column in metric_columns:
            # Row label of the first best run per algorithm, in the same
            # (sorted) algorithm order as tmp_max_df.
            best_row_labels = by_algorithm[column].idxmax()
            features_at_best = final_df.loc[best_row_labels, 'Number_of_Features'].tolist()
            # Store plain Python numbers: NumPy >= 2 prints np.float64 scalars
            # inside a tuple as "np.float64(...)", which would leak into the CSV.
            final_max_df[column.replace("_mean", "_max")] = [
                (float(round_2(best_score)), n_features)
                for best_score, n_features in zip(tmp_max_df[column], features_at_best)
            ]
        final_max_df.insert(0, "preprocessing_method", preprocessing_method)

        mean_tables.append(final_mean_df)
        max_tables.append(final_max_df)

    mean_df_all = pd.concat(mean_tables)
    max_df_all = pd.concat(max_tables)

    # NOTE(review): headers are assigned by position, so this assumes the
    # result files expose the *_mean metrics in exactly the order of
    # `desired_columns_order` - confirm against results_df.csv.
    mean_df_all.columns = desired_columns_order
    max_df_all.columns = desired_columns_order

    mean_df_all.to_csv(f'{path}/{target_column}_mean.csv', index=False)
    max_df_all.to_csv(f'{path}/{target_column}_max.csv', index=False)

In [4]:
### Drawing AUC-ROC and AUPR Chart
%matplotlib inline
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import os

# Output folder for the generated figures. `exist_ok=True` creates it on
# first use and is a no-op afterwards, without a separate isdir() check.
path = "FinalResults"
os.makedirs(path, exist_ok=True)

# Typography used for the publication figures: Times New Roman, 18 pt text.
# Font
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['font.size'] = 18
mpl.rcParams['font.weight'] = 'normal'
# Axes labels and titles
mpl.rcParams['axes.labelsize'] = 18
mpl.rcParams['axes.titlesize'] = 18
mpl.rcParams['axes.titleweight'] = 'bold'
# Tick labels
mpl.rcParams['xtick.labelsize'] = 18
mpl.rcParams['ytick.labelsize'] = 18
# Legend
mpl.rcParams['legend.fontsize'] = 14

# Close every figure that is still open before new ones are created.
plt.close('all')

# Seeds of the repeated runs; each one has its own results folder.
random_state_numbers = [7, 42, 75, 101, 216]
categorization_rules = "R2B2"
preprocessing_methods = ["MinMax", "Standard"]
target_columns = ["r_dicho", "b_dicho"]
# Root folder that holds <target>/<seed>_<scaling>/results_df.csv.
results_path = 'Results/3_Algorithms_Results'

# The metrics that get charted, plus the two grouping keys.
metric_cols = ['auc_roc_mean', 'aupr_mean']
wanted_columns = ['Algorithm', 'Number_of_Features'] + metric_cols
desired_columns_order = ["Scaling", "Alg", "AUC-ROC", "AUPR"]
al_names = ["LR", "GB"]

# One (offset, on/off dash sequence) line style per plotted series.
line_styles = [
    (0, (1, 0)),              # solid
    (0, (1, 1)),              # dotted
    (0, (5, 1)),              # dashed
    (0, (3, 1, 1, 1)),        # dashdot
    (0, (3, 1, 1, 1, 1, 1)),  # densely dashdotdotted
]
# One marker per plotted series, paired with the line styles above.
marker_styles = ["o", "^", "p", "s", "D"]

# X axis: one tick per feature count, labelled only at 1, at every multiple
# of 10 and at the last feature count. The ticks are identical for every
# figure, so they are computed once, outside the loops.
# NOTE: `number_of_features` comes from the first cell (parsed from the main log).
all_ticks = list(range(1, number_of_features + 1))
tick_positions = np.arange(0, number_of_features + 1, 10)
tick_positions = np.insert(tick_positions, 1, 1)
if tick_positions[-1] != number_of_features:
    tick_positions = np.append(tick_positions, number_of_features)
labelled_ticks = set(tick_positions.tolist())
tick_labels = [str(tick) if tick in labelled_ticks else '' for tick in all_ticks]

# One PNG per (target column, scaling method, metric): the metric averaged
# over all seeds, drawn as one line per algorithm against the feature count.
for target_column in target_columns:
    for preprocessing_method in preprocessing_methods:
        df_list = []
        for random_state in random_state_numbers:
            tmp_path = f'{results_path}/{target_column}/{random_state}_{preprocessing_method}'
            tmp_resultFile = pd.read_csv(f'{tmp_path}/results_df.csv')
            df_list.append(tmp_resultFile[wanted_columns])

        final_df = pd.concat(df_list)
        # groupby already yields the (Algorithm, Number_of_Features) MultiIndex.
        tmp_mean_df = final_df.groupby(['Algorithm', 'Number_of_Features']).mean()

        for metric in metric_cols:
            # Rows: feature counts, columns: algorithms. Computed once per metric.
            scores_by_algorithm = tmp_mean_df[metric].unstack(level=0)

            fig, ax_scores = plt.subplots(figsize=(16, 9))

            # Cycle through the line and marker styles so every series is distinguishable.
            for i, col in enumerate(scores_by_algorithm.columns):
                ax_scores.plot(
                    scores_by_algorithm.index,
                    scores_by_algorithm[col],
                    linestyle=line_styles[i % len(line_styles)],
                    marker=marker_styles[i % len(marker_styles)],
                    label=col
                )

            ax_scores.set_xticks(all_ticks)
            ax_scores.set_xticklabels(tick_labels)

            ax_scores.set_xlabel('Number of Features')
            ax_scores.set_ylabel('Score (%)')
            ax_scores.grid(True)
            ax_scores.legend(loc='upper left')

            # Save the figure, then release it so figures do not pile up in memory.
            fig.tight_layout()
            fig.savefig(
                f"{path}/{target_column}_{preprocessing_method}_{metric}.png",
                bbox_inches='tight', facecolor='white', dpi=120
            )
            plt.close(fig)

In [20]:
### Drawing AUC-ROC and AUPR Final Bar-Chart

%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

# Typography used for the publication figures: Times New Roman, 18 pt text.
# Font
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = ['Times New Roman']
mpl.rcParams['font.size'] = 18
mpl.rcParams['font.weight'] = 'normal'
# Axes labels and titles
mpl.rcParams['axes.labelsize'] = 18
mpl.rcParams['axes.titlesize'] = 18
mpl.rcParams['axes.titleweight'] = 'bold'
# Tick labels
mpl.rcParams['xtick.labelsize'] = 18
mpl.rcParams['ytick.labelsize'] = 18
# Legend
mpl.rcParams['legend.fontsize'] = 14

# Function to extract the metric values and standard deviations
def extract_values_with_std(cell_value):
    """Split a ``"<mean> ± <std>"`` table cell into a ``(mean, std)`` float pair.

    The separator is the plus-minus sign U+00B1, written as an escape so the
    function does not depend on how this source file itself was decoded.
    Surrounding whitespace is ignored.

    Raises
    ------
    ValueError
        If the cell contains no plus-minus sign or a part is not a number.
    """
    value, separator, std = cell_value.partition('\u00B1')
    if not separator:
        raise ValueError(f"expected '<mean> \u00B1 <std>', got {cell_value!r}")
    # A stray U+00C2 right before the sign is what a UTF-8 plus-minus looks
    # like after being decoded as Latin-1/cp1252; accept that form as well.
    value = value.strip().rstrip('\u00C2')
    return float(value), float(std)

# Create the grouped 2x2 plot: one row per target column, one panel per metric.
fig, axs = plt.subplots(2, 2, figsize=(12, 12))

target_columns = ["r_dicho", "b_dicho"]
metric_cols = ['AUC-ROC', 'AUPR']
xlabels = ['a', 'b', 'c', 'd']
# (scaling name, bar colour, side of the tick the bar sits on)
scaling_styles = [('MinMax', 'skyblue', -1), ('Standard', 'orange', 1)]
width = 0.4

ax_index = 0
for tr in target_columns:
    # NOTE: `path` (the output folder) and the "<target>_mean.csv" tables
    # are produced by earlier cells of this notebook.
    data = pd.read_csv(f'{path}/{tr}_mean.csv')

    # Split every "mean ± std" cell into a numeric column plus "<metric> std".
    for col in metric_cols:
        data[col], data[col + ' std'] = zip(*data[col].map(extract_values_with_std))

    # One bar group per algorithm.
    alg_names = data['Alg'].unique()
    x = np.arange(len(alg_names))

    # Filter each scaling method once instead of once per plotted quantity.
    rows_by_scaling = {name: data[data['Scaling'] == name] for name, _, _ in scaling_styles}

    # One panel of the 2x2 grid per metric, filled row by row.
    for metric in metric_cols:
        ax = axs[ax_index // 2, ax_index % 2]

        lows, highs = [], []
        for name, color, side in scaling_styles:
            rows = rows_by_scaling[name]
            scores = rows[metric]
            stds = rows[metric + ' std']
            positions = x + side * width / 2

            ax.bar(positions, scores, width, label=name, color=color, edgecolor='black')
            # Error bars for the standard deviation
            ax.errorbar(positions, scores, yerr=stds,
                        fmt='none', ecolor='black', capsize=3, elinewidth=1, marker='_', markersize=6)

            lows.append((scores - 2 * stds).min())
            highs.append((scores + 2 * stds).max())

        ax.set_xlabel(f'({xlabels[ax_index]})')
        ax.set_ylabel('Score (%)')
        ax.set_title(metric)

        # Zoom the y axis to mean ± 2 std over both scaling methods.
        ax.set_ylim(min(lows), max(highs))

        # Show algorithm names below the bars
        ax.set_xticks(x)
        ax.set_xticklabels(alg_names, ha='center')

        # Set the legend for the grouped bars
        ax.legend(loc='upper left')

        ax_index += 1

# Adjust the layout and spacing
fig.tight_layout()

# Save the figure as an image file for publication
fig.savefig(f'{path}/bar_plot.png', bbox_inches='tight', facecolor='white', dpi=120)

plt.close(fig)

In [None]:
### Generating Feature Ranking Mean

import pandas as pd
import os

# Root folder that holds <target>/<seed>_<scaling>/ranked_features/.
results_path = 'Results/2_Prepared_Data'
# Seeds of the repeated runs.
random_state_numbers = [7, 42, 75, 101, 216]
preprocessing_methods = ['MinMax', 'Standard']
target_columns = ["r_dicho", "b_dicho"]
# Number of cross-validation folds; one ranked-feature file exists per fold.
nFolds = 5


def get_finalScores(features_lists, features):
    """Combine several feature rankings into one consensus-ordered table.

    Parameters
    ----------
    features_lists : sequence of list
        Rankings to combine. The 1-based position of a feature inside a
        ranking is its rank in that ranking.
    features : list
        The features to score. Each one must occur in every ranking.

    Returns
    -------
    pandas.DataFrame
        Column 'Features', one column per ranking (named 1..len(features_lists))
        holding the feature's rank there, then 'Mode', 'Mean' and 'Std' of
        those ranks. 'Mode' is the first value returned by ``Series.mode()``.
        Rows are sorted ascending by Mode, Mean, Std and re-indexed from 0.

    Raises
    ------
    ValueError
        If a feature is missing from one of the rankings.
    """
    final_scores = pd.DataFrame(data={'Features': features})
    scored_features = final_scores["Features"].tolist()

    for i, ranking in enumerate(features_lists):
        # One dictionary per ranking replaces a linear list.index() scan per
        # feature. setdefault keeps the first position of a repeated name,
        # which is what list.index() returns.
        rank_of = {}
        for position, name in enumerate(ranking, start=1):
            rank_of.setdefault(name, position)
        try:
            final_scores[i + 1] = [rank_of[name] for name in scored_features]
        except KeyError as missing:
            # Keep the exception type list.index() raised for a missing feature.
            raise ValueError(f"{missing.args[0]!r} is not in list") from None

    # Numeric rank columns only. This is a separate frame, so adding the
    # summary columns below does not feed back into the statistics.
    ranks = final_scores.iloc[:, 1:]
    modes = [row.mode().iloc[0] for _, row in ranks.iterrows()]
    means = ranks.mean(axis=1)
    stds = ranks.std(axis=1)

    final_scores['Mode'] = modes
    final_scores['Mean'] = means
    final_scores['Std'] = stds

    return final_scores.sort_values(by=['Mode', 'Mean', "Std"]).reset_index(drop=True)

def get_aggregatedScores(features_lists, features):
    """Tabulate the rank of every feature in each named ranking.

    Parameters
    ----------
    features_lists : dict
        Maps a column name to a ranking (list of features). The 1-based
        position of a feature inside a ranking is its rank.
    features : list
        The features to score. Each one must occur in every ranking.

    Returns
    -------
    pandas.DataFrame
        Column 'Features' plus one rank column per key of `features_lists`,
        sorted ascending by the first key's column and re-indexed from 0.

    Raises
    ------
    ValueError
        If a feature is missing from one of the rankings.
    IndexError
        If `features_lists` is empty.
    """
    final_scores = pd.DataFrame(data={'Features': features})
    scored_features = final_scores["Features"].tolist()

    for key, ranking in features_lists.items():
        # One dictionary per ranking replaces a linear list.index() scan per
        # feature. setdefault keeps the first position of a repeated name,
        # which is what list.index() returns.
        rank_of = {}
        for position, name in enumerate(ranking, start=1):
            rank_of.setdefault(name, position)
        try:
            final_scores[key] = [rank_of[name] for name in scored_features]
        except KeyError as missing:
            # Keep the exception type list.index() raised for a missing feature.
            raise ValueError(f"{missing.args[0]!r} is not in list") from None

    first_key = list(features_lists)[0]
    return final_scores.sort_values(by=first_key).reset_index(drop=True)

# For every (target, scaling) pair: gather the KBest ranking of each seed and
# fold, merge them into one consensus ranking, save it, and keep the resulting
# feature order for the cross-configuration summary written at the end.
# `path` is the output folder defined in an earlier cell.
agg_list = {}
for tr in target_columns:
    for pp in preprocessing_methods:
        # One ranked feature list per (seed, fold); seeds vary slowest.
        features_list = [
            pd.read_csv(
                f'{results_path}/{tr}/{rs}_{pp}/ranked_features/KBest_fold_{fold}.csv'
            )["Features"].tolist()
            for rs in random_state_numbers
            for fold in range(1, nFolds + 1)
        ]

        # The first ranking supplies the set of features to score.
        final_df = get_finalScores(features_list, features_list[0])
        final_df.to_csv(f"{path}/{tr}_KBest_rankedFeatures_{pp}.csv", index=False)

        agg_list[f'{tr}_{pp}'] = final_df['Features'].tolist()

# Rank of every feature under each configuration, ordered by the first one.
first_ranking = list(agg_list.values())[0]
agg_dfs = get_aggregatedScores(agg_list, first_ranking)
tmp_features = agg_dfs["Features"].tolist()
agg_dfs.to_csv(f"{path}/final_KBest_rankedFeatures.csv", index=False)