In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Set visualization style: matplotlib's 'default' style sheet plus seaborn's "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

# Put the parent of the working directory on sys.path (once) so modules located
# there can be imported from this notebook.
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

# Folder that receives the exported figures. It is created up front so the
# savefig calls further down do not fail with FileNotFoundError when the
# folder does not exist yet (e.g. on a fresh checkout).
PLOTS_PATH = os.path.join(library_path, 'plots')
os.makedirs(PLOTS_PATH, exist_ok=True)

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Connect to MongoDB
# NOTE(review): the connection string points at a local instance and is hard-coded.
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

# The status emoji are written as Unicode escapes: the literal characters had been
# corrupted into mojibake by a wrong text encoding, and escapes cannot be damaged that way.
print("\U0001F504 Loading studies from MongoDB...")

# Projection passed to find(): only these fields are returned for each document.
fields_to_extract = {
    "doi"             : 1,
    "title"           : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    '_id'             : 0
}  # 1 = include, 0 = exclude
studies_cursor = collection.find({}, fields_to_extract)
# Materialize the cursor so the documents can be reused by the following cells.
studies_list = list(studies_cursor)


print(f"\U0001F4CA Total studies loaded: {len(studies_list)}")
print(f"\U0001F4C4 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

In [None]:
def get_feature_selection(experiment_list:list)->list:
    """
    Collect the distinct feature-selection methods used across a list of experiments.

    Args:
        experiment_list: experiment dicts. An experiment may carry a
            'feature_selection' dict whose 'methods' entry lists method names.

    Returns:
        The unique method names in first-seen order (deterministic, unlike the
        arbitrary order of a set). The list is empty when no experiment
        declares a method.
    """
    # A dict is used as an insertion-ordered set.
    seen = {}

    for experiment in experiment_list:
        selection_pipe = experiment.get('feature_selection')
        if selection_pipe is None:
            continue

        # 'methods' may be absent or explicitly None; both mean "no methods".
        methods = selection_pipe.get('methods') or []
        # A bare string would otherwise be iterated character by character.
        if isinstance(methods, str):
            methods = [methods]

        for method in methods:
            seen.setdefault(method, None)

    return list(seen)

def experiments_with_feat_sel(experiment_list: list)->int:
    """Return how many experiments carry a 'feature_selection' entry that is not None."""
    return sum(
        1
        for experiment in experiment_list
        if experiment.get('feature_selection') is not None
    )

In [None]:
# One row per loaded study document; each 'ml_approaches' value is treated as
# that study's collection of experiments.
experiment_df = pd.DataFrame(studies_list)
# Per study: number of experiments, and how many of them declare feature selection.
experiment_df['num_experiments'] = experiment_df['ml_approaches'].map(len)
experiment_df['exp_with_feat_sel'] = experiment_df['ml_approaches'].map(experiments_with_feat_sel)
experiment_df.head()

In [None]:
# Aggregate the per-study counters over the whole corpus.
num_experiments = experiment_df['num_experiments'].sum()
# Deliberately not named `experiments_with_feat_sel`: that name belongs to the
# helper function applied to 'ml_approaches', and rebinding it to a number
# would break any later re-run of the cell that calls the function.
num_experiments_with_feat_sel = experiment_df['exp_with_feat_sel'].sum()

print(f"Total number of experiments: {num_experiments}")
print(f"Number of experiments with feature selection: {num_experiments_with_feat_sel}")
print(f"Number of experiments without feature selection: {num_experiments-num_experiments_with_feat_sel}")
print(f"Percentage of experiments with feature selection: {num_experiments_with_feat_sel/num_experiments*100:.2f}%")
print(f"Number of papers: {experiment_df['doi'].nunique()}")

In [None]:
def clean_technique_name(technique):
    """
    Clean technique names by extracting the base strategy.

    Details that follow a colon, an opening parenthesis or an opening bracket
    are dropped, trailing separators are stripped, and a few known spelling
    variants are mapped onto one canonical label.

    Args:
        technique: raw technique name. Values that are not strings (for
            example a NaN placeholder) are returned as ``str(technique)``.

    Returns:
        The cleaned technique name as a string.
    """
    if not isinstance(technique, str):
        return str(technique)

    # Remove leading/trailing whitespace
    cleaned = technique.strip()

    # Keep only the text before the first detail marker, e.g.
    #   "Recursive Feature Elimination: XGBoost" -> "Recursive Feature Elimination"
    #   "PCA (Principal Component Analysis)"     -> "PCA"
    #   "LASSO [L1 Regularization]"              -> "LASSO"
    for marker in (':', '(', '['):
        if marker in cleaned:
            cleaned = cleaned.split(marker)[0].strip()

    # Remove trailing separators: space, hyphen, en dash, em dash, underscore.
    # The dashes are spelled as escapes because the literal characters had been
    # corrupted into mojibake, which stripped the wrong characters.
    cleaned = cleaned.rstrip(' -\u2013\u2014_')

    # Spelling variants matched regardless of letter case.
    case_insensitive_aliases = {
        't-test': 'T-test',
        'borutta': 'Boruta',
        'extra trees': 'Extra Tree',
        'f-score': 'Fisher Score',
        'relief': 'ReliefF',
    }
    # Variants matched only with this exact spelling.
    exact_aliases = {
        'Gray Wolf Optimization': 'Grey Wolf Optimization',
        'Sequential Forward Selection': 'Sequential Forward Feature Selection',
        'Sequential Feature Selection': 'Sequential Forward Feature Selection',
        'Backward Stepwise Regression': 'Backward Stepwise Selection',
        'Sequential Backward Selection': 'Backward Stepwise Selection',
    }

    cleaned = case_insensitive_aliases.get(cleaned.lower(), cleaned)
    return exact_aliases.get(cleaned, cleaned)

In [None]:
# Per study: the de-duplicated feature-selection methods found in its experiments.
experiment_df['feat_selection'] = experiment_df['ml_approaches'].map(get_feature_selection)

# One row per (study, method) pair. A study with an empty method list is kept
# as a single row whose 'feat_selection' value is NaN.
selection_df = experiment_df.explode(column='feat_selection')
selection_df.head(n=7)

In [None]:
# Number of distinct DOIs present in the exploded (study, method) table.
selection_df['doi'].nunique()

In [None]:
# Normalize every raw method name. Rows without a method (NaN) are not strings,
# so clean_technique_name returns the literal text 'nan' for them.
selection_df['cleaned_technique'] = selection_df['feat_selection'].map(clean_technique_name)
selection_df.head(n=7)

In [None]:
# Headline counts for the exploded table.
# NOTE(review): studies without any method carry the text 'nan' in 'cleaned_technique',
# so that placeholder is counted as one "technique" here whenever such a study exists.
print(f"Total unique techniques: {selection_df['cleaned_technique'].nunique():,}")
print(f"Total of papers: {selection_df['doi'].nunique():,}")

In [None]:
# Collapse to one set of cleaned technique names per DOI, then expand again so
# that each (doi, technique) pair appears exactly once.
tech_by_paper = (
    selection_df
    .groupby(by='doi', as_index=False)
    .agg({'cleaned_technique': set})
    .explode('cleaned_technique')
)

In [None]:
# Split the papers into mutually exclusive and overlapping groups using sets.
# The literal text 'nan' is the placeholder that clean_technique_name produces
# for studies that list no feature-selection method.
# NOTE: the "*_feat_ext" / "*_feature_extraction" names are misleading - the
# values describe feature *selection*. The names are kept because later cells
# read them; only the printed messages are corrected.
mask_no_feature_extraction = (tech_by_paper['cleaned_technique'] == 'nan')
mask_feature_extraction = (tech_by_paper['cleaned_technique'] != 'nan')

dois_no_feat_ext = set(tech_by_paper[mask_no_feature_extraction]['doi'])
dois_feat_ext = set(tech_by_paper[mask_feature_extraction]['doi'])

# Papers that only have placeholder rows, only real techniques, or a mix of both.
only_no_feat_ext = dois_no_feat_ext - dois_feat_ext
only_feat_ext = dois_feat_ext - dois_no_feat_ext
both_types = dois_no_feat_ext & dois_feat_ext

print(f"Total of papers without feature selection only: {len(only_no_feat_ext):,}")
print(f"Papers with feature selection only: {len(only_feat_ext):,}")
print(f"Papers with both feature selection and no feature selection entries: {len(both_types):,}")

In [None]:
# Keep only real techniques from papers that report at least one technique.
# Filtering by DOI alone is not enough: a paper that has both a real technique
# and a 'nan' placeholder row would keep the placeholder, which would then be
# ranked as a technique and inflate that paper's technique count.
tech_by_paper = tech_by_paper[
    tech_by_paper['doi'].isin(dois_feat_ext)
    & (tech_by_paper['cleaned_technique'] != 'nan')
].reset_index(drop=True)
tech_by_paper.head()

In [None]:
# Rank the techniques by the number of (paper, technique) rows they appear in.
technique_rank = (
    tech_by_paper['cleaned_technique']
    .value_counts()
    .rename_axis('technique')
    .reset_index(name='count')
)
# Share of each technique among all counted rows, in percent with two decimals.
technique_rank['percentage'] = (
    technique_rank['count'] / technique_rank['count'].sum() * 100
).round(2)
technique_rank.head(10)

In [None]:
# Same ranking table, ordered alphabetically by technique name for manual inspection.
alpha_tech = technique_rank.sort_values(by='technique', ascending=True)
alpha_tech

In [None]:
import textwrap

def break_long_names(name, max_len=22, max_lines=3):
    """
    Wrap a name onto several lines and truncate it after ``max_lines`` lines.

    Args:
        name: text to wrap.
        max_len: maximum line width handed to ``textwrap.wrap``. Single words
            longer than this are kept whole, and hyphenated words are not split.
        max_lines: maximum number of lines kept. When the wrapped text is
            longer, the extra lines are dropped and an ellipsis is appended to
            the last kept line.

    Returns:
        The wrapped lines joined with newline characters.
    """
    wrapped = textwrap.wrap(
        name,
        width=max_len,
        break_long_words=False,
        break_on_hyphens=False
    )

    # Limit number of lines
    if len(wrapped) > max_lines:
        wrapped = wrapped[:max_lines]
        # Horizontal ellipsis written as an escape: the literal character had
        # been corrupted into mojibake by a wrong text encoding.
        wrapped[-1] += "\u2026"

    return "\n".join(wrapped)

In [None]:
from matplotlib.ticker import MaxNLocator

# Ten most frequent techniques and their percentage shares.
top10_techniques = technique_rank.head(10)
percentages = top10_techniques['percentage']

# Seaborn style, then a new figure (single-column journal size)
sns.set_style("whitegrid")
plt.figure(figsize=(9, 6))

# Horizontal bars: one per technique, bar length = count
ax = sns.barplot(
    data=top10_techniques,
    x='count',
    y='technique',
    color="#4C72B0",
    edgecolor="black",
)
# Counts are whole numbers, so only integer ticks make sense on the x axis
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Title, axis labels and tick size
ax.set_title("Most Used Feature Selection Techniques", fontsize=18, pad=12)
ax.set_xlabel("Number of Papers", fontsize=14)
ax.set_ylabel("Feature Selection Technique", fontsize=14)
ax.tick_params(axis='both', labelsize=11)

# Percentage label written inside each bar, right-aligned just before the bar end
for bar_index, (bar_length, share) in enumerate(zip(top10_techniques['count'], percentages)):
    ax.text(
        bar_length - 0.5,
        bar_index,
        f"{share:.1f}%",
        va='center',
        ha='right',
        fontsize=12,
        fontweight='bold',
        color='white',
    )

# Light vertical grid lines only
ax.grid(axis='x', color="#E5E5E5")
ax.grid(axis='y', visible=False)

# Tight layout for clean export
ax.figure.tight_layout()
ax.figure.savefig(os.path.join(PLOTS_PATH, 'feat_sel_barplot.svg'), dpi=600)


In [None]:
# One row per DOI holding the set of its techniques and the size of that set.
mult_techniques = tech_by_paper.groupby('doi').agg({'cleaned_technique': set})
mult_techniques['num_techniques'] = mult_techniques['cleaned_technique'].map(len)
mult_techniques.head()

In [None]:
# Number of papers using exactly one technique versus more than one.
num_one_technique = int((mult_techniques['num_techniques'] == 1).sum())
num_multiple_techniques = int((mult_techniques['num_techniques'] > 1).sum())

In [None]:
# Create data for the stacked bar plot (single horizontal stacked bar, no y-label)
categories = ['']  # Empty string to hide y-label
counts = [len(only_no_feat_ext), num_one_technique, num_multiple_techniques]
# The first segment counts the papers whose only technique entry is the 'nan'
# placeholder, i.e. papers that report no feature *selection* - hence the label.
labels = ['No Feature Selection', 'One Technique', 'Multiple Techniques']

# Calculate percentages (0.0 for every group when there are no papers at all)
total_papers = sum(counts)
percentages = [count/total_papers*100 if total_papers else 0.0 for count in counts]

# Create the horizontal stacked bar plot
plt.figure(figsize=(9, 2))

# Colors for each segment
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Running left edge: each segment starts where the previous one ended
lefts = 0
for count, color, label, pct in zip(counts, colors, labels, percentages):
    plt.barh(categories, count, left=lefts, color=color, edgecolor='black', linewidth=1.5, label=f'{label} ({count}, {pct:.1f}%)')
    # Add value and percentage label in the middle of each segment
    plt.text(lefts + count/2, 0, f'{count}\n({pct:.1f}%)', va='center', ha='center', fontsize=11, fontweight='bold', color='white')
    lefts += count

# Customize the plot
plt.title('Distribution of Feature Selection Usage in Papers', fontsize=16, pad=20)
plt.xlabel('Number of Papers', fontsize=14)
plt.yticks([])  # Remove y-tick label

# Add legend (placed to the right of the axes)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=11)

# Set x-axis to show integers only
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

# Add grid for better readability
plt.grid(axis='x', alpha=0.3, linestyle='--')

# Adjust layout
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH, 'feat_selection_stacked.svg'), dpi=600)

# Display summary statistics
print(f"Total papers analyzed: {total_papers}")
print(f"Papers with no feature selection: {len(only_no_feat_ext)} ({percentages[0]:.1f}%)")
print(f"Papers with one technique: {num_one_technique} ({percentages[1]:.1f}%)")
print(f"Papers with multiple techniques: {num_multiple_techniques} ({percentages[2]:.1f}%)")

In [None]:
def experiments_with_feat_ext_selection(experiment_list: list)->dict:
    """
    Classify experiments by their use of feature extraction and feature selection.

    An experiment "has" a step when its 'feature_extraction' /
    'feature_selection' entry exists and is not None.

    Args:
        experiment_list: experiment dicts to classify.

    Returns:
        Dict with the keys 'feat_ext_only', 'feat_sel_only', 'both' and
        'neither', each mapped to the number of experiments in that category.
    """
    # (has extraction, has selection) -> category name
    category_of = {
        (True, True): 'both',
        (True, False): 'feat_ext_only',
        (False, True): 'feat_sel_only',
        (False, False): 'neither',
    }
    counter = dict.fromkeys(('feat_ext_only', 'feat_sel_only', 'both', 'neither'), 0)

    for experiment in experiment_list:
        uses_extraction = experiment.get('feature_extraction') is not None
        uses_selection = experiment.get('feature_selection') is not None
        counter[category_of[uses_extraction, uses_selection]] += 1

    return counter

def experiment_meta_counter(experiment_collection)->dict:
    """
    Sum the category counters of several experiment lists.

    Args:
        experiment_collection: iterable whose items are experiment lists, each
            passed to ``experiments_with_feat_ext_selection``.

    Returns:
        Dict with the keys 'feat_ext_only', 'feat_sel_only', 'both' and
        'neither' holding the totals over all items.
    """
    categories = ('feat_ext_only', 'feat_sel_only', 'both', 'neither')
    total_counter = dict.fromkeys(categories, 0)

    for experiment_list in experiment_collection:
        partial = experiments_with_feat_ext_selection(experiment_list)
        total_counter = {name: total_counter[name] + partial[name] for name in categories}

    return total_counter

import pandas as pd

def paper_meta_counter(data_df: pd.DataFrame,doi_col: str, experiment_col: str)->pd.DataFrame:
    """
    Build one row of category counters per row of ``data_df``.

    Args:
        data_df: table holding the papers.
        doi_col: name of the column with the paper identifier.
        experiment_col: name of the column with each paper's experiment list.

    Returns:
        DataFrame with the columns 'doi', 'feat_ext_only', 'feat_sel_only',
        'both' and 'neither', in the same row order as ``data_df``. The frame
        is empty (but has these columns) when ``data_df`` has no rows.
    """
    # Collect plain records and build the frame once: growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every iteration and
    # starts from an empty object-dtype frame.
    records = []
    for doi, experiment_collection in zip(data_df[doi_col], data_df[experiment_col]):
        exp_counter = experiments_with_feat_ext_selection(experiment_collection)
        exp_counter['doi'] = doi
        records.append(exp_counter)

    return pd.DataFrame(
        records,
        columns=['doi', 'feat_ext_only', 'feat_sel_only', 'both', 'neither']
    )

In [None]:
# Spot check: category counters for the experiments of the first study only.
experiments_with_feat_ext_selection(experiment_df['ml_approaches'].iloc[0])

In [None]:
# Category totals summed over the experiment lists of all studies.
experiment_meta_counter(experiment_df['ml_approaches'].to_list())

In [None]:
# Per-row counters, then summed per DOI so that every paper ends up with one row.
summary_df = (
    paper_meta_counter(experiment_df, 'doi', 'ml_approaches')
    .groupby('doi', as_index=False)
    .sum()
)
summary_df.head()

In [None]:
# Boolean masks over the per-paper counters:
#   no experiment with extraction only / no experiment with selection only /
#   at least one experiment with both / at least one experiment with neither.
mask_no_extraction = summary_df['feat_ext_only'].eq(0)
mask_no_selection = summary_df['feat_sel_only'].eq(0)
mask_both = summary_df['both'].gt(0)
mask_neither = summary_df['neither'].gt(0)

In [None]:
# (rows, columns) of the papers having at least one experiment that uses both
# feature extraction and feature selection.
summary_df[mask_both].shape

In [None]:
# (rows, columns) of the papers whose experiments all fall in the 'neither' category:
# 'both', 'feat_ext_only' and 'feat_sel_only' are zero while 'neither' is positive.
summary_df[~mask_both & mask_no_extraction & mask_no_selection & mask_neither].shape

In [None]:
# Inspect the rows whose cleaned technique is exactly 'Hybrid Grey Wolf-Whale Optimization'.
selection_df[selection_df['cleaned_technique']=='Hybrid Grey Wolf-Whale Optimization']

In [None]:
# All distinct cleaned technique names, to spot variants that still need an alias.
# This includes the 'nan' placeholder when some study lists no method.
selection_df['cleaned_technique'].unique()