In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
# Silence all warnings so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('default')
sns.set_palette("husl")

# Make the parent directory importable. The membership check keeps the cell
# idempotent: re-running it does not append duplicate sys.path entries.
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

# Figures are saved under <parent directory>/plots. Create the folder up front
# so saving a figure does not fail when the directory does not exist yet.
PLOTS_PATH = os.path.join(library_path, 'plots')
os.makedirs(PLOTS_PATH, exist_ok=True)

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

# The status emoji are written as unicode escapes so they cannot be garbled
# again if the notebook is saved or converted with the wrong text encoding.
print("\U0001F504 Loading studies from MongoDB...")  # U+1F504: arrows
# Projection: fetch only the fields this analysis uses.
fields_to_extract = {
    "doi"             : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    '_id'             : 0
}  # 1 = include, 0 = exclude
studies_cursor = collection.find({}, fields_to_extract)
# Materialize the cursor so the documents can be reused by later cells.
studies_list = list(studies_cursor)


print(f"\U0001F4CA Total studies loaded: {len(studies_list)}")  # U+1F4CA: bar chart
print(f"\U0001F4C4 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")  # U+1F4C4: page

In [None]:
def get_scaling(experiment_list:list)->list:
    """Collect the distinct scaling techniques reported across experiments.

    Args:
        experiment_list: Experiment dicts; each may carry a 'scaling' entry
            (missing or ``None`` means no technique reported). ``None`` is
            accepted and treated as an empty list.

    Returns:
        The unique techniques in first-seen order; an empty list when no
        experiment reports any.
    """
    if experiment_list is None:
        return []

    techniques = []

    for experiment in experiment_list:
        scaling = experiment.get('scaling')
        if scaling is None:
            continue
        if isinstance(scaling, str):
            # A bare string would otherwise be extended character by character.
            # NOTE(review): entries are presumably lists of names - confirm schema.
            scaling = [scaling]
        techniques.extend(scaling)

    # dict.fromkeys de-duplicates while preserving insertion order, so the
    # result is reproducible; list(set(...)) ordering varies between kernels
    # because string hashing is randomized per process.
    return list(dict.fromkeys(techniques))

def experiments_with_scaling(experiment_list: list)->int:
    """Count the experiments that report at least one scaling technique.

    An experiment counts only when its 'scaling' entry is present and
    non-empty. An empty list contributes no technique to ``get_scaling``,
    so it is treated as "no scaling" here too, keeping the experiment-level
    totals consistent with the technique-level analysis.

    Args:
        experiment_list: Experiment dicts; ``None`` is treated as empty.

    Returns:
        Number of experiments with a non-empty 'scaling' entry.
    """
    if experiment_list is None:
        return 0

    return sum(1 for experiment in experiment_list if experiment.get('scaling'))

In [None]:
# One row per study document loaded from MongoDB.
experiment_df = pd.DataFrame(studies_list)

# Per-study totals: all experiments, and those that report scaling.
approaches = experiment_df['ml_approaches']
experiment_df['num_experiments'] = approaches.apply(len)
experiment_df['exp_with_scaling'] = approaches.apply(experiments_with_scaling)
experiment_df.head()

In [None]:
# Experiment-level totals across all studies.
# The "with scaling" total gets its own name (num_with_scaling) so it does not
# overwrite the experiments_with_scaling() helper; rebinding that name to a
# number makes the cell that builds experiment_df fail when it is re-run.
num_experiments = int(experiment_df['num_experiments'].sum())
num_with_scaling = int(experiment_df['exp_with_scaling'].sum())
num_without_scaling = num_experiments - num_with_scaling

# Guard the ratio so an empty collection reports 0% instead of dividing by zero.
pct_with_scaling = num_with_scaling / num_experiments * 100 if num_experiments else 0.0

print(f"Total number of experiments: {num_experiments}")
print(f"Number of experiments with scaling: {num_with_scaling}")
print(f"Number of experiments without scaling: {num_without_scaling}")
print(f"Percentage of experiments with scaling: {pct_with_scaling:.2f}%")
print(f"Number of papers: {experiment_df['doi'].nunique()}")

In [None]:
# Distinct scaling techniques reported by each study (one list per row).
experiment_df['scaling'] = experiment_df['ml_approaches'].apply(get_scaling)

# Long format: one row per (study, technique). Studies whose list is empty
# are kept as a single row with a missing technique.
scaling_df = experiment_df.explode(column='scaling')
scaling_df.head(7)

In [None]:
# nunique() ignores missing values, so studies without scaling add no technique.
n_unique_techniques = scaling_df['scaling'].nunique()
n_papers = scaling_df['doi'].nunique()

print(f"Total unique techniques: {n_unique_techniques:,}")
print(f"Total of papers: {n_papers:,}")

In [None]:
# One row per distinct (paper, technique) pair. Papers with studies that report
# no technique keep a row with a missing 'scaling' value, which the following
# cells use to separate papers with and without scaling.
#
# Built with drop_duplicates rather than agg(set) + explode: exploding sets
# yields a non-deterministic row order, which would make tie-breaking in the
# later technique ranking vary between kernel sessions.
tech_by_paper = (
    scaling_df[['doi', 'scaling']]
    .dropna(subset=['doi'])             # groupby('doi') also skipped missing DOIs
    .drop_duplicates()                  # missing values compare equal here
    .sort_values('doi', kind='stable')  # papers sorted; first-seen order within a paper
    .reset_index(drop=True)
)

In [None]:
# Split the papers into three disjoint groups with set algebra: papers that only
# have rows without a technique, papers that only have rows with a technique,
# and papers that have both kinds of rows.
mask_no_scaling = tech_by_paper['scaling'].isna()
mask_scaling = ~mask_no_scaling

# NOTE(review): the "feat_ext" suffix looks like a leftover from another
# analysis; these sets hold the DOIs without / with a scaling technique.
dois_no_feat_ext = set(tech_by_paper.loc[mask_no_scaling, 'doi'])
dois_feat_ext = set(tech_by_paper.loc[mask_scaling, 'doi'])

only_no_scaling = dois_no_feat_ext.difference(dois_feat_ext)
only_scaling = dois_feat_ext.difference(dois_no_feat_ext)
both_types = dois_no_feat_ext.intersection(dois_feat_ext)

print(f"Total of papers without scaling only: {len(only_no_scaling):,}")
print(f"Papers with scaling only: {len(only_scaling):,}")
print(f"Papers with both scaling and no scaling entries: {len(both_types):,}")

In [None]:
# Keep only the papers whose rows all carry a scaling technique; papers in the
# "no scaling" and "both" groups are left out of the ranking below.
is_scaling_only = tech_by_paper['doi'].isin(only_scaling)
tech_by_paper = tech_by_paper.loc[is_scaling_only].reset_index(drop=True)
tech_by_paper.head()

In [None]:
# Rank techniques by the number of rows (paper/technique pairs) they appear in,
# most frequent first.
technique_rank = (
    tech_by_paper['scaling']
    .value_counts()
    .rename_axis('technique')
    .reset_index(name='count')
)

# Share of each technique relative to the sum of all counts, in percent.
count_share = technique_rank['count'] / technique_rank['count'].sum()
technique_rank['percentage'] = (count_share * 100).round(2)
technique_rank.head(10)

In [None]:
import textwrap

def break_long_names(name, max_len=22, max_lines=3):
    """Wrap a name onto several lines, truncating it when it needs too many.

    Lines are broken only at whitespace: long words and hyphenated words are
    kept whole, so a single line may exceed ``max_len``.

    Args:
        name: Text to wrap.
        max_len: Target maximum width of each line, in characters.
        max_lines: Maximum number of lines to keep (at least 1).

    Returns:
        The wrapped text joined with newlines. When the text needs more than
        ``max_lines`` lines, the extra lines are dropped and an ellipsis is
        appended to the last kept line.

    Raises:
        ValueError: If ``max_lines`` is smaller than 1.
    """
    if max_lines < 1:
        # Without this guard, max_lines=0 fails later with an obscure IndexError.
        raise ValueError("max_lines must be at least 1")

    wrapped = textwrap.wrap(
        name,
        width=max_len,
        break_long_words=False,
        break_on_hyphens=False
    )

    # Limit number of lines
    if len(wrapped) > max_lines:
        wrapped = wrapped[:max_lines]
        # U+2026 HORIZONTAL ELLIPSIS, written as an escape so the character
        # cannot be corrupted by a wrong file encoding.
        wrapped[-1] += "\u2026"

    return "\n".join(wrapped)

In [None]:
from matplotlib.ticker import MaxNLocator

# Only the three most frequent scaling techniques are plotted.
top3_techniques = technique_rank.head(3)

# Seaborn style
sns.set_style("whitegrid")

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(9, 6))

# Percentages shown inside the bars
percentages = top3_techniques['percentage']

# Horizontal bar plot: counts on x, technique names on y
sns.barplot(
    x=top3_techniques['count'],
    y=top3_techniques['technique'],
    color="#4C72B0",
    edgecolor="black",
    ax=ax
)
# Counts are whole numbers, so only integer ticks make sense on the x axis.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Labels and title. These name the scaling techniques that are actually
# plotted (they previously read "Feature Extraction").
ax.set_title("Most Used Scaling Techniques", fontsize=18, pad=12)
ax.set_xlabel("Number of Papers", fontsize=14)
ax.set_ylabel("Scaling Technique", fontsize=14)

# Ticks
ax.tick_params(axis='both', labelsize=11)

# Percentage labels, right-aligned just inside the end of each bar
for i, (v, pct) in enumerate(zip(top3_techniques['count'], percentages)):
    ax.text(v - 0.4, i, f"{pct:.1f}%", va='center', ha='right', fontsize=11, fontweight='bold', color='white')

# Improve grid appearance: light vertical guides only
ax.grid(axis='x', color="#E5E5E5")
ax.grid(axis='y', visible=False)

# Tight layout for clean export
fig.tight_layout()
fig.savefig(os.path.join(PLOTS_PATH, 'scaling_barplot.svg'), dpi=600)
