In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
# Silence library warnings so the rendered notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings - consider narrowing it.
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('default')
sns.set_palette("husl")

# Put the parent directory on sys.path (once) so modules located there can be imported.
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

# Folder that the plotting cells export their figures to. It is created up front
# because plt.savefig raises FileNotFoundError when the target folder is missing.
PLOTS_PATH = os.path.join(library_path, 'plots')
os.makedirs(PLOTS_PATH, exist_ok=True)

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

# The emoji in the status messages are written as \U escapes: the previous
# literals were mis-decoded byte sequences and printed as garbage.
print("\U0001F504 Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it.
fields_to_extract = {
    "doi"             : 1,
    "title"           : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    '_id'             : 0
}
studies_cursor = collection.find({}, fields_to_extract)
# Materialise the cursor so the documents can be reused by later cells.
studies_list = list(studies_cursor)


print(f"\U0001F4CA Total studies loaded: {len(studies_list)}")
print(f"\U0001F4C4 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

In [None]:
def get_feature_extraction(experiment_list:list)->list:
    """
    Collect the distinct feature-extraction techniques used across experiments.

    Parameters
    ----------
    experiment_list : list
        Experiment records; each must offer ``.get('feature_extraction')``.
        A missing key or a ``None`` value contributes nothing.

    Returns
    -------
    list
        Unique techniques in first-seen order. The order is deterministic,
        unlike ``list(set(...))``, whose string ordering changes between
        interpreter runs.
    """
    # A dict keeps insertion order, so it works as an ordered set.
    seen = {}

    for experiment in experiment_list:
        extraction_pipe = experiment.get('feature_extraction')
        if extraction_pipe is None:
            continue
        # A bare string is treated as a single technique; iterating it would
        # otherwise register every character as its own technique.
        if isinstance(extraction_pipe, str):
            extraction_pipe = [extraction_pipe]
        for technique in extraction_pipe:
            seen.setdefault(technique, None)

    return list(seen)

def experiments_with_feat_ext(experiment_list: list)->int:
    """
    Count the experiments that carry a 'feature_extraction' entry.

    An experiment is counted when ``.get('feature_extraction')`` returns
    anything other than ``None``; an empty list therefore still counts.
    """
    return sum(
        1
        for entry in experiment_list
        if entry.get('feature_extraction') is not None
    )

In [None]:
# One row per loaded study document, plus two per-study counters derived from
# its 'ml_approaches' list: total experiments and experiments that declare a
# 'feature_extraction' entry.
experiment_df = pd.DataFrame(studies_list).assign(
    num_experiments=lambda frame: frame['ml_approaches'].apply(len),
    exp_with_feat_extract=lambda frame: frame['ml_approaches'].apply(experiments_with_feat_ext),
)
experiment_df.head()

In [None]:
# Corpus-level totals, summed over the per-study counters.
num_experiments = experiment_df['num_experiments'].sum()
num_experiments_with_feat_ext = experiment_df['exp_with_feat_extract'].sum()
num_experiments_without_feat_ext = num_experiments - num_experiments_with_feat_ext

# Guard the ratio so a corpus without experiments reports 0.00% rather than nan.
pct_with_feat_ext = (
    num_experiments_with_feat_ext / num_experiments * 100 if num_experiments else 0.0
)

print(f"Total number of experiments: {num_experiments}")
print(f"Number of experiments with feature extraction: {num_experiments_with_feat_ext}")
print(f"Number of experiments without feature extraction: {num_experiments_without_feat_ext}")
print(f"Percentage of experiments with feature extraction: {pct_with_feat_ext:.2f}%")
print(f"Number of papers: {experiment_df['doi'].nunique()}")

In [None]:
# Exact-match spelling variants -> canonical technique label.
_TECHNIQUE_ALIASES = {
    # Short-Time Fourier Transform
    "Short Term Fourier Transform": "Short-Time Fourier Transform",
    "Short Time Fourier Transform": "Short-Time Fourier Transform",
    "Short-Term Fourier Transform": "Short-Time Fourier Transform",
    # Wave2Vec
    "Wave2Vec2": "Wave2Vec",
    "Wav2Vec": "Wave2Vec",
    "Wave2Vec2.0": "Wave2Vec",
    "Wav2Vec2": "Wave2Vec",
    # Capitalisation fixes
    "Articulation features": "Articulation Features",
    "Phonation features": "Phonation Features",
    # Spectrogram harmonization
    "Spectograms": "Spectrogram",
    "Spectrogram-color": "Spectrogram",
    "Spectrogram-grayscale": "Spectrogram",
    "Gray Scale Spectrogram": "Spectrogram",
    "Color Spectrogram": "Spectrogram",
    # Mel-Spectrogram harmonization
    "Mel-Spectogram": "Mel-Spectrogram",
    "Mel Spectrogram": "Mel-Spectrogram",
    "Mel Spectrograms": "Mel-Spectrogram",
    # MFCC harmonization
    "Mel Frequency Cepstral Coefficients": "Mel-Frequency Cepstral Coefficients",
    "Mel-Frequency Cepstral Coefficients-mean": "Mel-Frequency Cepstral Coefficients",
    "Mel-Frequency Cepstral Coefficients-variance": "Mel-Frequency Cepstral Coefficients",
    "Mel-Frequency Cepstrum Coefficients": "Mel-Frequency Cepstral Coefficients",
}


def clean_technique_name(technique):
    """
    Clean technique names by extracting the base strategy.

    Steps, in order:
      1. Map known spelling variants onto one canonical label (exact match).
      2. Collapse any name mentioning wav2vec / resnet (case-insensitive) or
         "First Three Formants" onto the corresponding family label.
      3. Drop details that follow a colon, an opening parenthesis or an
         opening bracket.
      4. Strip trailing separators: space, hyphen, en dash, em dash, underscore.

    Parameters
    ----------
    technique : Any
        Raw technique label. Non-string values are returned as
        ``str(technique)`` without further cleaning, so a float NaN becomes
        the string ``'nan'``.

    Returns
    -------
    str
        The cleaned technique name.
    """
    if not isinstance(technique, str):
        return str(technique)

    # Remove leading/trailing whitespace
    cleaned = technique.strip()

    # Handle specific name standardizations first
    cleaned = _TECHNIQUE_ALIASES.get(cleaned, cleaned)

    # Family rules are applied one after another on the current value, so an
    # earlier match (e.g. wav2vec) wins over a later one (e.g. resnet).
    if "wav2vec" in cleaned.lower():
        cleaned = "Wave2Vec"
    if "resnet" in cleaned.lower():
        cleaned = "ResNet"
    if "First Three Formants" in cleaned:
        cleaned = "First Three Formants"

    # Keep only the text before the first ':', '(' or '[' - e.g.
    # "Wavelet Transform: Daubechies" or "FFT [Fast Fourier Transform]".
    # When the delimiter is absent, split() returns the whole string unchanged.
    for delimiter in (':', '(', '['):
        cleaned = cleaned.split(delimiter)[0].strip()

    # Remove trailing separators. The en dash (U+2013) and em dash (U+2014) are
    # written as escapes: the previous literal was mis-encoded, so real dashes
    # were never stripped while unrelated characters were.
    return cleaned.rstrip(' -\u2013\u2014_')

In [None]:
# Distinct techniques per study, gathered across all of its experiments.
experiment_df['feat_extraction'] = experiment_df['ml_approaches'].apply(get_feature_extraction)

# Long format: one row per (study, technique) pair.
extraction_df = experiment_df.explode('feat_extraction')
extraction_df.head(7)

In [None]:
# Normalise every raw technique label; non-string entries come back as their str() form.
extraction_df['cleaned_technique'] = extraction_df['feat_extraction'].map(clean_technique_name)
extraction_df.head(7)

In [None]:
# 'nan' is the placeholder string clean_technique_name() returns for a missing
# technique; exclude it so it is not counted as a technique of its own.
has_technique = extraction_df['cleaned_technique'] != 'nan'
num_unique_techniques = extraction_df.loc[has_technique, 'cleaned_technique'].nunique()

print(f"Total unique techniques: {num_unique_techniques:,}")
print(f"Total of papers: {extraction_df['doi'].nunique():,}")

In [None]:
# Collapse to the set of cleaned techniques per DOI, then expand the sets again
# so each (DOI, technique) pair appears once.
tech_by_paper = (
    extraction_df
    .groupby(by='doi', as_index=False)
    .agg({'cleaned_technique': set})
    .explode('cleaned_technique')
)

In [None]:
# Partition the DOIs with set algebra. Rows whose cleaned technique is the
# string 'nan' mark "no feature extraction"; every other row is a real technique.
mask_no_feature_extraction = tech_by_paper['cleaned_technique'].eq('nan')
mask_feature_extraction = ~mask_no_feature_extraction

dois_no_feat_ext = set(tech_by_paper.loc[mask_no_feature_extraction, 'doi'])
dois_feat_ext = set(tech_by_paper.loc[mask_feature_extraction, 'doi'])

# Mutually exclusive groups plus their overlap.
only_no_feat_ext = dois_no_feat_ext.difference(dois_feat_ext)
only_feat_ext = dois_feat_ext.difference(dois_no_feat_ext)
both_types = dois_no_feat_ext.intersection(dois_feat_ext)

print(f"Total of papers without feature extraction only: {len(only_no_feat_ext):,}")
print(f"Papers with feature extraction only: {len(only_feat_ext):,}")
print(f"Papers with both feature extraction and no feature extraction entries: {len(both_types):,}")

In [None]:
# Keep only real technique rows of papers that report feature extraction.
# Filtering on the DOI alone is not enough: a paper listed under both groups
# would keep its 'nan' placeholder row, which would then be ranked and counted
# as if it were a technique.
has_technique = tech_by_paper['cleaned_technique'] != 'nan'
in_feat_ext_papers = tech_by_paper['doi'].isin(dois_feat_ext)
tech_by_paper = tech_by_paper[in_feat_ext_papers & has_technique].reset_index(drop=True)
tech_by_paper.head()

In [None]:
# Rank techniques by the number of (paper, technique) rows they appear in.
technique_rank = (
    tech_by_paper['cleaned_technique']
    .value_counts()
    .rename_axis('technique')
    .reset_index(name='count')
)

# Share of each technique relative to the sum of all counts, in percent (2 decimals).
share = technique_rank['count'] / technique_rank['count'].sum() * 100
technique_rank['percentage'] = share.round(2)
technique_rank.head(10)

In [None]:
import textwrap

def break_long_names(name, max_len=22, max_lines=3):
    """
    Wrap a label onto several lines for use as a plot tick label.

    Parameters
    ----------
    name : str
        Text to wrap.
    max_len : int
        Target line width. Words are never split (not even at hyphens), so a
        single word longer than ``max_len`` stays on one over-long line.
    max_lines : int
        Maximum number of lines kept. When the wrapped text is longer, the
        surplus lines are dropped and an ellipsis is appended to the last
        kept line.

    Returns
    -------
    str
        The wrapped lines joined with newline characters.
    """
    wrapped = textwrap.wrap(
        name,
        width=max_len,
        break_long_words=False,
        break_on_hyphens=False
    )

    # Limit number of lines
    if len(wrapped) > max_lines:
        wrapped = wrapped[:max_lines]
        # U+2026 HORIZONTAL ELLIPSIS, written as an escape because the previous
        # literal was a mis-encoded byte sequence.
        wrapped[-1] += "\u2026"

    return "\n".join(wrapped)

In [None]:
from matplotlib.ticker import MaxNLocator

top10_techniques = technique_rank.head(10)

# Seaborn style
sns.set_style("whitegrid")

# Figure size (single-column journal size)
fig, ax = plt.subplots(figsize=(9, 6))

# Percentages shown as labels inside the bars
percentages = top10_techniques['percentage']

# Horizontal bars: one per technique, length = count
sns.barplot(
    x=top10_techniques['count'],
    y=top10_techniques['technique'],
    color="#4C72B0",
    edgecolor="black",
    ax=ax,
)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Labels and title
ax.set_title("Most Used Feature Extraction Techniques", fontsize=18, pad=12)
ax.set_xlabel("Number of Papers", fontsize=14)
ax.set_ylabel("Feature Extraction Technique", fontsize=14)

# Ticks
ax.tick_params(axis='both', labelsize=11)

# Percentage label right-aligned just inside the end of each bar
for row_pos, (n_papers, pct) in enumerate(zip(top10_techniques['count'], percentages)):
    ax.text(
        n_papers - 0.5, row_pos, f"{pct:.1f}%",
        va='center', ha='right', fontsize=12, fontweight='bold', color='white',
    )

# Only vertical grid lines, in a light grey
ax.grid(axis='x', color="#E5E5E5")
ax.grid(axis='y', visible=False)

# Tight layout for clean export
fig.tight_layout()
fig.savefig(os.path.join(PLOTS_PATH, 'feat_ext_barplot.svg'), dpi=600)


In [None]:
# Set of techniques per DOI together with the size of that set.
mult_techniques = (
    tech_by_paper
    .groupby('doi')
    .agg({'cleaned_technique': set})
    .assign(num_techniques=lambda frame: frame['cleaned_technique'].apply(len))
)
mult_techniques.head()

In [None]:
# Number of papers using exactly one technique vs. more than one.
techniques_per_paper = mult_techniques['num_techniques']
num_one_technique = len(mult_techniques.loc[techniques_per_paper == 1])
num_multiple_techniques = len(mult_techniques.loc[techniques_per_paper > 1])

In [None]:
# Create data for the stacked bar plot (single horizontal stacked bar, no y-label)
categories = ['']  # Empty string to hide y-label
counts = [len(only_no_feat_ext), num_one_technique, num_multiple_techniques]
labels = ['No Feature Extraction', 'One Technique', 'Multiple Techniques']

# Calculate percentages (all 0.0 when there are no papers, avoiding ZeroDivisionError)
total_papers = sum(counts)
percentages = [count / total_papers * 100 if total_papers else 0.0 for count in counts]

# Create the horizontal stacked bar plot
plt.figure(figsize=(9, 2))

# Colors for each segment
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Running left edge: each segment starts where the previous one ended.
lefts = 0
for count, color, label, pct in zip(counts, colors, labels, percentages):
    plt.barh(categories, count, left=lefts, color=color, edgecolor='black', linewidth=1.5, label=f'{label} ({count}, {pct:.1f}%)')
    # Add value and percentage label in the middle of each segment; an empty
    # segment has no area, so its label would only overlap its neighbours.
    if count > 0:
        plt.text(lefts + count/2, 0, f'{count}\n({pct:.1f}%)', va='center', ha='center', fontsize=11, fontweight='bold', color='white')
    lefts += count

# Customize the plot
plt.title('Distribution of Feature Extraction Usage in Papers', fontsize=16, pad=20)
plt.xlabel('Number of Papers', fontsize=14)
plt.yticks([])  # Remove y-tick label

# Add legend
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=11)

# Set x-axis to show integers only
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

# Add grid for better readability
plt.grid(axis='x', alpha=0.3, linestyle='--')

# Adjust layout
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH, 'feat_ext_stacked.svg'), dpi=600)

# Display summary statistics
print(f"Total papers analyzed: {total_papers}")
print(f"Papers with no feature extraction: {len(only_no_feat_ext)} ({percentages[0]:.1f}%)")
print(f"Papers with one technique: {num_one_technique} ({percentages[1]:.1f}%)")
print(f"Papers with multiple techniques: {num_multiple_techniques} ({percentages[2]:.1f}%)")

In [None]:
# Temporal analysis: attach a publication year to every (paper, technique) row
# and keep only the five most frequent techniques.

top5_techniques = top10_techniques.head(5)

# First 'year' value recorded for each DOI.
year_by_doi = experiment_df.groupby(by='doi', as_index=False).agg({'year': 'first'})

df_temporal = tech_by_paper.merge(year_by_doi, on='doi', how='left')
is_top5 = df_temporal['cleaned_technique'].isin(top5_techniques['technique'])
df_temporal = df_temporal.loc[is_top5].reset_index(drop=True)
df_temporal

In [None]:
# Rows per (year, technique) combination
temporal_counts = (
    df_temporal
    .groupby(['year', 'cleaned_technique'])
    .size()
    .reset_index(name='count')
)

# Wide layout for plotting: years as rows, techniques as columns, 0 where a
# technique has no row for that year
temporal_pivot = (
    temporal_counts
    .pivot(index='year', columns='cleaned_technique', values='count')
    .fillna(0)
)

# Order the columns from most to least used overall
col_order = temporal_pivot.sum().sort_values(ascending=False).index
temporal_pivot = temporal_pivot.loc[:, col_order]

temporal_pivot

In [None]:
# Create line plot showing temporal trends
fig, ax = plt.subplots(figsize=(12, 7))

# One line per technique (columns of the pivot), years on the x-axis
for technique in temporal_pivot.columns:
    ax.plot(temporal_pivot.index, temporal_pivot[technique], marker='o', linewidth=2, label=technique)

# Customize the plot. The title says "Top 5" because df_temporal is filtered to
# the five most frequent techniques before the pivot is built.
ax.set_title('Temporal Trends of Top 5 Feature Extraction Techniques', fontsize=16, pad=20)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Number of Papers', fontsize=14)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3, linestyle='--')

# Set integer ticks for both axes
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
fig.savefig(os.path.join(PLOTS_PATH, 'feat_ext_temporal_trends.svg'), dpi=600)

In [None]:
# Alternative: Stacked bar plot for temporal trends
fig, ax = plt.subplots(figsize=(12, 7))

# Running top of the stack for each year; every technique is drawn on top of
# the ones plotted before it.
bottoms = np.zeros(len(temporal_pivot.index))
for technique in temporal_pivot.columns:
    heights = temporal_pivot[technique]
    ax.bar(temporal_pivot.index, heights, bottom=bottoms, label=technique)
    bottoms = bottoms + heights.values

# Customize the plot
ax.set_title('Temporal Trends of Top 5 Feature Extraction Techniques (Stacked Bar)', fontsize=16, pad=20)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Number of Papers', fontsize=14)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3, linestyle='--', axis='y')

# Set integer ticks
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.yaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
fig.savefig(os.path.join(PLOTS_PATH, 'feat_ext_temporal_stacked_bar.svg'), dpi=600)

In [None]:
# Connect to MongoDB and reload the studies, this time including 'source_dataset'.
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

# The emoji in the status messages are written as \U escapes: the previous
# literals were mis-decoded byte sequences and printed as garbage.
print("\U0001F504 Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it.
fields_to_extract = {
    "doi"             : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    "source_dataset"   : 1,
    '_id'             : 0
}
studies_cursor = collection.find({}, fields_to_extract)
# Materialise the cursor so the documents can be reused by later cells.
studies_list = list(studies_cursor)


print(f"\U0001F4CA Total studies loaded: {len(studies_list)}")
print(f"\U0001F4C4 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

In [None]:
# Rebuild the per-study frame from the reloaded documents and derive, from
# 'ml_approaches', the number of experiments with a 'feature_extraction' entry
# and the list of distinct techniques.
experiment_df = pd.DataFrame(studies_list).assign(
    exp_with_feat_extract=lambda frame: frame['ml_approaches'].apply(experiments_with_feat_ext),
    feat_extraction=lambda frame: frame['ml_approaches'].apply(get_feature_extraction),
)
experiment_df.head()

In [None]:
def _first_dataset_name(source_dataset):
    """
    Return the 'name' of the first entry of a study's 'source_dataset' list.

    A string is passed through unchanged so that re-running this cell (when the
    column already holds names) is harmless. An empty list or any other value
    yields None instead of raising.
    """
    if isinstance(source_dataset, str):
        return source_dataset
    if isinstance(source_dataset, list) and source_dataset:
        return source_dataset[0].get('name')
    return None


# Keep only studies with at least one experiment that declares feature extraction,
# then reduce 'source_dataset' to the name of its first entry.
experiment_df = experiment_df[experiment_df['exp_with_feat_extract'] > 0].reset_index(drop=True)
experiment_df['source_dataset'] = experiment_df['source_dataset'].apply(_first_dataset_name)
experiment_df.head()

In [None]:
# One row per (study, technique) pair.
experiment_df = experiment_df.explode(column='feat_extraction')

In [None]:
# Independent copy holding just the columns needed for the dataset lookup below.
short = experiment_df.loc[:, ['doi', 'source_dataset', 'feat_extraction']].copy()

In [None]:
# Rows whose source dataset is the Oxford Parkinson's Disease Detection Dataset.
short.loc[short['source_dataset'].eq("Oxford Parkinson's Disease Detection Dataset")].reset_index(drop=True)


In [None]:
# Papers (with year) whose cleaned technique is Wave2Vec.
df_temporal.loc[df_temporal['cleaned_technique'].eq('Wave2Vec')]