In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
# Suppress every Python warning for the rest of the session so that library
# notices do not clutter cell output.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's default style with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Make the parent of the current working directory importable (added once).
library_path = os.path.abspath('..')
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

# <parent directory>/plots; the directory itself is not created here.
PLOTS_PATH = os.path.join(library_path, 'plots')

print(
    "Libraries imported successfully!",
    f"Current working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Connect to the MongoDB server on localhost and select the studies collection
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

print("ðŸ”„ Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it.
fields_to_extract = dict.fromkeys(
    (
        "doi",
        "year",
        "study_id",
        "ml_approaches",
        "problem",
        'feature_selection',
    ),
    1,
)
fields_to_extract['_id'] = 0

# Materialise the cursor so the documents can be reused after it is exhausted.
studies_cursor = collection.find({}, fields_to_extract)
studies_list = list(studies_cursor)


print(f"ðŸ“Š Total studies loaded: {len(studies_list)}")
print(f"ðŸ“„ Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

In [None]:
def get_algorithms_and_selection(experiment_list: list) -> list:
    """
    Build the unique (algorithm, feature-selection method) pairs of one study.

    Each experiment is a dict read through its 'algorithm' key and its
    optional 'feature_selection' key, whose 'methods' entry holds the
    selection techniques. An experiment without any selection method
    (no 'feature_selection', or 'methods' missing, None or empty) contributes
    a single pair whose method is the empty string, so the algorithm is still
    counted.

    Parameters
    ----------
    experiment_list : list
        Experiment dicts of one study. A value that is not a list or tuple
        (e.g. None or NaN for a study without experiments) yields [].

    Returns
    -------
    list
        De-duplicated (algorithm, method) tuples in first-seen order.
    """
    if not isinstance(experiment_list, (list, tuple)):
        return []

    # A dict is used as an ordered set: duplicates collapse while the
    # first-seen order is kept, so the result is reproducible between runs.
    unique_pairs = {}

    for experiment in experiment_list:
        algorithm = experiment.get('algorithm')
        selection = experiment.get('feature_selection')

        methods = selection.get('methods') if selection is not None else None

        # A bare string is one method, not a sequence of characters.
        if isinstance(methods, str):
            methods = [methods]

        # No method at all is recorded as '' rather than dropping the experiment.
        if not methods:
            methods = ['']

        for method in methods:
            unique_pairs[(algorithm, method)] = None

    return list(unique_pairs)

def clean_selection_technique_name(technique):
    """
    Normalise a feature-selection technique label to a canonical base name.

    Non-string input is returned as ``str(technique)``. For strings, the text
    from the first ':', then the first '(', then the first '[' onwards is
    discarded, trailing separator characters are stripped, and known spelling
    variants are mapped onto one canonical name.
    """
    if not isinstance(technique, str):
        return str(technique)

    # Keep only the text before each detail delimiter, e.g.
    # "Recursive Feature Elimination: XGBoost", "PCA (Principal Component
    # Analysis)" or "LASSO [L1 Regularization]".
    name = technique.strip()
    for delimiter in (':', '(', '['):
        name = name.split(delimiter)[0].strip()

    # Remove trailing dashes or other separators
    name = name.rstrip(' -â€“â€”_')

    # Variants matched regardless of letter case.
    case_insensitive_aliases = {
        't-test': 'T-test',
        'borutta': 'Boruta',
        'extra trees': 'Extra Tree',
        'f-score': 'Fisher Score',
        'relief': 'ReliefF',
    }
    # Variants matched only with this exact spelling and case.
    exact_aliases = {
        'Gray Wolf Optimization': 'Grey Wolf Optimization',
        'Sequential Forward Selection': 'Sequential Forward Feature Selection',
        'Sequential Feature Selection': 'Sequential Forward Feature Selection',
        'Backward Stepwise Regression': 'Backward Stepwise Selection',
        'Sequential Backward Selection': 'Backward Stepwise Selection',
    }

    name = case_insensitive_aliases.get(name.lower(), name)
    return exact_aliases.get(name, name)

def clean_algorithm_name(technique):
    """
    Standardise an algorithm label.

    Non-string input is returned as ``str(technique)``. For strings, the value
    is whitespace-stripped, the numbered "Experiment 1/2/3" variants of
    CDIL-CNN, InceptionTime and LSTM-FCN collapse to the bare model name, and
    trailing separator characters are removed. Colon or parenthesis details
    are not touched by this function.
    """
    if not isinstance(technique, str):
        return str(technique)

    # Canonical model name -> the exact labels that should collapse onto it.
    experiment_variants = {
        "CDIL-CNN": (
            "CDIL-CNN Experiment 1",
            "CDIL-CNN Experiment 2",
            "CDIL-CNN Experiment 3",
        ),
        "InceptionTime": (
            "InceptionTime Experiment 1",
            "InceptionTime Experiment 2",
            "InceptionTime Experiment 3",
        ),
        "LSTM-FCN": (
            "LSTM-FCN Experiment 1",
            "LSTM-FCN Experiment 2",
            "LSTM-FCN Experiment 3",
        ),
    }

    # Variant matching happens before separator stripping, so a label with
    # trailing separators is not recognised as a variant.
    name = technique.strip()
    for base_name, variants in experiment_variants.items():
        if name in variants:
            name = base_name
            break

    # Remove trailing dashes or other separators
    return name.rstrip(' -â€“â€”_')

In [None]:
# One row per study document in studies_list; columns come from the document keys.
experiment_df = pd.DataFrame(studies_list)

In [None]:
# Collapse each study's experiments into unique (algorithm, selection) pairs.
# A study whose 'ml_approaches' value is not a list (e.g. a missing field,
# which pandas stores as NaN) gets an empty pair list instead of raising
# inside the helper.
experiment_df['pairs'] = experiment_df['ml_approaches'].apply(
    lambda experiments: get_algorithms_and_selection(experiments)
    if isinstance(experiments, list) else []
)

# One row per pair. explode() turns an empty list into NaN, which the
# following cells cannot index as a tuple, so rows without any pair are dropped.
algorithm_df = experiment_df.explode('pairs').dropna(subset=['pairs'])
algorithm_df

In [None]:
# Unpack each (algorithm, selection method) tuple into its own column.
algorithm_df['algorithms'] = algorithm_df['pairs'].map(lambda pair: pair[0])
algorithm_df['feat_selection'] = algorithm_df['pairs'].map(lambda pair: pair[1])
algorithm_df

In [None]:
def _base_algorithm_name(value):
    """
    Return the part of an algorithm label before its first ':' and first '('.

    The result is whitespace-stripped. Non-string values (e.g. None or NaN)
    are returned unchanged instead of raising on ``.split``.
    """
    if not isinstance(value, str):
        return value
    return value.split(':')[0].split('(')[0].strip()


# Both delimiter cuts run in one guarded pass, so a non-string algorithm no
# longer raises AttributeError in the parenthesis step.
algorithm_df['base_algorithm'] = algorithm_df['algorithms'].apply(_base_algorithm_name)
algorithm_df.head()

In [None]:
# Normalise the raw labels; the cleaning helpers are handed to apply() directly.
algorithm_df['cleaned_technique'] = (
    algorithm_df['feat_selection'].apply(clean_selection_technique_name)
)
algorithm_df['cleaned_algorithm'] = (
    algorithm_df['base_algorithm'].apply(clean_algorithm_name)
)
algorithm_df.head()

In [None]:
# Independent copy holding only the columns used by the summary below.
short_df = algorithm_df.loc[
    :, ['cleaned_algorithm', 'cleaned_technique', 'doi']
].copy()
short_df.head()

In [None]:
# Cleaned algorithm names grouped by the family label they are assigned to.
# Keys must match the cleaned names exactly (case-sensitive).
_ALGORITHMS_BY_FAMILY = {
    'Classical ML': (
        'K-Nearest Neighbors',
        'Cosine K-Nearest Neighbors',
        'Weighted K-Nearest Neighbors',
        'Adaptive K-Nearest Neighbors',
        'Hyperplane K-Nearest Neighbor',
        'Fuzzy K-Nearest Neighbors',
        'Support Vector Machine',
        'C-Support Vector Machine',
        'Nu-Support Vector Machine',
        'Linear Discriminant Analysis',
        'Quadratic Discriminant Analysis',
        'Linear Discriminant',
        'Discriminant Analysis',
        'Naive Bayes',
        'Decision Tree',
        'Fine Tree',
        'CR-tree',
        'Decision Tree Induction',
        'Logistic Regression',
        'Linear Regression',
        'Ridge',
        'LASSO',
        'Stochastic Gradient Descent',
        'Gaussian Process',
        'Parzen Window Estimator',
        'Kernel Density Estimation',
        'Least Square Linear Basis Function',
        'Least Square Support Vector Machine',
        'Robust Least Squares Support Vector Machine',
        'Adaptive Linear k-SVM',
        'Relevance Vector Machine',
        'Multiclass Relevance Vector Machine',
        'Non-Negative Least Squares',
        'Proposed Non-Negative Least Squares',
        'L1-regularized Least Squares',
        'Proposed L1-regularized Least Squares',
        'Passive-Aggressive',
        'Hidden Markov Model',
        'Gower Distance Classifier',
        'Clark Distance Classifier',
        'Chi-squared Automatic Interaction Detection',
        'Quick, Unbiased, Efficient Statistical Tree',
        'Reliability-based Regression Model',
        'Proximal Alternating Linearized Minimization',
        'Dual Augmented Lagrangian Method',
        'Homotopy',
        'Approximate Message Passing',
        'Slo',
    ),
    'Deep Learning': (
        'Multilayer Perceptron',
        'Quantized Multilayer Perceptron',
        'Neural Network',
        'Artificial Neural Network',
        # Misspelled key kept as-is; presumably it matches a spelling present
        # in the stored data -- TODO confirm.
        'Aritificial Neural Network',
        'Deep Neural Network',
        'Convolutional Neural Network',
        'Fuzzy Convolutional Neural Network',
        'Quantized Convolutional Neural Network',
        'Quantized Contempo Neural Network',
        'Recurrent Neural Network',
        'Layer Recurrent Neural Network',
        'Feed-forward Neural Network',
        'Long Short-Term Memory',
        'Bidirectional Long Short-Term Memory',
        'Graph Long Short-Term Memory',
        'Attention Long Short Term Memory',
        'Gated Recurrent Unit',
        'Bidirectional Gated Recurrent Unit',
        'Adaptive Momentum Backpropagation Neural Network',
        'Extreme Learning Machine',
        'Radial Basis Function Networks',
        'Probabilistic Neural Network',
        'Deep Belief Network',
        'Fused Neural Network',
        'ResNet',
        'DenseNet161',
        'SqueezeNet1_1',
        'Xception',
        'GoogleNet',
        'Inception V3',
        'InceptionResNetV2',
        'ZF-Net',
        'Vision Transformer',
        'Swin',
        'Audio Spectrogram Transformer',
        'TabNet',
        'Temporal Convolutional Network',
        'CDIL-CNN',
        'LSTM-FCN',
        'InceptionTime',
        'Legendre Memory Unit',
        'Generative Adversarial Network',
        'Transition Propagation Graph Neural Network',
        'Whisper-small',
        'Wav2Vec2',
        'XLSR',
        'Cross-Attention-based Fusion Model',
        'Adaptive Neuro-Fuzzy Inference System',
        'Artificial Neuro Fuzzy Inference System',
        'Neuro-Fuzzy Decision Tree',
        'Neuro-fuzzy Classifier',
        'Self-organizing Fuzzy Neural Network',
        'Hybrid Fuzzy Inference System',
        'Dynamic Evolving Neuro-fuzzy Inference System',
        'Type-2 Sugeno Fuzzy Inference System',
        'Fuzzy Cognitive Map',
        'Optimized Fuzzy Based k-Nearest Neighbor',
        'Fuzzy Convolution Bi-directional Long Short-Term Memory',
        'Softmax',
        'Sparse Broad Transfer Learning',
    ),
    'Ensemble': (
        'Random Forest',
        'Extra Trees',
        'Extremely Randomized Trees',
        'AdaBoost',
        'Gradient Boosting',
        'XGBoost',
        'LightGBM',
        'CatBoost',
        'Causal CatBoost',
        'Bagging',
        'Bagged Trees',
        'Ensemble',
        'Stacking Ensemble',
        'Stacked Ensemble',
        # NOTE(review): 'Decision Tree' is filed under 'Classical ML' while
        # this entry is filed under 'Ensemble' -- confirm this is intended.
        'Classification and Regression Trees',
        'Ensembled Subspace Discriminant',
        'Ensembled Subspace K-Nearest Neighbors',
        'gcForest',
        'Penalizing Attributes Decision Forest',
        'Systematically Developed Forest',
        'Attention Weighted Random Forest',
        'Rotation Forest',
        'Random Subspace Ensemble',
        'Homogeneous XGBoost-SVM',
        'Homogeneous XGBoost-KNN',
        'Homogeneous XGBoost-RF',
        'Error Correcting Output Code',
        'Multiple Instance Learning',
    ),
}

# Flat lookup {algorithm name: family label}. Despite its name, it covers all
# three families ('Classical ML', 'Deep Learning', 'Ensemble').
classical_ml = {
    algorithm: family
    for family, algorithms in _ALGORITHMS_BY_FAMILY.items()
    for algorithm in algorithms
}



In [None]:
# Family label of each algorithm; names missing from the lookup become 'Other'.
short_df['ml_class'] = short_df['cleaned_algorithm'].map(
    lambda name: classical_ml.get(name, 'Other')
)
# An empty technique string means no feature selection was recorded.
short_df['has_selection'] = (
    short_df['cleaned_technique'].eq('').map({True: 'No', False: 'Yes'})
)
short_df.head()

In [None]:
# Contingency table: rows = algorithm family, columns = has_selection, cells = row counts.
pd.crosstab(short_df['ml_class'], short_df['has_selection'])