In [1]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
# Keep library warnings out of the rendered cell outputs
warnings.filterwarnings('ignore')

# Expose the parent directory to the import system; the membership check
# keeps sys.path free of duplicates when this cell is executed again
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

# 'plots' folder located directly under the parent directory
PLOTS_PATH = os.path.join(library_path, 'plots')

# Figure appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

Libraries imported successfully!
Current working directory: /home/luis/CGE/voice_db/analysis_notebooks


In [2]:
# Connect to the MongoDB server on localhost and select the studies collection
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

print("🔄 Loading studies from MongoDB...")
# Projection passed to find(): 1 = include the field, 0 = exclude it
fields_to_extract = {
    "doi"             : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    '_id'             : 0
}
# Empty filter -> every document in the collection, restricted to the projected fields
studies_cursor = collection.find({}, fields_to_extract)
# Materialise the cursor so the documents can be reused by later cells
studies_list = list(studies_cursor)


print(f"📊 Total studies loaded: {len(studies_list)}")
# Guard against an empty collection before peeking at the first document
print(f"📄 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

🔄 Loading studies from MongoDB...
📊 Total studies loaded: 381
📄 Sample document keys: ['year', 'doi', 'ml_approaches', 'study_id']


In [4]:
def get_dimensionality_reduction(experiment_list:list)->list:
    """Collect the distinct dimensionality-reduction techniques of one study.

    Parameters
    ----------
    experiment_list : list
        Experiment dicts. Each may carry a ``'dimensionality_reduction'``
        entry holding a list of technique names, a single name as a string,
        or ``None`` / no entry at all. ``None`` instead of a list is treated
        as "no experiments".

    Returns
    -------
    list
        Unique technique names in order of first appearance (deterministic
        across kernel restarts); empty when no experiment reports a technique.
    """
    # A dict serves as an insertion-ordered set, so the result no longer
    # depends on string hash randomisation the way list(set(...)) does.
    techniques = {}

    for experiment in (experiment_list or []):
        reduction = experiment.get('dimensionality_reduction')
        if not reduction:
            # None, missing key, empty list or empty string: nothing to record
            continue
        if isinstance(reduction, str):
            # A bare string would otherwise be split into single characters
            reduction = [reduction]
        techniques.update(dict.fromkeys(reduction))

    return list(techniques)

def experiments_with_dim_reduc(experiment_list: list)->int:
    """Count the experiments that report at least one dimensionality-reduction technique.

    Parameters
    ----------
    experiment_list : list
        Experiment dicts; the ``'dimensionality_reduction'`` entry of each is
        inspected. ``None`` instead of a list is treated as "no experiments".

    Returns
    -------
    int
        Number of experiments whose ``'dimensionality_reduction'`` value is
        non-empty. A missing key, ``None``, an empty list and an empty string
        all count as "no reduction", so this count agrees with what
        ``get_dimensionality_reduction`` collects from the same experiments.
    """
    return sum(
        1
        for experiment in (experiment_list or [])
        # Truthiness check: an empty container must not be counted as a technique
        if experiment.get('dimensionality_reduction')
    )

In [5]:
# One row per loaded study document, plus two derived per-study counters:
#   num_experiments    - length of the study's ml_approaches list
#   exp_with_dim_reduc - result of experiments_with_dim_reduc for that list
experiment_df = pd.DataFrame(studies_list).assign(
    num_experiments=lambda frame: frame['ml_approaches'].apply(len),
    exp_with_dim_reduc=lambda frame: frame['ml_approaches'].apply(experiments_with_dim_reduc),
)
experiment_df.head()

Unnamed: 0,year,doi,ml_approaches,study_id,num_experiments,exp_with_dim_reduc
0,2020,10.1109/ACCESS.2020.2974008,[{'algorithm': 'Convolutional Neural Network: ...,10.1109/ACCESS.2020.2974008_1,9,0
1,2020,10.2196/18689,"[{'algorithm': 'Support Vector Machine', 'fram...",10.2196/18689_1,4,0
2,2020,10.3233/JIFS-179714,[{'algorithm': 'Support Vector Machine: Linear...,10.3233/JIFS-179714_1,2,0
3,2020,10.1016/j.bbe.2019.05.005,"[{'algorithm': 'Support Vector Machine', 'fram...",10.1016/j.bbe.2019.05.005_2,4,0
4,2020,10.15439/2020F188,"[{'algorithm': 'Fine Tree', 'framework': 'MATL...",10.15439/2020F188_1,48,0


In [6]:
# Corpus-wide totals, summed over the per-study counters
num_experiments = experiment_df['num_experiments'].sum()
num_experiments_with_dim_reduc = experiment_df['exp_with_dim_reduc'].sum()

print(f"Total number of experiments: {num_experiments}")
print(f"Number of experiments with dimensionality reduction: {num_experiments_with_dim_reduc}")
print(f"Number of experiments without dimensionality reduction: {num_experiments-num_experiments_with_dim_reduc}")
print(f"Percentage of experiments with dimensionality reduction: {num_experiments_with_dim_reduc/num_experiments*100:.2f}%")
# Papers are identified by DOI; the number of rows can be larger than this count
print(f"Number of papers: {experiment_df['doi'].nunique()}")

Total number of experiments: 2431
Number of experiments with dimensionality reduction: 200
Number of experiments without dimensionality reduction: 2231
Percentage of experiments with dimensionality reduction: 8.23%
Number of papers: 260


In [8]:
# New column on experiment_df: the list returned by get_dimensionality_reduction
# for each row's ml_approaches
experiment_df['dimensionality_reduction'] = (
    experiment_df['ml_approaches'].apply(get_dimensionality_reduction)
)

# Long format: one row per list element; a row whose list is empty is kept
# as a single row with NaN in the exploded column
dim_reduc_df = experiment_df.explode('dimensionality_reduction')
dim_reduc_df.head(7)

Unnamed: 0,year,doi,ml_approaches,study_id,num_experiments,exp_with_dim_reduc,dimensionality_reduction
0,2020,10.1109/ACCESS.2020.2974008,[{'algorithm': 'Convolutional Neural Network: ...,10.1109/ACCESS.2020.2974008_1,9,0,
1,2020,10.2196/18689,"[{'algorithm': 'Support Vector Machine', 'fram...",10.2196/18689_1,4,0,
2,2020,10.3233/JIFS-179714,[{'algorithm': 'Support Vector Machine: Linear...,10.3233/JIFS-179714_1,2,0,
3,2020,10.1016/j.bbe.2019.05.005,"[{'algorithm': 'Support Vector Machine', 'fram...",10.1016/j.bbe.2019.05.005_2,4,0,
4,2020,10.15439/2020F188,"[{'algorithm': 'Fine Tree', 'framework': 'MATL...",10.15439/2020F188_1,48,0,
5,2020,10.1016/j.mehy.2020.109603,"[{'algorithm': 'Support Vector Machine', 'fram...",10.1016/j.mehy.2020.109603_1,3,0,
6,2020,10.1016/j.neucom.2020.03.058,[{'algorithm': 'Gaussian Process: ARD Exponent...,10.1016/j.neucom.2020.03.058_1,4,0,


In [9]:
# Occurrences of each technique over the exploded rows (value_counts skips NaN by default)
dim_reduc_df['dimensionality_reduction'].value_counts()

dimensionality_reduction
PCA                             19
Kernel PCA                       2
Linear Discriminant Analysis     1
Singular Value Decomposition     1
Name: count, dtype: int64

In [19]:
# Number of distinct DOIs among the rows that name a technique
dim_reduc_df.dropna(subset=['dimensionality_reduction'])['doi'].nunique()

21

In [18]:
# Technique frequency per DOI: collapsing each DOI's techniques into a set
# means a technique is counted at most once per DOI
(
    dim_reduc_df
    .dropna(subset=['dimensionality_reduction'])
    .groupby('doi')
    .agg({'dimensionality_reduction': set})
    .explode('dimensionality_reduction')
    ['dimensionality_reduction']
    .value_counts()
)

dimensionality_reduction
PCA                             17
Kernel PCA                       2
Singular Value Decomposition     1
Linear Discriminant Analysis     1
Name: count, dtype: int64

In [20]:
# Only the exploded rows that actually name a technique
dim_reduc_df.dropna(subset=['dimensionality_reduction'])

Unnamed: 0,year,doi,ml_approaches,study_id,num_experiments,exp_with_dim_reduc,dimensionality_reduction
22,2020,10.1109/ICDABI51230.2020.9325709,[{'algorithm': 'Adaptive Momentum Backpropagat...,10.1109/ICDABI51230.2020.9325709_1,2,1,PCA
53,2021,10.1109/ICMSS53060.2021.9673634,"[{'algorithm': 'Random Forest', 'framework': '...",10.1109/ICMSS53060.2021.9673634_1,4,4,PCA
69,2021,10.1002/cpe.6419,"[{'algorithm': 'Adaptive Linear k-SVM', 'frame...",10.1002/cpe.6419_1,16,4,PCA
70,2021,10.1155/2021/8822069,[{'algorithm': 'Support Vector Machine: Linear...,10.1155/2021/8822069_1,4,4,Linear Discriminant Analysis
83,2021,10.1007/s41060-020-00234-0,[{'algorithm': 'Support Vector Machine: Linear...,10.1007/s41060-020-00234-0_1,43,11,PCA
87,2021,10.1007/s41060-020-00234-0,[{'algorithm': 'Support Vector Machine: Linear...,10.1007/s41060-020-00234-0_2,30,10,PCA
89,2021,10.1016/j.ifacol.2021.10.286,"[{'algorithm': 'Support Vector Machine: RBF', ...",10.1016/j.ifacol.2021.10.286_1,8,4,PCA
95,2022,10.32604/cmc.2022.023124,"[{'algorithm': 'Naive Bayes', 'framework': '',...",10.32604/cmc.2022.023124_1,30,6,PCA
107,2022,10.1155/2022/1487212,[{'algorithm': 'Fuzzy Convolution Bi-direction...,10.1155/2022/1487212_1,4,4,Kernel PCA
113,2022,10.1108/IJICC-10-2021-0226,"[{'algorithm': 'Fuzzy K-Nearest Neighbors', 's...",10.1108/IJICC-10-2021-0226_1,4,4,Singular Value Decomposition


In [21]:
# Set of distinct techniques reported under each DOI
(
    dim_reduc_df
    .dropna(subset=['dimensionality_reduction'])
    .groupby('doi')
    .agg({'dimensionality_reduction': set})
)

Unnamed: 0_level_0,dimensionality_reduction
doi,Unnamed: 1_level_1
10.1002/cpe.6419,{PCA}
10.1007/978-3-030-31129-2_15,{PCA}
10.1007/s12553-023-00810-x,{Kernel PCA}
10.1007/s13534-020-00156-7,{PCA}
10.1007/s41060-020-00234-0,{PCA}
10.1016/j.bbe.2024.08.009,{PCA}
10.1016/j.compbiolchem.2022.107788,{PCA}
10.1016/j.csbj.2025.06.022,{PCA}
10.1016/j.ifacol.2021.10.286,{PCA}
10.1016/j.procs.2023.01.007,{PCA}
