In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's default style with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Make the parent of the current working directory importable; the path is
# appended only when it is not already listed in sys.path.
library_path = os.path.abspath(os.pardir)
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

# 'plots' folder under the parent directory (presumably where figures are saved).
PLOTS_PATH = os.path.join(library_path, 'plots')

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Connect to MongoDB.
# The URI can be overridden with the MONGODB_URI environment variable; when it
# is not set, the original local instance is used.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGODB_URI)
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

print("🔄 Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it.
fields_to_extract = {
    "doi"             : 1,
    "year"            : 1,
    "study_id"        : 1,
    "ml_approaches"   : 1,
    "problem"         : 1,
    "ml_problem_type" : 1,
    '_id'             : 0
}

# Materialise the cursor so the documents can be reused by the following cells.
studies_cursor = collection.find({}, fields_to_extract)
studies_list = list(studies_cursor)


print(f"📊 Total studies loaded: {len(studies_list)}")
print(f"📄 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")

In [None]:
def get_metrics_experiment(experiment_list: list):
    """Summarise each experiment of a study as an (algorithm, results, validation) tuple.

    Args:
        experiment_list: List of experiment dicts. Anything that is not a list
            or tuple (e.g. None, or the NaN pandas inserts when the field is
            missing from a document) is treated as "no experiments".

    Returns:
        A list with one tuple per experiment, in input order. Missing keys
        default to '' for 'algorithm' and 'validation' and to {} for 'results'.
    """
    # A missing field is not iterable; return an empty summary instead of
    # raising TypeError in the middle of a DataFrame.apply.
    if not isinstance(experiment_list, (list, tuple)):
        return []

    return [
        (
            experiment.get('algorithm', ''),
            experiment.get('results', {}),
            experiment.get('validation', ''),
        )
        for experiment in experiment_list
    ]

In [None]:
# One row per (study, experiment): build the frame from the loaded documents,
# summarise every experiment as an (algorithm, results, validation) tuple and
# explode those tuples into rows.
experiment_df = pd.DataFrame(studies_list)

# A study without experiments produces an empty list, which explode() turns into
# NaN; indexing NaN below would raise TypeError. Such studies get one placeholder
# tuple that uses the same defaults get_metrics_experiment applies to missing keys.
experiment_df['alg_metrics'] = (
    experiment_df['ml_approaches']
    .apply(get_metrics_experiment)
    .apply(lambda experiments: experiments or [('', {}, '')])
)
experiment_df = experiment_df.explode('alg_metrics').reset_index(drop=True)

# Unpack the tuple into its three columns.
experiment_df['algorithm'] = experiment_df['alg_metrics'].apply(lambda x: x[0])
experiment_df['metrics'] = experiment_df['alg_metrics'].apply(lambda x: x[1])
experiment_df['validation'] = experiment_df['alg_metrics'].apply(lambda x: x[2])
experiment_df = experiment_df.drop(columns=['alg_metrics'])
experiment_df

In [None]:
# Names of the metrics reported by each experiment. Dict keys are already unique,
# so they are listed directly in document order; the former set() round-trip
# only shuffled them differently in every kernel session. Non-dict values
# yield an empty list.
experiment_df['metric_used'] = experiment_df['metrics'].apply(
    lambda results: list(results) if isinstance(results, dict) else []
)
experiment_df

In [None]:
# Frequency of each value of the 'validation' column across all experiment rows.
experiment_df['validation'].value_counts()

In [None]:
# Collapse the experiment rows to one row per (doi, problem), gathering the
# metric-name lists and the validation labels of every experiment in the group.
metric_val_df = (
    experiment_df
    .groupby(by=['doi', 'problem'], as_index=False)
    .agg({'metric_used': list, 'validation': list})
)

# Distinct metric names in the group and how many there are.
metric_val_df['metrics_used'] = metric_val_df['metric_used'].apply(
    lambda name_lists: {name for names in name_lists for name in names}
)
metric_val_df['num_metrics'] = metric_val_df['metrics_used'].apply(len)

# Distinct validation labels in the group, ignoring '' and None.
metric_val_df['validation'] = metric_val_df['validation'].apply(
    lambda labels: [label for label in set(labels) if label != '' and label is not None]
)
metric_val_df['num_validation'] = metric_val_df['validation'].apply(len)
metric_val_df

In [None]:
# Per-doi total of the per-problem validation counts, then the percentage of
# dois having each total.
num_validation = metric_val_df.groupby(by='doi', as_index=False)['num_validation'].sum()
num_validation['num_validation'].value_counts(normalize=True).mul(100)

In [None]:
# Distinct validation labels per doi (union over the doi's problems) and the
# percentage of dois using 0, 1, 2, ... distinct labels.
vals_df = (
    metric_val_df
    .groupby(by='doi', as_index=False)
    .agg({'validation': list})
)
vals_df['validation_used'] = vals_df['validation'].apply(
    lambda label_lists: list({label for labels in label_lists for label in labels})
)
vals_df['num_validation_used'] = vals_df['validation_used'].apply(len)
vals_df = vals_df.drop(columns=['validation'])
vals_df['num_validation_used'].value_counts(normalize=True).mul(100)



In [None]:
# One row per (doi, validation label), then the share of each label among those
# rows expressed as a percentage rounded to two decimals.
temp_df = vals_df.explode('validation_used').reset_index(drop=True)
temp = (
    temp_df['validation_used']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .rename_axis('validation')
    .reset_index(name='proportion')
)
temp

In [None]:
# Number of distinct metrics reported by each paper (doi).
# The per-(doi, problem) metric sets are merged before counting: summing the
# per-problem counts would count a metric once per problem it is reported for,
# overstating the "metrics in a paper" figures printed below.
num_metrics = (
    metric_val_df
    .groupby(by='doi', as_index=False)
    .agg(num_metrics=('metrics_used', lambda metric_sets: len(set().union(*metric_sets))))
)

print(f"Max number of metrics in a paper: {num_metrics['num_metrics'].max()}")
print(f"Min number of metrics in a paper: {num_metrics['num_metrics'].min()}")
print(f"Median number of metrics in a paper: {num_metrics['num_metrics'].median():.2f}")

In [None]:
# How many papers sit at the minimum number of metrics.
fewest_metrics = num_metrics['num_metrics'].min()
len(num_metrics[num_metrics['num_metrics'] == fewest_metrics])

In [None]:
# Display the experiment-level frame again (one row per study/experiment after the explode step).
experiment_df

In [None]:
def concat_str_lists(str_list):
    """Join a list of strings into one comma-separated string.

    Each element is stripped of surrounding whitespace before joining with
    ', '. Any value that is not a list is returned unchanged.
    """
    if not isinstance(str_list, list):
        return str_list

    cleaned = (item.strip() for item in str_list)
    return ', '.join(cleaned)

# Hashable, comma-joined version of 'ml_problem_type' so it can serve as a groupby key.
experiment_df['for_group_ml_type'] = experiment_df['ml_problem_type'].apply(concat_str_lists)

# Per (problem, ML type): number of distinct dois and number of experiment rows with a doi.
doi_aggregations = {
    'doi_unique': pd.NamedAgg(column='doi', aggfunc='nunique'),
    'doi_count': pd.NamedAgg(column='doi', aggfunc='count'),
}
experiment_df.groupby(by=['problem', 'for_group_ml_type'], as_index=False).agg(**doi_aggregations)

In [None]:
def _metric_items(results):
    """Return the (metric, value) pairs of one experiment; [] when `results` is not a dict."""
    return list(results.items()) if isinstance(results, dict) else []


def _problem_families(problem_types):
    """Reduce problem-type labels to the text before ':' and drop repeats.

    A list is mapped element-wise and de-duplicated in order, so two labels
    sharing a prefix do not duplicate every metric row of the study once the
    list is exploded. A non-list value is reduced the same way and returned
    as a scalar; non-string entries are left untouched.
    """
    def family(label):
        return label.split(':')[0] if isinstance(label, str) else label

    if not isinstance(problem_types, list):
        return family(problem_types)

    families = []
    for label in problem_types:
        reduced = family(label)
        if reduced not in families:
            families.append(reduced)
    return families


# Long format: one row per (experiment, metric, problem family).
exploded_metrics = (
    experiment_df
    .assign(
        # Guarded like the 'metric_used' cell: non-dict metrics contribute no pairs
        # instead of raising AttributeError on .items().
        kv=experiment_df["metrics"].apply(_metric_items),
        ml_problem_type=experiment_df["ml_problem_type"].apply(_problem_families),
    )
    .explode("kv")
    .assign(metric=lambda x: x["kv"].str[0],
            value=lambda x: x["kv"].str[1])
    .drop(columns=["kv", 'year', 'ml_approaches', 'metrics', 'validation'])
    .explode('ml_problem_type')
    .reset_index(drop=True)
)
exploded_metrics.head()

In [None]:
# Keep only the metric rows whose problem type is 'Classification'.
is_classification = exploded_metrics['ml_problem_type'] == 'Classification'
class_metrics = exploded_metrics.loc[is_classification].reset_index(drop=True)
class_metrics

In [None]:
# Count of distinct metric names among the classification rows.
n_class_metrics = class_metrics['metric'].nunique()
print(f"Number of metrics used in Classification problems: {n_class_metrics}")

In [None]:
# Share of classification metric rows taken by each metric, as a percentage
# rounded to two decimals; the seven most frequent are shown.
metric_count = (
    class_metrics['metric']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .rename_axis('metric')
    .reset_index(name='percent')
)
metric_count.head(7)

In [None]:
# Absolute number of classification metric rows per metric.
metric_count = (
    class_metrics['metric']
    .value_counts(normalize=False)
    .rename_axis('metric')
    .reset_index(name='count')
)
metric_count

In [None]:
# Number of distinct dois reporting each classification metric, and that number
# as a percentage of all dois present in class_metrics.
total_class_dois = class_metrics['doi'].nunique()
metric_by_doi = (
    class_metrics
    .groupby(by='metric', as_index=False)
    .agg({'doi': 'nunique'})
    .sort_values(by='doi', ascending=False)
    .rename(columns={'doi': 'num_doi'})
    .assign(percentage_doi=lambda frame: np.round((frame['num_doi'] / total_class_dois) * 100, 2))
)
metric_by_doi.head(7)

In [None]:
# Descriptive statistics of the 'accuracy' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
accuracy_values = class_metrics.loc[class_metrics['metric'] == 'accuracy', 'value']

print(f"Mean Accuracy reported in Classification problems: {accuracy_values.mean():.2f}")
print(f"Std Deviation of Accuracy reported in Classification problems: {accuracy_values.std():.2f}")
print(f"Median Accuracy reported in Classification problems: {accuracy_values.median():.2f}")
print(f"Min Accuracy reported in Classification problems: {accuracy_values.min():.2f}")
print(f"Max Accuracy reported in Classification problems: {accuracy_values.max():.2f}")
print(f"First Quartile of Accuracy reported in Classification problems: {accuracy_values.quantile(0.25):.2f}")
print(f"Third Quartile of Accuracy reported in Classification problems: {accuracy_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'recall' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
recall_values = class_metrics.loc[class_metrics['metric'] == 'recall', 'value']

print(f"Mean Recall reported in Classification problems: {recall_values.mean():.2f}")
print(f"Std Deviation of Recall reported in Classification problems: {recall_values.std():.2f}")
print(f"Median Recall reported in Classification problems: {recall_values.median():.2f}")
print(f"Min Recall reported in Classification problems: {recall_values.min():.2f}")
print(f"Max Recall reported in Classification problems: {recall_values.max():.2f}")
print(f"First Quartile of Recall reported in Classification problems: {recall_values.quantile(0.25):.2f}")
print(f"Third Quartile of Recall reported in Classification problems: {recall_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'f1_score' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
f1_values = class_metrics.loc[class_metrics['metric'] == 'f1_score', 'value']

print(f"Mean F1-score reported in Classification problems: {f1_values.mean():.2f}")
print(f"Std Deviation of F1-score reported in Classification problems: {f1_values.std():.2f}")
print(f"Median F1-score reported in Classification problems: {f1_values.median():.2f}")
print(f"Min F1-score reported in Classification problems: {f1_values.min():.2f}")
print(f"Max F1-score reported in Classification problems: {f1_values.max():.2f}")
print(f"First Quartile of F1-score reported in Classification problems: {f1_values.quantile(0.25):.2f}")
print(f"Third Quartile of F1-score reported in Classification problems: {f1_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'precision' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
precision_values = class_metrics.loc[class_metrics['metric'] == 'precision', 'value']

print(f"Mean precision reported in Classification problems: {precision_values.mean():.2f}")
print(f"Std Deviation of precision reported in Classification problems: {precision_values.std():.2f}")
print(f"Median precision reported in Classification problems: {precision_values.median():.2f}")
print(f"Min precision reported in Classification problems: {precision_values.min():.2f}")
print(f"Max precision reported in Classification problems: {precision_values.max():.2f}")
print(f"First Quartile of precision reported in Classification problems: {precision_values.quantile(0.25):.2f}")
print(f"Third Quartile of precision reported in Classification problems: {precision_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'specificity' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
specificity_values = class_metrics.loc[class_metrics['metric'] == 'specificity', 'value']

print(f"Mean Specificity reported in Classification problems: {specificity_values.mean():.2f}")
print(f"Std Deviation of Specificity reported in Classification problems: {specificity_values.std():.2f}")
print(f"Median Specificity reported in Classification problems: {specificity_values.median():.2f}")
print(f"Min Specificity reported in Classification problems: {specificity_values.min():.2f}")
print(f"Max Specificity reported in Classification problems: {specificity_values.max():.2f}")
print(f"First Quartile of Specificity reported in Classification problems: {specificity_values.quantile(0.25):.2f}")
print(f"Third Quartile of Specificity reported in Classification problems: {specificity_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'auc' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
auc_values = class_metrics.loc[class_metrics['metric'] == 'auc', 'value']

print(f"Mean ROC-AUC reported in Classification problems: {auc_values.mean():.2f}")
print(f"Std Deviation of ROC-AUC reported in Classification problems: {auc_values.std():.2f}")
print(f"Median ROC-AUC reported in Classification problems: {auc_values.median():.2f}")
print(f"Min ROC-AUC reported in Classification problems: {auc_values.min():.2f}")
print(f"Max ROC-AUC reported in Classification problems: {auc_values.max():.2f}")
print(f"First Quartile of ROC-AUC reported in Classification problems: {auc_values.quantile(0.25):.2f}")
print(f"Third Quartile of ROC-AUC reported in Classification problems: {auc_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'mcc' values among the classification rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
mcc_values = class_metrics.loc[class_metrics['metric'] == 'mcc', 'value']

print(f"Mean MCC reported in Classification problems: {mcc_values.mean():.2f}")
print(f"Std Deviation of MCC reported in Classification problems: {mcc_values.std():.2f}")
print(f"Median MCC reported in Classification problems: {mcc_values.median():.2f}")
print(f"Min MCC reported in Classification problems: {mcc_values.min():.2f}")
print(f"Max MCC reported in Classification problems: {mcc_values.max():.2f}")
print(f"First Quartile of MCC reported in Classification problems: {mcc_values.quantile(0.25):.2f}")
print(f"Third Quartile of MCC reported in Classification problems: {mcc_values.quantile(0.75):.2f}")

In [None]:
# Keep only the metric rows whose problem type is 'Regression'.
is_regression = exploded_metrics['ml_problem_type'] == 'Regression'
regression_metrics = exploded_metrics.loc[is_regression].reset_index(drop=True)
regression_metrics

In [None]:
# Count of distinct metric names among the regression rows.
n_regression_metrics = regression_metrics['metric'].nunique()
print(f"Number of metrics used in Regression problems: {n_regression_metrics}")

In [None]:
# Share of regression metric rows taken by each metric, as a percentage
# rounded to two decimals; the six most frequent are shown.
metric_count = (
    regression_metrics['metric']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .rename_axis('metric')
    .reset_index(name='percent')
)
metric_count.head(6)

In [None]:
# Absolute number of regression metric rows per metric.
metric_count = (
    regression_metrics['metric']
    .value_counts(normalize=False)
    .rename_axis('metric')
    .reset_index(name='count')
)
metric_count

In [None]:
# Number of distinct dois reporting each regression metric, and that number as a
# percentage of all dois present in regression_metrics.
total_regression_dois = regression_metrics['doi'].nunique()
metric_by_doi = (
    regression_metrics
    .groupby(by='metric', as_index=False)
    .agg({'doi': 'nunique'})
    .sort_values(by='doi', ascending=False)
    .rename(columns={'doi': 'num_doi'})
    .assign(percentage_doi=lambda frame: np.round((frame['num_doi'] / total_regression_dois) * 100, 2))
)
metric_by_doi

In [None]:
# Descriptive statistics of the 'mae' values among the regression rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
mae_values = regression_metrics.loc[regression_metrics['metric'] == 'mae', 'value']

print(f"Mean MAE reported in Regression problems: {mae_values.mean():.2f}")
print(f"Std Deviation of MAE reported in Regression problems: {mae_values.std():.2f}")
print(f"Median MAE reported in Regression problems: {mae_values.median():.2f}")
print(f"Min MAE reported in Regression problems: {mae_values.min():.2f}")
print(f"Max MAE reported in Regression problems: {mae_values.max():.2f}")
print(f"First Quartile of MAE reported in Regression problems: {mae_values.quantile(0.25):.2f}")
print(f"Third Quartile of MAE reported in Regression problems: {mae_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'rmse' values among the regression rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
rmse_values = regression_metrics.loc[regression_metrics['metric'] == 'rmse', 'value']

print(f"Mean RMSE reported in Regression problems: {rmse_values.mean():.2f}")
print(f"Std Deviation of RMSE reported in Regression problems: {rmse_values.std():.2f}")
print(f"Median RMSE reported in Regression problems: {rmse_values.median():.2f}")
print(f"Min RMSE reported in Regression problems: {rmse_values.min():.2f}")
print(f"Max RMSE reported in Regression problems: {rmse_values.max():.2f}")
print(f"First Quartile of RMSE reported in Regression problems: {rmse_values.quantile(0.25):.2f}")
print(f"Third Quartile of RMSE reported in Regression problems: {rmse_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'r_squared' values among the regression rows.
# The selection is computed once, no f-string nests quotes of its own kind
# (a SyntaxError before Python 3.12), and the mis-encoded superscript in the
# labels is restored to "R²".
r_squared_values = regression_metrics.loc[regression_metrics['metric'] == 'r_squared', 'value']

print(f"Mean R² reported in Regression problems: {r_squared_values.mean():.2f}")
print(f"Std Deviation of R² reported in Regression problems: {r_squared_values.std():.2f}")
print(f"Median R² reported in Regression problems: {r_squared_values.median():.2f}")
print(f"Min R² reported in Regression problems: {r_squared_values.min():.2f}")
print(f"Max R² reported in Regression problems: {r_squared_values.max():.2f}")
print(f"First Quartile of R² reported in Regression problems: {r_squared_values.quantile(0.25):.2f}")
print(f"Third Quartile of R² reported in Regression problems: {r_squared_values.quantile(0.75):.2f}")

In [None]:
# Descriptive statistics of the 'mse' values among the regression rows.
# The selection is computed once, and no f-string nests quotes of its own kind
# (that form is a SyntaxError before Python 3.12).
mse_values = regression_metrics.loc[regression_metrics['metric'] == 'mse', 'value']

print(f"Mean MSE reported in Regression problems: {mse_values.mean():.2f}")
print(f"Std Deviation of MSE reported in Regression problems: {mse_values.std():.2f}")
print(f"Median MSE reported in Regression problems: {mse_values.median():.2f}")
print(f"Min MSE reported in Regression problems: {mse_values.min():.2f}")
print(f"Max MSE reported in Regression problems: {mse_values.max():.2f}")
print(f"First Quartile of MSE reported in Regression problems: {mse_values.quantile(0.25):.2f}")
print(f"Third Quartile of MSE reported in Regression problems: {mse_values.quantile(0.75):.2f}")