In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pymongo import MongoClient
import os
import warnings
import sys
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")

# Put the parent directory on the import path (once) so project modules resolve
library_path = os.path.abspath(os.pardir)
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

# Folder under the parent directory used for plots
PLOTS_PATH = os.path.join(library_path, 'plots')

print(
    "Libraries imported successfully!",
    f"Current working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Open the local MongoDB instance and select the studies collection
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

print("ðŸ”„ Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it
fields_to_extract = dict.fromkeys(
    ["doi", "year", "study_id", "ml_approaches", "problem", "ml_problem_type"],
    1,
)
fields_to_extract['_id'] = 0

# Materialise the cursor so the documents can be counted and reused
studies_cursor = collection.find({}, fields_to_extract)
studies_list = list(studies_cursor)


print(f"ðŸ“Š Total studies loaded: {len(studies_list)}")
if studies_list:
    print(f"ðŸ“„ Sample document keys: {list(studies_list[0])}")
else:
    print("ðŸ“„ Sample document keys: No documents found")

In [None]:
def get_metrics_experiment(experiment_list: list):
    """Summarise each experiment of a study as an ``(algorithm, results, validation)`` tuple.

    Parameters
    ----------
    experiment_list : list
        Experiment records (dicts) belonging to one study. Any value that is
        not a list or tuple (for example ``None`` or a float ``NaN``) is
        treated as "no experiments".

    Returns
    -------
    list of tuple
        One ``(algorithm, results, validation)`` tuple per experiment dict, in
        input order. Keys absent from an experiment default to ``''``, ``{}``
        and ``''`` respectively. Entries that are not dicts are skipped.
    """
    # A missing field can reach this function as None/NaN instead of a list;
    # iterating over it would raise TypeError.
    if not isinstance(experiment_list, (list, tuple)):
        return []

    return [
        (
            experiment.get('algorithm', ''),
            experiment.get('results', {}),
            experiment.get('validation', ''),
        )
        for experiment in experiment_list
        # entries that are not dicts have no .get() and carry no usable fields
        if isinstance(experiment, dict)
    ]

In [None]:
# One row per study document
experiment_df = pd.DataFrame(studies_list)

# Attach the (algorithm, results, validation) tuples, then give each experiment its own row
experiment_df['alg_metrics'] = experiment_df['ml_approaches'].apply(get_metrics_experiment)
experiment_df = experiment_df.explode('alg_metrics').reset_index(drop=True)

# explode() leaves NaN instead of a tuple for studies whose experiment list is
# empty, and indexing NaN raises TypeError. Substitute the same defaults that
# get_metrics_experiment uses for missing keys so those studies are kept.
experiment_df['alg_metrics'] = experiment_df['alg_metrics'].apply(
    lambda x: x if isinstance(x, tuple) else ('', {}, '')
)

# Split the tuple into its three columns and drop the helper column
experiment_df['algorithm'] = experiment_df['alg_metrics'].apply(lambda x: x[0])
experiment_df['metrics'] = experiment_df['alg_metrics'].apply(lambda x: x[1])
experiment_df['validation'] = experiment_df['alg_metrics'].apply(lambda x: x[2])
experiment_df = experiment_df.drop(columns=['alg_metrics'])
experiment_df

In [None]:
# Distinct metric names per experiment; rows whose results are not a dict get []
experiment_df['metric_used'] = experiment_df['metrics'].apply(
    lambda results: list(set(results)) if isinstance(results, dict) else []
)
experiment_df

In [None]:
# How often each validation value occurs across all experiment rows
experiment_df.loc[:, 'validation'].value_counts()

In [None]:
# Collapse the experiment rows to one row per (doi, problem) pair
metric_val_df = (
    experiment_df
    .groupby(by=['doi', 'problem'], as_index=False)
    .agg({'metric_used': list, 'validation': list})
)

# Union of the metric names over all experiments in the group, and its size
metric_val_df['metrics_used'] = metric_val_df['metric_used'].apply(
    lambda per_experiment: {name for names in per_experiment for name in names}
)
metric_val_df['num_metrics'] = metric_val_df['metrics_used'].apply(len)

# Distinct validation values in the group, ignoring '' and None
metric_val_df['validation'] = metric_val_df['validation'].apply(
    lambda schemes: [s for s in set(schemes) if s != '' and s is not None]
)
metric_val_df['num_validation'] = metric_val_df['validation'].apply(len)
metric_val_df

In [None]:
# Per doi: sum of the per-problem validation counts
num_validation = (
    metric_val_df
    .groupby(by='doi', as_index=False)
    .agg({'num_validation': 'sum'})
)
# Percentage of dois at each summed count
num_validation['num_validation'].value_counts(normalize=True).mul(100)

In [None]:
# Per doi: gather the per-problem validation lists
vals_df = metric_val_df.groupby(by='doi', as_index=False).agg({'validation': list})

# Flatten and de-duplicate so each validation value counts once per doi
vals_df['validation_used'] = vals_df['validation'].apply(
    lambda per_problem: list({scheme for schemes in per_problem for scheme in schemes})
)
vals_df['num_validation_used'] = vals_df['validation_used'].apply(len)
vals_df = vals_df.drop(columns=['validation'])

# Percentage of dois at each number of distinct validation values
vals_df['num_validation_used'].value_counts(normalize=True).mul(100)



In [None]:
# One row per (doi, validation value)
temp_df = vals_df.explode('validation_used').reset_index(drop=True)

# Share of each validation value, relabelled positionally and expressed in percent
temp = (
    temp_df['validation_used']
    .value_counts(normalize=True)
    .reset_index()
    .set_axis(['validation', 'proportion'], axis=1)
)
temp['proportion'] = (temp['proportion'] * 100).round(2)
temp

In [None]:
# Per doi: sum of the per-problem distinct-metric counts.
# NOTE(review): a metric reported for several problems of the same doi is
# counted once per problem here — confirm that this is the intended meaning of
# "number of metrics in a paper".
num_metrics = (
    metric_val_df
    .groupby(by='doi', as_index=False)
    .agg({'num_metrics': 'sum'})
)

print(
    f"Max number of metrics in a paper: {num_metrics['num_metrics'].max()}",
    f"Min number of metrics in a paper: {num_metrics['num_metrics'].min()}",
    f"Median number of metrics in a paper: {num_metrics['num_metrics'].median():.2f}",
    sep="\n",
)

In [None]:
# Number of dois whose summed metric count equals the minimum
int(num_metrics['num_metrics'].eq(num_metrics['num_metrics'].min()).sum())

In [None]:
# NOTE(review): hard-coded ratio. Presumably 38 is the count shown by the cell
# above and 260 the total number of papers — confirm, and consider deriving
# both from num_metrics so the value cannot go stale.
38/260