In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
from pymongo import MongoClient
from collections import Counter, defaultdict
import os
import warnings
import sys
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib 'default' style first, then seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Make the parent directory importable; it is appended to sys.path at most once.
library_path = os.path.abspath('..')
if sys.path.count(library_path) == 0:
    sys.path.append(library_path)

# 'plots' folder path, located under the parent directory.
PLOTS_PATH = os.path.join(library_path, 'plots')

print(
    "Libraries imported successfully!",
    f"Current working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Open the local MongoDB instance and select the collection of studies.
client = MongoClient("mongodb://localhost:27017/")
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

print("üîÑ Loading studies from MongoDB...")

# Projection passed to find(): 1 = include the field, 0 = exclude it.
fields_to_extract = {
    "doi": 1,
    "source_dataset": 1,
    "target_dataset": 1,
    '_id': 0,
}
studies_cursor = collection.find({}, fields_to_extract)
# Drain the cursor into a list so later cells can iterate it repeatedly.
studies_list = [study for study in studies_cursor]

# Keys of the first document, or a placeholder text when nothing was loaded.
if studies_list:
    sample_keys = list(studies_list[0].keys())
else:
    sample_keys = 'No documents found'

print(f"üìä Total studies loaded: {len(studies_list)}")
print(f"üìÑ Sample document keys: {sample_keys}")

In [None]:
# Report how many loaded documents carry a non-empty source_dataset / target_dataset.
print("üîç Analyzing dataset structure...")

if studies_list:
    # First document, kept around as a sample for inspecting the structure.
    sample_doc = studies_list[0]

    total_docs = len(studies_list)

    # A field counts only when the key exists AND its value is truthy (non-empty).
    docs_with_source = len([doc for doc in studies_list if doc.get('source_dataset')])
    docs_with_target = len([doc for doc in studies_list if doc.get('target_dataset')])

    source_pct = docs_with_source / total_docs * 100
    target_pct = docs_with_target / total_docs * 100

    print(f"\nüìä Dataset field coverage:")
    print(f"  Documents with source_dataset: {docs_with_source}/{total_docs} ({source_pct:.1f}%)")
    print(f"  Documents with target_dataset: {docs_with_target}/{total_docs} ({target_pct:.1f}%)")

In [None]:
# First pass over the studies: collect every dataset entry and tally names.
print("üìä Analyzing datasets used in studies...")

# Flat lists of all dataset entries, plus a name -> occurrences tally.
all_source_datasets = []
all_target_datasets = []
dataset_usage = defaultdict(int)

for doc in studies_list:
    # Missing or empty fields are treated as "no entries".
    source_entries = doc.get('source_dataset') or []
    target_entries = doc.get('target_dataset') or []

    all_source_datasets.extend(source_entries)
    # NOTE(review): only source entries feed `dataset_usage`; target entries
    # are collected below but never tallied - confirm this is intended.
    for entry in source_entries:
        if 'name' in entry:
            dataset_usage[entry['name']] += 1

    all_target_datasets.extend(target_entries)

print(f"\nüìà Dataset statistics:")
print(f"  Total source dataset entries: {len(all_source_datasets)}")
print(f"  Total target dataset entries: {len(all_target_datasets)}")
print(f"  Unique dataset names: {len(dataset_usage)}")

# Ten highest tallies, largest first (ties keep insertion order).
print(f"\nüîù Top 10 most used datasets:")
top_datasets = sorted(dataset_usage.items(), key=lambda item: item[1], reverse=True)[:10]
for rank, (name, count) in enumerate(top_datasets, start=1):
    print(f"  {rank:2d}. {name}: {count} studies")

In [None]:
# Tally dataset names three ways:
#   all_source_datasets  - occurrences as a source dataset (one per entry)
#   all_target_datasets  - occurrences as a target dataset (one per entry)
#   global_dataset_usage - number of studies using the dataset in either role
#                          (each name counted at most once per study)
global_dataset_usage = defaultdict(int)
all_source_datasets = defaultdict(int)
all_target_datasets = defaultdict(int)

for doc in studies_list:
    # Names used by this study. Entries without a 'name' key are skipped
    # (same guard as the earlier exploration cell) instead of raising KeyError.
    study_source = [
        entry['name']
        for entry in doc.get('source_dataset') or []
        if 'name' in entry
    ]
    study_target = [
        entry['name']
        for entry in doc.get('target_dataset') or []
        if 'name' in entry
    ]

    for dataset_name in study_source:
        all_source_datasets[dataset_name] += 1
    for dataset_name in study_target:
        all_target_datasets[dataset_name] += 1

    # Count each dataset once per study, whatever role(s) it played.
    unique_dataset = set(study_source + study_target)
    for dataset_name in unique_dataset:
        global_dataset_usage[dataset_name] += 1



In [None]:
print(f"\nüìà Dataset statistics:")
print(f"\nüîù Top 10 most used datasets:")
top_datasets = sorted(global_dataset_usage.items(), key=lambda x: x[1], reverse=True)[:10]
for i, (name, count) in enumerate(top_datasets, 1):
    print(f"  {i:2d}. {name}: {count} studies")

print(f"\nüîù Top 5 most used source datasets:")
top_datasets = sorted(all_source_datasets.items(), key=lambda x: x[1], reverse=True)[:5]
for i, (name, count) in enumerate(top_datasets, 1):
    print(f"  {i:2d}. {name}: {count} studies")

print(f"\nüîù Top 5 most used target datasets:")
top_datasets = sorted(all_target_datasets.items(), key=lambda x: x[1], reverse=True)[:5]
for i, (name, count) in enumerate(top_datasets, 1):
    print(f"  {i:2d}. {name}: {count} studies")

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def plot_top_datasets(dataset_counter, title, top_n=10):
    """Draw a horizontal bar chart of the most frequent datasets.

    Args:
        dataset_counter: Mapping of dataset name -> count.
        title: Chart title.
        top_n: Maximum number of bars to draw (default 10).

    Returns:
        None. The chart is displayed with ``plt.show()``; when there is
        nothing to plot, a short message is printed instead.
    """
    top = Counter(dataset_counter).most_common(top_n)
    if not top:
        # Unpacking zip(*[]) into two names would raise ValueError.
        print(f"No data to plot for '{title}'.")
        return

    names, counts = zip(*top)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.barh(names, counts, color='skyblue')
    ax.set_xlabel('Count')
    ax.set_title(title)
    ax.invert_yaxis()  # largest on top
    plt.show()

# One chart per tally: source-only, target-only, then per-study usage.
for tally, chart_title in (
    (all_source_datasets, "Top Source Datasets"),
    (all_target_datasets, "Top Target Datasets"),
    (global_dataset_usage, "Dataset Usage Across Studies"),
):
    plot_top_datasets(tally, chart_title)

In [None]:
import pandas as pd

# Side-by-side bar chart of source vs target counts for the ten datasets
# with the highest source count.

# Build the union of names once so all three columns share the same order.
dataset_names = list(set(all_source_datasets) | set(all_target_datasets))

df = pd.DataFrame({
    'dataset': dataset_names,
    # .get(d, 0) rather than [d]: indexing a defaultdict inserts the missing
    # key, which would silently add zero-count entries to the tallies.
    'source_count': [all_source_datasets.get(d, 0) for d in dataset_names],
    'target_count': [all_target_datasets.get(d, 0) for d in dataset_names],
})

df = df.sort_values('source_count', ascending=False).head(10)

if df.empty:
    # DataFrame.plot raises when there is no numeric data to draw.
    print("No datasets to plot.")
else:
    df.plot(
        x='dataset', kind='bar', stacked=False, figsize=(12, 6),
        color=['skyblue', 'salmon'], xlabel='Dataset', ylabel='Count',
        title='Top 10 Datasets: Source vs Target'
    )
    plt.xticks(rotation=45, ha='right')
    plt.show()

In [None]:
# Projection for the queries below: 1 = include the field, 0 = exclude it.
fields_to_extract = dict(doi=1, source_dataset=1, target_dataset=1, _id=0)

# NOTE(review): `documents` is not consumed by any cell visible here - confirm it is still needed.
documents = collection.find(filter={}, projection=fields_to_extract)

# Distinct DOI values present in the collection.
unique_dois = collection.distinct("doi")

In [None]:
# For every DOI, gather the set of dataset names (source or target) found in
# all documents sharing that DOI. Result: a list of
# {'doi': ..., 'datasets': [...]} records, one per DOI.
dataset_by_doi = []
for doi in unique_dois:
    doi_docs = collection.find({"doi": doi}, projection=fields_to_extract)
    summary_dict = {'doi': doi, 'datasets': set()}

    for doc in doi_docs:
        datasets = [
            d['name']
            for field in ('source_dataset', 'target_dataset')
            for d in doc.get(field) or []
            # Skip entries without a name instead of raising KeyError.
            if 'name' in d
        ]
        summary_dict['datasets'].update(datasets)

    # Sort so the list order is reproducible across kernel restarts (set
    # iteration order is not); key=str keeps mixed-type names sortable.
    summary_dict['datasets'] = sorted(summary_dict['datasets'], key=str)
    dataset_by_doi.append(summary_dict)

In [None]:
# One row per DOI; the 'datasets' column holds that DOI's list of dataset names.
df = pd.DataFrame(data=dataset_by_doi)
df.head(n=5)

In [None]:
# Long format: each element of a 'datasets' list gets its own row.
df_exploded = df.explode(column='datasets')
df_exploded.head(n=5)

In [None]:
# Rows per dataset in the exploded frame, largest first, with the share of
# distinct DOIs that each count represents.
dataset_counts = (
    df_exploded
    .groupby('datasets', as_index=False)
    .size()
    .sort_values(by='size', ascending=False)
)
dataset_counts.columns = ['dataset', 'count']

share_of_papers = dataset_counts['count'] / df['doi'].nunique() * 100
dataset_counts['percentage'] = np.round(share_of_papers, 2)

In [None]:
# Renumber rows 0..n-1 after the sort, then show the ten largest counts.
dataset_counts = dataset_counts.reset_index(drop=True)
dataset_counts.iloc[:10]

In [None]:
# Distribution of papers by number of distinct datasets used.
df['num_datasets'] = df['datasets'].apply(len)

total_papers = len(df)
datasets_by_paper = (
    df
    .groupby('num_datasets')
    .size()
    .reset_index(name='num_papers')
)
# Share of all rows in `df` (one per DOI) that falls in each bucket.
datasets_by_paper['percentages'] = np.round(
    datasets_by_paper['num_papers'] / total_papers * 100, 2
)
datasets_by_paper