In [None]:
# Import required libraries
import os
import sys
import textwrap
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.ticker import MaxNLocator
from plotly.subplots import make_subplots
from pymongo import MongoClient
# NOTE(review): this silences every warning for the rest of the session
# (including pandas chained-assignment and deprecation notices).
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('default')
sns.set_palette("husl")

# Put the parent directory on sys.path (presumably so project-local modules can
# be imported; none are imported in the cells visible here).
library_path = os.path.abspath('..')
if library_path not in sys.path:
    sys.path.append(library_path)

# Every figure exported by this notebook is written below PLOTS_PATH.
# Create it up front: plt.savefig() does not create missing parent folders and
# would otherwise raise FileNotFoundError when the directory does not exist yet.
PLOTS_PATH = os.path.join(library_path, 'plots')
os.makedirs(PLOTS_PATH, exist_ok=True)

print("Libraries imported successfully!")
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Connect to MongoDB. The connection string can be overridden with the MONGO_URI
# environment variable; without it the original local instance is used.
client = MongoClient(os.environ.get("MONGO_URI", "mongodb://localhost:27017/"))
db = client["Diagnosis_Severity_PD_Voice"]
collection = db["studies"]

# Load all studies
print("🔄 Loading studies from MongoDB...")

# Projection handed to find(): 1 = include the field, 0 = exclude it.
field_to_extract = {
    "doi"             : 1,
    "journal"         : 1,
    "authors"         : 1,
    "publication_type": 1,
    "year"            : 1,
    "study_id"        : 1,
    "problem"         : 1,
    "ml_problem_type" : 1,
    '_id'             : 0
}
studies_cursor = collection.find({}, field_to_extract)
# Materialise the cursor so the documents can be inspected and reused below.
studies_list = list(studies_cursor)

print(f"📊 Total studies loaded: {len(studies_list)}")
print(f"📄 Sample document keys: {list(studies_list[0].keys()) if studies_list else 'No documents found'}")
print("ℹ️  Note: Each document represents a study - we'll aggregate these into papers for analysis")

In [None]:
# Build the study-level table with one column per projected field.
# Passing the column list explicitly keeps every expected column present even
# when the query returned nothing or a field is absent from all documents, so
# later cells fail on the data rather than on a KeyError.
projected_fields = [name for name, include in field_to_extract.items() if include]
studies_df = pd.DataFrame(studies_list, columns=projected_fields)
studies_df.head()

In [None]:
# Articles are counted as distinct DOIs; studies are the rows of the table.
n_articles = studies_df['doi'].nunique()
n_studies = len(studies_df)

print(f"Total articles in database: {n_articles}")
print(f"Total studies in database: {n_studies}")

In [None]:
print("Distribution in conference/journal papers:")

# NOTE(review): the counts below are taken over studies_df (one row per study),
# so a paper with several studies contributes several times - confirm that this
# is intended for a "papers" distribution.
publication_types = studies_df['publication_type']
abs_count = publication_types.value_counts()
rel_count = publication_types.value_counts(normalize=True) * 100

# Absolute and relative frequencies side by side, one row per publication type.
distribution_df = abs_count.to_frame('Absolute Count').assign(
    **{'Relative Count (%)': rel_count}
)
print(distribution_df)

# Distribution by problem (Diagnosis vs Severity).

In [None]:
# Problem labels exactly as they are matched in the 'problem' field.
DIAGNOSIS_PROBLEM = "Parkinson's disease diagnosis"
SEVERITY_PROBLEM = "Parkinson's disease severity stage classification"

# One row per DOI, holding the list of problems of all its studies.
problems_per_paper = studies_df.groupby('doi')['problem'].apply(list)
problems_by_doi = problems_per_paper.reset_index()

# Flag the papers whose problem list contains each label.
problems_by_doi["diagnosis"] = problems_by_doi["problem"].map(
    lambda problems: DIAGNOSIS_PROBLEM in problems
)
problems_by_doi["severity"] = problems_by_doi["problem"].map(
    lambda problems: SEVERITY_PROBLEM in problems
)
problems_by_doi.head()

In [None]:
# Boolean flags per paper and the denominator shared by all three percentages.
is_diagnosis = problems_by_doi['diagnosis']
is_severity = problems_by_doi['severity']
n_papers = problems_by_doi.shape[0]

# Counting the True entries of a mask equals the number of rows it selects.
only_diagnosis = int((is_diagnosis & ~is_severity).sum())
print(f"Papers addressing only diagnosis: {only_diagnosis} ({only_diagnosis / n_papers * 100:.2f}%)")

only_severity = int((~is_diagnosis & is_severity).sum())
print(f"Papers addressing only severity: {only_severity} ({only_severity / n_papers * 100:.2f}%)")

both = int((is_diagnosis & is_severity).sum())
print(f"Papers addressing both diagnosis and severity: {both} ({both / n_papers * 100:.2f}%)")

In [None]:
# Collapse the study-level table to one row per DOI, keeping the first
# journal / authors / year value found among that DOI's studies.
papers_df = (
    studies_df
    .groupby('doi', as_index=False)
    .agg(
        journal=('journal', 'first'),
        authors=('authors', 'first'),
        year=('year', 'first'),
    )
)
papers_df.head()

In [None]:
# Papers per publication year, ordered by year.
year_counts = papers_df['year'].value_counts().sort_index()

# --- Interactive bar chart (plotly) ---
year_bars = go.Bar(
    x=year_counts.index,
    y=year_counts.values,
    text=year_counts.values,
    textposition="outside",
)
fig = go.Figure(data=[year_bars])
fig.update_layout(
    title="Paper Publication Trends by Year",
    xaxis_title="Year",
    yaxis_title="Number of Papers",
    template="plotly_white",
    bargap=0.2,  # gap between neighbouring bars
)
fig.show()

# --- Static bar chart (seaborn), exported to PLOTS_PATH ---
plt.figure(figsize=(10, 6))
sns.barplot(x=year_counts.index, y=year_counts.values, color="#4C72B0")

# Write each count slightly above its bar; the loop index is used as the
# horizontal position of the bar.
for bar_pos, n_papers in enumerate(year_counts.values):
    plt.text(bar_pos, n_papers + 0.5, str(n_papers), ha='center', va='bottom')

plt.title("Paper Publication Trends by Year", fontsize=14, fontweight='bold')
plt.xlabel("Year", fontsize=12)
plt.ylabel("Number of Papers", fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()

year_plot_file = os.path.join(PLOTS_PATH, 'paper_publication_trends_by_year.svg')
plt.savefig(year_plot_file, dpi=600)
plt.show()

# Journal Analysis

In [None]:
# Papers per journal, most frequent first. Blank journal names are mapped to NA
# and dropped so they are not ranked as a journal of their own; this is the same
# cleaning the "top journals" analysis cells apply to papers_df['journal'].
journal_count = papers_df['journal'].replace('', pd.NA).dropna().value_counts()

In [None]:
# journal_count has one entry per journal; its values are papers per journal.
n_journals = len(journal_count)
n_single_paper_journals = int((journal_count == 1).sum())

print(f"Total unique journals: {n_journals}")
print(f"Journals that have published only one paper: {n_single_paper_journals}")

In [None]:

import textwrap

def break_long_names(name, max_len=22, max_lines=3, placeholder="\u2026"):
    """Wrap a long label onto several lines so it fits next to a plot axis.

    Args:
        name: Text to wrap.
        max_len: Target line width in characters. Words are never split (not
            even at hyphens), so a single word longer than ``max_len`` ends up
            on a line of its own that exceeds the width.
        max_lines: Maximum number of lines kept in the result.
        placeholder: Marker appended to the last kept line when further lines
            were dropped. Defaults to the horizontal ellipsis, written as an
            escape so the source stays correct whatever encoding the file is
            read with.

    Returns:
        The wrapped lines joined with newline characters; an empty string for
        empty or whitespace-only input.
    """
    lines = textwrap.wrap(
        name,
        width=max_len,
        break_long_words=False,
        break_on_hyphens=False,
    )

    # Keep at most max_lines lines and mark the cut on the last one.
    if len(lines) > max_lines:
        lines = lines[:max_lines]
        lines[-1] += placeholder

    return "\n".join(lines)

# Ten journals with the most papers (journal_count is sorted by count).
top10_journals = journal_count.head(10)

# Wrapped labels with newline separators; the seaborn figure of the next cell
# reuses this list as it is.
journal_labels = [break_long_names(j) for j in top10_journals.index]

# Plotly does not break tick labels on "\n"; it only honours the "<br>" tag.
# Use a converted copy for this figure so the wrapping is actually rendered.
plotly_journal_labels = [label.replace("\n", "<br>") for label in journal_labels]

fig = px.bar(
    x=top10_journals.values[::-1],
    y=plotly_journal_labels[::-1],   # reversed together with x so pairs stay aligned
    orientation='h',
    title='Top 10 Journals by Number of Papers',
    labels={'x': 'Number of Papers', 'y': 'Journal'},
    color_discrete_sequence=['#4C72B0']
)

fig.update_layout(
    font=dict(size=14),
    title_font=dict(size=18),
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis=dict(
        showgrid=True,
        gridcolor='#E5E5E5'
    ),
    yaxis=dict(showgrid=False)
)

# Last expression of the cell: update_traces returns the figure, which the
# notebook then displays.
fig.update_traces(marker=dict(line=dict(width=1, color='black')))

In [None]:
from matplotlib.ticker import MaxNLocator

# Static (seaborn) version of the top-10 journals chart, exported as SVG.
sns.set_style("whitegrid")
plt.figure(figsize=(9, 6))

# Share of ALL papers (not only those in the top 10) held by each journal.
journal_paper_counts = top10_journals.values
percentages = (journal_paper_counts / len(papers_df)) * 100

ax = sns.barplot(
    x=journal_paper_counts,
    y=journal_labels,
    color="#4C72B0",
    edgecolor="black",
)
# Paper counts are whole numbers, so restrict the x ticks to integers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Title, axis labels and tick size
ax.set_title("Top 10 Journals by Number of Papers", fontsize=18, pad=12)
ax.set_xlabel("Number of Papers", fontsize=14)
ax.set_ylabel("Journal", fontsize=14)
ax.tick_params(axis='both', labelsize=11)

# Percentage in white, right-aligned 0.5 units before the end of each bar.
bar_label_style = dict(va='center', ha='right', fontsize=12, fontweight='bold', color='white')
for row, (n_papers, share) in enumerate(zip(journal_paper_counts, percentages)):
    ax.text(n_papers - 0.5, row, f"{share:.1f}%", **bar_label_style)

# Light vertical grid only
ax.grid(axis='x', color="#E5E5E5")
ax.grid(axis='y', visible=False)

plt.tight_layout()

# Export as SVG (vector format), then display.
top10_journals_file = os.path.join(PLOTS_PATH, 'top10_journals.svg')
plt.savefig(top10_journals_file, dpi=600)

plt.show()

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# ================================
# Analysis: Papers published in top journals
# ================================

print("📊 PAPERS IN TOP JOURNALS ANALYSIS")
print("=" * 45)

# Blank journal names are mapped to NA and dropped together with missing
# values, so neither is counted as a journal.
journal_series = papers_df['journal'].replace('', pd.NA).dropna()

# value_counts() returns the journals ordered from most to fewest papers.
journal_counts_sorted = journal_series.value_counts()
total_papers_with_journal = len(journal_series)

# Sizes of the "top N journals" groups reported below.
top_ranges = [10, 15, 20]

print(f"\n📰 Total papers with journal information: {total_papers_with_journal}")
print(f"📰 Total unique journals: {len(journal_counts_sorted)}")

# --- Summary statistics for Top 10, 15, 20
for top_n in top_ranges:
    if len(journal_counts_sorted) >= top_n:
        top_journals = journal_counts_sorted.head(top_n)
        papers_in_top = top_journals.sum()
        percentage = (papers_in_top / total_papers_with_journal) * 100

        print(f"\n🏆 TOP {top_n} JOURNALS:")
        print(f"  • Papers published: {papers_in_top}/{total_papers_with_journal} ({percentage:.1f}%)")
        print(f"  • Average papers per journal: {papers_in_top / top_n:.1f}")

        # The per-journal listing is only printed for the top 10.
        if top_n == 10:
            print("  • Individual journal breakdown:")
            for i, (journal, count) in enumerate(top_journals.items(), 1):
                print(f"    {i:2d}. {journal}: {count} papers")
    else:
        print(f"\n⚠️  Only {len(journal_counts_sorted)} journals available (less than {top_n})")

# ================================
# Cumulative computation (Top 20)
# ================================

top20_counts = journal_counts_sorted.head(20)
# Running total of papers over the ranked journals, computed once and reused.
top20_cumsum = top20_counts.cumsum()
cumulative_papers = top20_cumsum.values.tolist()
cumulative_percentages = (100 * top20_cumsum / total_papers_with_journal).values.tolist()
# 1-based rank of each journal on the x axis.
journal_positions = list(range(1, len(top20_counts) + 1))

# ================================
# Plotly Visualization
# ================================

fig = go.Figure()

# Cumulative curve
fig.add_trace(go.Scatter(
    x=journal_positions,
    y=cumulative_percentages,
    mode='lines+markers',
    name='Cumulative Percentage',
    line=dict(width=3),
    marker=dict(size=8),
))

# One color and one label per reference line (Top 10 / 15 / 20).
colors = ['#4C72B0', '#DD8452', '#55A868']
labels = ['Top 10', 'Top 15', 'Top 20']

for top_n, color, label in zip(top_ranges, colors, labels):
    # Skip reference lines that lie beyond the number of journals plotted.
    if top_n <= len(journal_positions):
        fig.add_vline(
            x=top_n,
            line_dash="dash",
            line_color=color,
            line_width=2
        )

        # Rank top_n sits at list index top_n - 1.
        fig.add_annotation(
            x=top_n,
            y=cumulative_percentages[top_n - 1],
            text=f"{label}: {cumulative_percentages[top_n - 1]:.1f}%",
            showarrow=True,
            arrowhead=2,
            ax=25,
            ay=-35,
            font=dict(size=12),
        )

# Layout
fig.update_layout(
    title="Cumulative Percentage of Papers Published by Top Journals",
    xaxis_title="Number of Top Journals",
    yaxis_title="Cumulative Percentage of Papers (%)",
    template="plotly_white",
    font=dict(size=14),
    title_font=dict(size=18),
    yaxis=dict(range=[0, 100]),
    height=500,
    showlegend=False
)

fig.show()

# ================================
# Summary Table
# ================================

print("\n📊 SUMMARY TABLE:")
print("=" * 55)
print(f"{'Journals':<10} {'Papers':>10} {'Percentage':>12} {'Avg/Journal':>14}")
print("-" * 55)

for top_n in top_ranges:
    # Rows are only printed for groups that can be filled completely.
    if len(journal_counts_sorted) >= top_n:
        papers_in_top = journal_counts_sorted.head(top_n).sum()
        percentage = (papers_in_top / total_papers_with_journal) * 100
        avg_per_journal = papers_in_top / top_n

        print(f"Top {top_n:<6} {papers_in_top:>10d} {percentage:>11.1f}% {avg_per_journal:>13.1f}")

# ================================
# Key Insights
# ================================

print("\n🔍 Key Insights:")

if len(journal_counts_sorted) >= 20:
    top10_papers = journal_counts_sorted.head(10).sum()
    top20_papers = journal_counts_sorted.head(20).sum()
    # Share of the top-20 papers that already come from the top 10.
    concentration_ratio = (top10_papers / top20_papers) * 100

    print(f"  • Top 10 journals account for {concentration_ratio:.1f}% of top 20 journals' papers")

# Journals ranked below position 20 (0 when there are 20 or fewer journals).
remaining_journals = len(journal_counts_sorted) - 20 if len(journal_counts_sorted) > 20 else 0

if remaining_journals > 0:
    remaining_papers = total_papers_with_journal - top20_counts.sum()
    print(f"  • Remaining {remaining_journals} journals published "
          f"{remaining_papers} papers ({remaining_papers / total_papers_with_journal * 100:.1f}%)")


In [None]:
# ================================
# Journal cleaning
# ================================

# Blank journal names are mapped to NA and dropped together with missing
# values, so neither is counted as a journal.
journal_series = papers_df['journal'].replace('', pd.NA).dropna()
journal_counts_sorted = journal_series.value_counts()
total_papers_with_journal = len(journal_series)

# Positions of the vertical reference lines (Top 10 / 15 / 20).
top_ranges = [10, 15, 20]

# ================================
# Cumulative computation (Top 20)
# ================================

top20_counts = journal_counts_sorted.head(20)
cumulative_papers = top20_counts.cumsum()
cumulative_percentages = 100 * cumulative_papers / total_papers_with_journal
# 1-based rank of each journal on the x axis.
journal_positions = np.arange(1, len(top20_counts) + 1)

# ================================
# Seaborn / Matplotlib plot
# ================================

sns.set_style("whitegrid")

plt.figure(figsize=(9, 6))
ax = plt.gca()

# Cumulative line
ax.plot(
    journal_positions,
    cumulative_percentages,
    marker='o',
    linewidth=2.5
)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# One color and one label per reference line.
colors = ['#4C72B0', '#DD8452', '#55A868']
labels = ['Top 10', 'Top 15', 'Top 20']

for top_n, color, label in zip(top_ranges, colors, labels):
    # Skip reference lines that lie beyond the number of journals plotted.
    if top_n <= len(journal_positions):
        # Rank top_n sits at positional index top_n - 1.
        y_val = cumulative_percentages.iloc[top_n - 1]

        # Draw each reference line in its own color, as the plotly version of
        # this figure does; without color= all lines share the default color.
        ax.axvline(
            x=top_n,
            color=color,
            linestyle='--',
            linewidth=2
        )

        ax.annotate(
            f"{label}: {y_val:.1f}%",
            xy=(top_n, y_val),
            xytext=(top_n - 3, y_val + 6),
            arrowprops=dict(arrowstyle="->"),
            fontsize=11
        )

# Axis labels and title
ax.set_title(
    "Cumulative Percentage of Papers Published by Top Journals",
    fontsize=18,
    pad=12
)
ax.set_xlabel("Number of Top Journals", fontsize=14)
ax.set_ylabel("Cumulative Percentage of Papers (%)", fontsize=14)

# Axis limits
ax.set_xlim(1, len(journal_positions))
ax.set_ylim(0, 100)

# Ticks
ax.tick_params(axis='both', labelsize=12)

# Horizontal grid lines only
ax.grid(axis='x', visible=False)
ax.grid(axis='y', color="#E5E5E5")

# Layout
plt.tight_layout()

# ================================
# Export (SVG and PDF)
# ================================

plt.savefig(os.path.join(PLOTS_PATH, "cumulative_top_journals.svg"), format="svg", dpi=600)
plt.savefig(os.path.join(PLOTS_PATH, "cumulative_top_journals.pdf"), format="pdf", dpi=600)

plt.show()


In [None]:
def _first_affiliation_field(author, field):
    """Return ``field`` from the first affiliation entry of an author record.

    Two record layouts are tried in order: a non-empty ``"affiliations"`` list
    (described as OpenAlex-style in this notebook) and then a non-empty
    ``"affiliation"`` list. The first layout whose first entry is a dict
    decides the result, even when that dict lacks ``field``.

    Args:
        author: One author entry; expected to be a dict. Anything else (None,
            NaN, a plain string, a list, ...) carries no structured affiliation.
        field: Key to read from the first affiliation dict.

    Returns:
        The value stored under ``field``, or None when it cannot be found.
    """
    # Checking the type first also covers None / NaN and avoids calling
    # pd.isna() on list-like values, whose array result has no single truth value.
    if not isinstance(author, dict):
        return None

    for key in ("affiliations", "affiliation"):
        entries = author.get(key)
        if isinstance(entries, list) and entries:
            first_aff = entries[0]
            if isinstance(first_aff, dict):
                return first_aff.get(field)

    return None


def get_affiliation(author):
    """Return the institution of the author's first affiliation, or None."""
    return _first_affiliation_field(author, "institution")


def get_country(author):
    """Return the country of the author's first affiliation, or None."""
    return _first_affiliation_field(author, "country")


In [None]:
def _author_name(author):
    """Return the display name of one exploded author entry, or None.

    Dict entries are read through their 'name' key. A plain string is returned
    unchanged (presumably it already is the name - verify against the data).
    Missing values (None / NaN, e.g. papers without an author list) give None
    instead of raising, matching how get_affiliation / get_country treat them.
    """
    if isinstance(author, dict):
        return author.get('name')
    if isinstance(author, str):
        return author
    return None


# One row per (paper, author): explode() repeats the paper columns for every
# entry of the 'authors' list.
authors_df = papers_df.explode('authors')
authors_df["author"] = authors_df["authors"].apply(_author_name)
authors_df['fst_affiliation'] = authors_df["authors"].apply(get_affiliation)
authors_df['fst_country'] = authors_df["authors"].apply(get_country)
# The raw author records are no longer needed once the fields are extracted.
authors_df = authors_df.drop(columns=["authors"])
authors_df.head()

In [None]:
# Rows per author name, i.e. how many papers each author appears on.
unique_authors = authors_df['author'].value_counts()
n_authors = unique_authors.shape[0]
n_multi_paper_authors = int((unique_authors > 1).sum())

print(f"Total unique authors: {n_authors}")
print(f"Mean authors per paper: {authors_df.shape[0] / papers_df.shape[0]:.2f}")
print(f"Authors with multiple papers: {n_multi_paper_authors}")
print(f"Percent of authors with multiple papers: {np.round(n_multi_paper_authors / n_authors * 100, 2)}%")
unique_authors.head()

In [None]:
# One row per distinct (author, first-affiliation country) pair; the 'size'
# column holds how many author rows fall into that pair.
author_country_groups = authors_df.groupby(['author', 'fst_country'], as_index=False)
country_df = author_country_groups.size()
country_df.head()

In [None]:
# Distinct first-affiliation countries among the author/country pairs.
n_countries = country_df['fst_country'].nunique()
print(f"Total unique countries: {n_countries}")

In [None]:
# Ten countries with the most (author, country) pairs.
# The column names are set explicitly: what value_counts().reset_index() calls
# its columns differs between pandas versions, and the cells below rely on
# 'fst_country' and 'count'.
# assign() builds the percentage on a new frame instead of writing into the
# result of head(). The percentage is relative to all author/country pairs.
top10_countries = (
    country_df['fst_country']
    .value_counts()
    .rename_axis('fst_country')
    .reset_index(name='count')
    .head(10)
    .assign(per_centage=lambda top: np.round(top['count'] / country_df.shape[0] * 100, 2))
)
top10_countries.head()

In [None]:
# Horizontal bar chart of the ten most frequent author countries, exported as SVG.
sns.set_style("whitegrid")
plt.figure(figsize=(9, 6))

# Values to plot: authors per country and their precomputed percentage.
author_counts = top10_countries['count']
percentages = top10_countries['per_centage']

ax = sns.barplot(
    x=author_counts,
    y=top10_countries['fst_country'],
    color="#4C72B0",
    edgecolor="black",
)
# Author counts are whole numbers, so restrict the x ticks to integers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Title, axis labels and tick size
ax.set_title("Top 10 Countries by Number of Authors", fontsize=18, pad=12)
ax.set_xlabel("Number of Authors", fontsize=14)
ax.set_ylabel("Country", fontsize=14)
ax.tick_params(axis='both', labelsize=11)

# Percentage in white, right-aligned 0.5 units before the end of each bar.
bar_label_style = dict(va='center', ha='right', fontsize=12, fontweight='bold', color='white')
for row, (n_authors_in_country, share) in enumerate(zip(author_counts, percentages)):
    ax.text(n_authors_in_country - 0.5, row, f"{share:.1f}%", **bar_label_style)

# Light vertical grid only
ax.grid(axis='x', color="#E5E5E5")
ax.grid(axis='y', visible=False)

plt.tight_layout()

# Export as SVG (vector format), then display.
top10_countries_file = os.path.join(PLOTS_PATH, 'top10_countries.svg')
plt.savefig(top10_countries_file, dpi=600)

plt.show()

In [None]:
def _doi_prefix(doi):
    """Return the part of a DOI before its first "/", or None for non-strings."""
    if not isinstance(doi, str):
        return None
    return doi.partition('/')[0]


# NOTE(review): a DOI stored in URL form ("https://doi.org/10.xxxx/...") would
# yield "https:" here - confirm the DOIs are stored in bare "10.xxxx/..." form.
papers_df["doi_prefix"] = papers_df["doi"].map(_doi_prefix)
papers_df["doi_prefix"].head()

In [None]:
# Papers per DOI prefix, most frequent first, with explicit column names.
doi_df = (
    papers_df["doi_prefix"]
    .value_counts()
    .rename_axis('doi_prefix')
    .reset_index(name='count')
)

# Five most frequent prefixes. Take a real copy: the following cells add
# columns to top_doi_df, which must not happen on a slice of doi_df.
top_doi_df = doi_df.head().copy()
top_doi_df

In [None]:
# DOI registrant prefix -> publisher name, for the prefixes expected in this corpus.
doi_dict = {
    '10.1016': 'Elsevier',
    '10.1007': 'Springer',
    '10.1109': 'IEEE',
    '10.1038': 'Nature Publishing Group',
    '10.3389': 'Frontiers Media SA',
    '10.1155': 'Hindawi Publishing Corporation',
    '10.1080': 'Informa UK (Taylor & Francis)'
}

# Rebuild the table from doi_df instead of mutating the previous top_doi_df:
# the cell can then be re-run (the old version dropped 'doi_prefix' and failed
# on a second run) and no column is written into a slice.
# Prefixes missing from doi_dict keep their raw prefix as the publisher label
# rather than becoming NaN.
top_doi_df = (
    doi_df.head()
    .assign(publisher=lambda top: top['doi_prefix'].map(doi_dict).fillna(top['doi_prefix']))
    [['publisher', 'count']]
    .copy()
)

In [None]:
# Share of all papers (rows of papers_df) attributed to each listed publisher.
paper_share = top_doi_df['count'] / len(papers_df) * 100
top_doi_df['percentage'] = np.round(paper_share, 2)
top_doi_df