In [None]:
"""
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WHO Dataset Exploration\n",
    "Explore and analyze the WHO disease dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Load data. The encoding is pinned to UTF-8 so decoding does not depend on\n",
    "# the platform's default locale encoding.\n",
    "with open('../data/raw/who_dataset.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "print(f\"Total documents: {len(data)}\")\n",
    "# data[0] raises IndexError on an empty dataset, so only peek when a record exists.\n",
    "if data:\n",
    "    print(f\"\\nSample document keys: {list(data[0].keys())}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a DataFrame from the loaded records and preview the first rows\n",
    "df = pd.DataFrame(data)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze text length distribution.\n",
    "# Missing values count as empty text: str(None) / str(nan) would otherwise\n",
    "# contribute 4 / 3 phantom characters per missing field.\n",
    "text_cols = ['key_facts', 'overview', 'symptoms', 'treatment']\n",
    "df['total_length'] = sum(\n",
    "    df[col].fillna('').astype(str).str.len() for col in text_cols\n",
    ")\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.hist(df['total_length'], bins=30, edgecolor='black')\n",
    "plt.xlabel('Total Text Length')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Distribution of Document Lengths')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check field completeness\n",
    "fields = ['key_facts', 'overview', 'symptoms', 'causes', 'treatment', 'self_care']\n",
    "completeness = {}\n",
    "\n",
    "for field in fields:\n",
    "    # Missing values and whitespace-only strings both count as empty\n",
    "    non_empty = df[field].fillna('').astype(str).str.strip() != ''\n",
    "    completeness[field] = non_empty.sum() / len(df) * 100\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "# Materialize the dict views so matplotlib receives plain sequences\n",
    "plt.bar(list(completeness.keys()), list(completeness.values()))\n",
    "plt.xlabel('Field')\n",
    "plt.ylabel('Completeness (%)')\n",
    "plt.title('Field Completeness Analysis')\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
"""

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WHO Dataset Exploration\n",
    "## Comprehensive analysis of the WHO disease fact sheets dataset\n",
    "\n",
    "This notebook explores:\n",
    "- Dataset statistics and completeness\n",
    "- Text length distributions\n",
    "- Field coverage analysis\n",
    "- Sample data inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extend the import path with the parent directory before anything else is imported\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "# Standard library\n",
    "import json\n",
    "import warnings\n",
    "from collections import Counter\n",
    "from pathlib import Path\n",
    "\n",
    "# Third-party\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Silence all warnings for the rest of the session\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Plot defaults used by the figures below\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams.update({'figure.figsize': (12, 6)})\n",
    "\n",
    "print(\"✓ Imports successful\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load WHO dataset\n",
    "data_path = Path('../data/raw/who_dataset.json')\n",
    "\n",
    "# Fail fast: every later cell needs `data`, so a missing file must stop the run\n",
    "# here instead of surfacing as a NameError further down.\n",
    "if not data_path.exists():\n",
    "    raise FileNotFoundError(\n",
    "        f\"❌ Dataset not found at {data_path}! Please run scraper first:\\n\"\n",
    "        \"   python scripts/scrape_who_data.py\"\n",
    "    )\n",
    "\n",
    "with open(data_path, 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "print(f\"✓ Loaded {len(data)} documents\")\n",
    "# data[0] raises IndexError on an empty dataset, so only peek when a record exists.\n",
    "if data:\n",
    "    print(f\"✓ Sample keys: {list(data[0].keys())}\")\n",
    "else:\n",
    "    print(\"⚠ Dataset is empty\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Basic Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the working DataFrame from the loaded records\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "n_rows, n_cols = df.shape\n",
    "print(\"Dataset Shape:\", (n_rows, n_cols))\n",
    "print(\"\\nColumns:\")\n",
    "for column_name in df.columns.tolist():\n",
    "    print(f\"  - {column_name}\")\n",
    "\n",
    "# Preview the first rows as the cell output\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the names of the first ten documents, numbered from 1\n",
    "print(\"Sample Diseases:\")\n",
    "first_names = df['name'].iloc[:10]\n",
    "for position, disease_name in enumerate(first_names, start=1):\n",
    "    print(f\"{position:2d}. {disease_name}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Field Completeness Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of documents holding non-blank text, per field\n",
    "fields = [\n",
    "    'key_facts', 'overview', 'symptoms', 'causes', 'treatment',\n",
    "    'self_care', 'impact', 'who_response', 'reference',\n",
    "]\n",
    "\n",
    "completeness = {}\n",
    "total_docs = len(df)\n",
    "for field in fields:\n",
    "    column = df[field]\n",
    "    # Filled = not missing and not blank after stripping whitespace\n",
    "    has_text = column.notna() & (column.str.strip() != '')\n",
    "    completeness[field] = (has_text.sum() / total_docs) * 100\n",
    "\n",
    "# Tabulate the result, most complete field first\n",
    "completeness_df = pd.DataFrame(\n",
    "    list(completeness.items()),\n",
    "    columns=['Field', 'Completeness (%)'],\n",
    ").sort_values('Completeness (%)', ascending=False)\n",
    "\n",
    "print(completeness_df.to_string(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of per-field completeness, drawn on an explicit Axes\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "palette = plt.cm.viridis(np.linspace(0.3, 0.9, len(completeness_df)))\n",
    "bars = ax.bar(completeness_df['Field'], completeness_df['Completeness (%)'], color=palette)\n",
    "\n",
    "ax.set_xlabel('Field', fontsize=12)\n",
    "ax.set_ylabel('Completeness (%)', fontsize=12)\n",
    "ax.set_title('Field Completeness Analysis', fontsize=14, fontweight='bold')\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "# Headroom above 100% so the labels on full bars stay inside the plot\n",
    "ax.set_ylim(0, 105)\n",
    "\n",
    "# Print the percentage just above each bar\n",
    "for rect in bars:\n",
    "    bar_top = rect.get_height()\n",
    "    ax.text(rect.get_x() + rect.get_width() / 2., bar_top,\n",
    "            f'{bar_top:.1f}%',\n",
    "            ha='center', va='bottom', fontsize=9)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Text Length Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Character length of every field; missing values count as empty text\n",
    "for field in fields:\n",
    "    df[f'{field}_length'] = df[field].fillna('').str.len()\n",
    "\n",
    "# Total content length\n",
    "df['total_length'] = df[[f'{field}_length' for field in fields]].sum(axis=1)\n",
    "\n",
    "# Word count. Missing values are blanked first, as for the character lengths:\n",
    "# str(None) / str(nan) would otherwise add one phantom word per missing field.\n",
    "# Summing the per-field Series is vectorized and replaces the row-wise apply.\n",
    "df['word_count'] = sum(\n",
    "    df[field].fillna('').astype(str).str.split().str.len() for field in fields\n",
    ")\n",
    "\n",
    "print(\"Text Length Statistics:\")\n",
    "print(df[['total_length', 'word_count']].describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distribution of document lengths: characters (left) and words (right)\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# One entry per panel: (column, bar colour, x-axis label, title)\n",
    "panels = [\n",
    "    ('total_length', 'skyblue', 'Total Characters',\n",
    "     'Distribution of Document Lengths (Characters)'),\n",
    "    ('word_count', 'lightcoral', 'Total Words',\n",
    "     'Distribution of Document Lengths (Words)'),\n",
    "]\n",
    "\n",
    "for ax, (column, face_color, x_label, title) in zip(axes, panels):\n",
    "    median_value = df[column].median()\n",
    "    ax.hist(df[column], bins=30, color=face_color, edgecolor='black', alpha=0.7)\n",
    "    # Dashed red line marks the median of the plotted column\n",
    "    ax.axvline(median_value, color='red', linestyle='--', label=f'Median: {median_value:.0f}')\n",
    "    ax.set_xlabel(x_label, fontsize=11)\n",
    "    ax.set_ylabel('Frequency', fontsize=11)\n",
    "    ax.set_title(title, fontsize=12, fontweight='bold')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean character count per field, longest first, with display-friendly labels\n",
    "length_cols = [f'{field}_length' for field in fields]\n",
    "field_lengths = (\n",
    "    df[length_cols]\n",
    "    .mean()\n",
    "    .sort_values(ascending=False)\n",
    "    .rename(lambda col: col.replace('_length', '').replace('_', ' ').title())\n",
    ")\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "palette = plt.cm.plasma(np.linspace(0.2, 0.8, len(field_lengths)))\n",
    "bars = ax.barh(field_lengths.index, field_lengths.values, color=palette)\n",
    "\n",
    "ax.set_xlabel('Average Character Count', fontsize=12)\n",
    "ax.set_ylabel('Field', fontsize=12)\n",
    "ax.set_title('Average Text Length by Field', fontsize=14, fontweight='bold')\n",
    "\n",
    "# Write each mean at the end of its bar\n",
    "for rect in bars:\n",
    "    bar_end = rect.get_width()\n",
    "    ax.text(bar_end, rect.get_y() + rect.get_height() / 2.,\n",
    "            f'{bar_end:.0f}',\n",
    "            ha='left', va='center', fontsize=9, fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Content Quality Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Documents with complete information (all major fields filled).\n",
    "# A field counts as filled only when it holds non-blank text, so missing values\n",
    "# and whitespace-only strings are both treated as empty (the per-field\n",
    "# completeness analysis strips whitespace the same way).\n",
    "major_fields = ['key_facts', 'overview', 'symptoms', 'treatment']\n",
    "df['is_complete'] = df[major_fields].apply(\n",
    "    lambda col: col.fillna('').astype(str).str.strip() != ''\n",
    ").all(axis=1)\n",
    "\n",
    "complete_count = df['is_complete'].sum()\n",
    "complete_pct = (complete_count / len(df)) * 100\n",
    "\n",
    "print(f\"Documents with complete major fields: {complete_count}/{len(df)} ({complete_pct:.1f}%)\")\n",
    "\n",
    "# Visualize the complete / incomplete split as a pie chart\n",
    "fig, ax = plt.subplots(figsize=(8, 8))\n",
    "sizes = [complete_count, len(df) - complete_count]\n",
    "labels = [f'Complete\\n({complete_count})', f'Incomplete\\n({len(df) - complete_count})']\n",
    "colors = ['#2ecc71', '#e74c3c']\n",
    "# Pull the 'Complete' wedge slightly out of the pie\n",
    "explode = (0.05, 0)\n",
    "\n",
    "ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',\n",
    "       startangle=90, explode=explode, textprops={'fontsize': 12})\n",
    "ax.set_title('Document Completeness (Major Fields)', fontsize=14, fontweight='bold')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Sample Document Inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the first document whose major fields are all filled\n",
    "if complete_count > 0:\n",
    "    sample = df[df['is_complete']].iloc[0]\n",
    "\n",
    "    print(\"=\"*80)\n",
    "    print(f\"SAMPLE DOCUMENT: {sample['name']}\")\n",
    "    print(\"=\"*80)\n",
    "    print(f\"\\nURL: {sample['url']}\")\n",
    "    print(f\"\\nTotal Length: {sample['total_length']} characters\")\n",
    "    print(f\"Word Count: {sample['word_count']} words\")\n",
    "\n",
    "    for field in fields:\n",
    "        content = sample[field]\n",
    "        # is_complete only covers the major fields, so other fields may still be\n",
    "        # missing. NaN is truthy and has no .strip(), hence the explicit type check.\n",
    "        if not isinstance(content, str) or not content.strip():\n",
    "            continue\n",
    "\n",
    "        print(f\"\\n{'='*80}\")\n",
    "        print(f\"{field.upper().replace('_', ' ')}:\")\n",
    "        print(f\"{'='*80}\")\n",
    "        # Truncate if too long\n",
    "        if len(content) > 500:\n",
    "            print(content[:500] + \"...\")\n",
    "            print(f\"\\n[Truncated. Full length: {len(content)} characters]\")\n",
    "        else:\n",
    "            print(content)\n",
    "else:\n",
    "    print(\"No complete documents found in dataset\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Top Diseases by Content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The ten documents with the most characters\n",
    "ranking_columns = ['name', 'total_length', 'word_count']\n",
    "top_diseases = df.nlargest(10, 'total_length').loc[:, ranking_columns]\n",
    "# Re-label the rows 1..N so the index reads as a rank\n",
    "top_diseases.index = pd.RangeIndex(start=1, stop=len(top_diseases) + 1)\n",
    "\n",
    "print(\"Top 10 Diseases by Content Length:\")\n",
    "print(top_diseases.to_string())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the headline numbers between two 80-character rules\n",
    "separator = \"=\" * 80\n",
    "print(\"\\n\" + separator)\n",
    "print(\"DATASET SUMMARY\")\n",
    "print(separator)\n",
    "print(f\"Total Documents: {len(df)}\")\n",
    "print(f\"Complete Documents: {complete_count} ({complete_pct:.1f}%)\")\n",
    "print(f\"\\nAverage Document Length: {df['total_length'].mean():.0f} characters\")\n",
    "print(f\"Average Word Count: {df['word_count'].mean():.0f} words\")\n",
    "print(\"\\nField Completeness (Top 3):\")\n",
    "# completeness_df is already sorted, so its first three rows are the top three\n",
    "top_three = completeness_df.head(3)\n",
    "for field_name, pct in zip(top_three['Field'], top_three['Completeness (%)']):\n",
    "    print(f\"  {field_name:15s}: {pct:5.1f}%\")\n",
    "print(\"\\n\" + separator)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Export Summary\n",
    "Save exploration results for reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the headline numbers of this exploration\n",
    "top_ten_names = df.nlargest(10, 'total_length')['name'].tolist()\n",
    "summary = dict(\n",
    "    total_documents=len(df),\n",
    "    complete_documents=int(complete_count),\n",
    "    avg_length_chars=float(df['total_length'].mean()),\n",
    "    avg_word_count=float(df['word_count'].mean()),\n",
    "    field_completeness=completeness,\n",
    "    top_diseases=top_ten_names,\n",
    ")\n",
    "\n",
    "# Write the summary as indented JSON, creating the target directory if needed\n",
    "output_path = Path('../data/processed/exploration_summary.json')\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "output_path.write_text(json.dumps(summary, indent=2))\n",
    "\n",
    "print(f\"✓ Summary saved to {output_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}