In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CORD-19 Dataset Exploration\n",
    "## COVID-19 Research Papers Analysis\n",
    "\n",
    "This notebook provides a comprehensive exploration of the CORD-19 dataset metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# Make the project modules under ../src importable. The path is made absolute and\n",
    "# appended only once, so re-running this cell does not stack duplicate entries.\n",
    "SRC_DIR = os.path.abspath('../src')\n",
    "if SRC_DIR not in sys.path:\n",
    "    sys.path.append(SRC_DIR)\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "\n",
    "from data_loader import DataLoader\n",
    "from data_cleaner import DataCleaner\n",
    "from analyzer import DataAnalyzer\n",
    "from visualizer import DataVisualizer\n",
    "\n",
    "# Notebook-wide settings: silence all warnings, default figure size, seaborn grid style\n",
    "warnings.filterwarnings('ignore')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)\n",
    "sns.set_style(\"whitegrid\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 1: Data Loading and Basic Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize data loader\n",
    "loader = DataLoader('../data/metadata.csv')\n",
    "\n",
    "# Value handed to load_data as sample_size (use it for large datasets);\n",
    "# adjust or remove sample_size as needed\n",
    "SAMPLE_SIZE = 50000\n",
    "\n",
    "# Load data\n",
    "df = loader.load_data(sample_size=SAMPLE_SIZE)\n",
    "\n",
    "# Plain string: no placeholders, so no f-prefix is needed\n",
    "print(\"Dataset loaded successfully!\")\n",
    "print(f\"Shape: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show a few rows of the loaded data\n",
    "n_preview_rows = 3\n",
    "loader.display_sample(n_preview_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the loader's basic exploration and pull out the pieces reported below\n",
    "exploration_results = loader.basic_exploration()\n",
    "dataset_shape = exploration_results['shape']\n",
    "memory_mb = exploration_results['memory_usage']\n",
    "first_columns = exploration_results['columns'][:10]  # only the first 10 columns are shown\n",
    "\n",
    "print(\"Dataset Information:\")\n",
    "print(f\"Rows: {dataset_shape[0]:,}\")\n",
    "print(f\"Columns: {dataset_shape[1]}\")\n",
    "print(f\"Memory Usage: {memory_mb:.2f} MB\")\n",
    "print(f\"\\nColumns: {first_columns}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column overview from the loader\n",
    "column_info = loader.get_column_info()\n",
    "print(\"Column Information (showing columns with <50% missing):\")\n",
    "\n",
    "# Keep only the rows whose 'Null Percentage' is below 50\n",
    "below_half_missing = column_info['Null Percentage'] < 50\n",
    "useful_columns = column_info.loc[below_half_missing]\n",
    "print(useful_columns.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 2: Data Cleaning and Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the loaded frame in a cleaner and fetch its cleaned result\n",
    "cleaner = DataCleaner(df)\n",
    "cleaned_df = cleaner.get_cleaned_data()\n",
    "\n",
    "# Report the shape before and after cleaning, plus the surviving columns\n",
    "remaining_columns = list(cleaned_df.columns)\n",
    "print(f\"Original shape: {df.shape}\")\n",
    "print(f\"Cleaned shape: {cleaned_df.shape}\")\n",
    "print(f\"\\nRemaining columns: {remaining_columns}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check data types and missing values after cleaning\n",
    "print(\"Data Info After Cleaning:\")\n",
    "# DataFrame.info() writes its report to stdout itself and returns None, so it is\n",
    "# called directly; wrapping it in print() would add a stray None line.\n",
    "cleaned_df.info()\n",
    "print(\"\\nMissing Values:\")\n",
    "print(cleaned_df.isnull().sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 3: Data Analysis and Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up the analysis and plotting helpers\n",
    "analyzer = DataAnalyzer(cleaned_df)\n",
    "visualizer = DataVisualizer()\n",
    "\n",
    "# Fetch the basic statistics and print one per line, turning snake_case keys\n",
    "# into title-cased labels\n",
    "basic_stats = analyzer.get_basic_statistics()\n",
    "print(\"Basic Statistics:\")\n",
    "for stat_name, stat_value in basic_stats.items():\n",
    "    stat_label = stat_name.replace('_', ' ').title()\n",
    "    print(f\"{stat_label}: {stat_value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the summary dashboard figure from basic_stats, then render it\n",
    "fig = visualizer.create_summary_dashboard(basic_stats)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Publications by year analysis\n",
    "yearly_data = analyzer.analyze_publications_by_year()\n",
    "\n",
    "# Same empty-result guard as the other analysis cells in this notebook, so an\n",
    "# empty result is reported instead of being printed and plotted\n",
    "if not yearly_data.empty:\n",
    "    print(\"Publications by Year:\")\n",
    "    print(yearly_data.to_string(index=False))\n",
    "    \n",
    "    # Visualization\n",
    "    fig = visualizer.plot_publications_by_year(yearly_data)\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"No yearly data available\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top journals: 20 is passed to get_top_journals, the first 10 rows are printed\n",
    "journal_data = analyzer.get_top_journals(20)\n",
    "if journal_data.empty:\n",
    "    print(\"No journal data available\")\n",
    "else:\n",
    "    print(\"Top 10 Journals:\")\n",
    "    top_rows = journal_data.head(10)\n",
    "    print(top_rows.to_string(index=False))\n",
    "\n",
    "    # The plot receives the full result, not just the printed rows\n",
    "    fig = visualizer.plot_top_journals(journal_data)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Title word frequencies: 30 is passed to analyze_title_words, the first 15 rows are printed\n",
    "word_data = analyzer.analyze_title_words(30)\n",
    "if word_data.empty:\n",
    "    print(\"No word data available\")\n",
    "else:\n",
    "    print(\"Most Frequent Words in Titles:\")\n",
    "    top_rows = word_data.head(15)\n",
    "    print(top_rows.to_string(index=False))\n",
    "\n",
    "    # The word cloud receives the full result, not just the printed rows\n",
    "    fig = visualizer.create_word_cloud(word_data)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source distribution: the first 10 rows are printed, the full result is plotted\n",
    "source_data = analyzer.analyze_source_distribution()\n",
    "if source_data.empty:\n",
    "    print(\"No source data available\")\n",
    "else:\n",
    "    print(\"Source Distribution:\")\n",
    "    top_rows = source_data.head(10)\n",
    "    print(top_rows.to_string(index=False))\n",
    "\n",
    "    # Plot\n",
    "    fig = visualizer.plot_source_distribution(source_data)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Monthly trends: the whole result is printed and plotted when it is not empty\n",
    "monthly_data = analyzer.analyze_monthly_trends()\n",
    "if monthly_data.empty:\n",
    "    print(\"No monthly data available\")\n",
    "else:\n",
    "    print(\"Monthly Publication Trends:\")\n",
    "    monthly_table = monthly_data.to_string(index=False)\n",
    "    print(monthly_table)\n",
    "\n",
    "    # Plot\n",
    "    fig = visualizer.plot_monthly_trends(monthly_data)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 4: Data Export for Streamlit App"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save cleaned data for Streamlit app\n",
    "cleaned_df.to_csv('../data/cleaned_metadata.csv', index=False)\n",
    "print(\"Cleaned data saved to '../data/cleaned_metadata.csv'\")\n",
    "\n",
    "# Save analysis results\n",
    "# to_csv does not create missing parent directories, so make sure the results\n",
    "# folder exists before anything is written into it (os is imported in the first code cell)\n",
    "os.makedirs('../results', exist_ok=True)\n",
    "yearly_data.to_csv('../results/yearly_publications.csv', index=False)\n",
    "if not journal_data.empty:\n",
    "    journal_data.to_csv('../results/top_journals.csv', index=False)\n",
    "if not word_data.empty:\n",
    "    word_data.to_csv('../results/word_frequency.csv', index=False)\n",
    "\n",
    "print(\"Analysis results saved to results folder\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Key Findings Summary\n",
    "\n",
    "Based on the analysis above, summarize your key findings here:\n",
    "\n",
    "1. **Publication Trends**: [Add your observations]\n",
    "2. **Top Journals**: [Add your observations]\n",
    "3. **Research Focus**: [Based on word frequency analysis]\n",
    "4. **Data Sources**: [Based on source distribution]\n",
    "5. **Seasonal Patterns**: [Based on monthly analysis]\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}