
# CORD-19 Dataset Analysis: COVID-19 Research Publications

## Overview
This notebook analyzes the CORD-19 dataset metadata to understand patterns in COVID-19 research publications. The analysis covers:

- **Data Loading & Exploration**: Understanding the dataset structure and quality
- **Publication Trends**: Temporal patterns in research output  
- **Journal Analysis**: Top publishing venues and distribution
- **Text Mining**: Word frequency and content analysis
- **Visualizations**: Static matplotlib/seaborn charts and statistical summaries

## Dataset Information
- **Source**: [CORD-19 Dataset](https://www.semanticscholar.org/cord19/download)
- **Content**: Metadata for COVID-19 research papers
- **Size**: ~500MB+ (metadata only)
- **Columns**: Title, abstract, authors, journal, publication date, DOI, etc.

## Requirements
```bash
pip install pandas numpy matplotlib seaborn plotly wordcloud
```

---


In [18]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Text analysis
from wordcloud import WordCloud
import re
from collections import Counter

# Configure plotting: matplotlib's default style, the seaborn "husl" colour
# palette, and a 12x6 default figure size for every figure created below.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

# Confirm the imports above succeeded and record when the notebook was run.
print("üìö Libraries loaded successfully!")
print(f"üìÖ Analysis date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")


üìö Libraries loaded successfully!
üìÖ Analysis date: 2025-09-19 17:20


---

# Part 1: Data Loading and Basic Exploration

In this section, we:
1. Load the CORD-19 metadata CSV file
2. Examine the basic structure and dimensions
3. Identify data types and column information
4. Assess data quality and completeness

**Expected Time**: 2-3 hours (including data download)

In [None]:
def load_cord19_data(file_path="metadata.csv", sample_size=None):
    """Load the CORD-19 metadata CSV, optionally down-sampled.

    Args:
        file_path: Path of the metadata CSV file.
        sample_size: Number of rows to return. When falsy (``None`` or 0) the
            whole file is loaded. Otherwise the file is read in chunks of
            10,000 rows until at least ``1.2 * sample_size`` rows are
            available, and ``sample_size`` rows are drawn from those rows with
            a fixed seed. The sample therefore comes from the start of the
            file, not from the file as a whole.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file is missing or any
        other error occurs while reading (the error is printed).
    """
    print("üì• Loading CORD-19 metadata...")
    print(f"   File: {file_path}")

    try:
        if sample_size:
            print(f"   Sampling: {sample_size:,} random papers")
            rows_needed = sample_size * 1.2  # Get extra for sampling
            chunk_list = []
            rows_read = 0

            # Read in chunks for memory efficiency. The running row count
            # replaces re-concatenating every chunk on each iteration.
            reader = pd.read_csv(file_path, chunksize=10000)
            try:
                for chunk in reader:
                    chunk_list.append(chunk)
                    rows_read += len(chunk)
                    if rows_read >= rows_needed:
                        break
            finally:
                # Stopping early leaves the file handle open unless closed here.
                reader.close()

            df = pd.concat(chunk_list, ignore_index=True)
            df = df.sample(n=min(sample_size, len(df)), random_state=42)
        else:
            print("   Loading full dataset...")
            df = pd.read_csv(file_path)

        print("‚úÖ Dataset loaded successfully!")
        print(f"üìä Final shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
        print(f"üíæ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        return df

    except FileNotFoundError:
        print("‚ùå File not found!")
        print("üì• Please download metadata.csv from: https://www.semanticscholar.org/cord19/download")
        return None
    except Exception as e:
        # Deliberate best-effort: the notebook reports the problem and carries
        # on with ``None`` instead of aborting the cell.
        print(f"‚ùå Error loading data: {e}")
        return None
    
# To analyze the full dataset instead of a sample, call
# load_cord19_data("metadata.csv") without sample_size.
df = load_cord19_data("metadata.csv", sample_size=10000)  # Sample for testing

## Basic Dataset Exploration

Let's examine the structure and content of our dataset:

In [None]:
# Dataset overview: overall size, one summary line per column (dtype,
# completeness, two sample values), the first rows and describe() statistics.
# Skipped entirely when loading failed and df is None.
if df is not None:
    print("üîç DATASET OVERVIEW")
    print("=" * 60)
    print(f"Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
    print(f"Size in memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"Data types: {df.dtypes.value_counts().to_dict()}")

    print("\nüìã COLUMN INFORMATION:")
    print("-" * 80)

    for i, col in enumerate(df.columns, 1):
        column = df[col]
        dtype = str(column.dtype)
        non_null = column.count()
        null_count = column.isnull().sum()
        null_pct = (null_count / len(df)) * 100

        # Preview the first two non-null values; only previews longer than
        # 50 characters are flattened to one line and truncated.
        sample_vals = column.dropna().head(2).values
        sample_str = str(sample_vals)
        if len(sample_str) > 50:
            sample_str = sample_str.replace('\n', ' ')[:50] + '...'

        print(f"{i:2d}. {col:<20} | {dtype:<12} | {non_null:>7,} non-null | {null_pct:5.1f}% missing | {sample_str}")

    print("\nüìÑ FIRST 3 ROWS:")
    display(df.head(3))

    print("\nüìä BASIC STATISTICS:")
    display(df.describe(include='all').round(2))

## Missing Data Analysis

Understanding data completeness is crucial for reliable analysis. Let's examine missing data patterns:

In [None]:
def analyze_missing_data(df):
    """Report and plot how many values are missing in each column.

    Prints the ten columns with the most missing values and, when any column
    has missing values, draws a bar chart of those columns next to a heatmap
    of missingness for the key metadata columns (first 1,000 rows).

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``Column``, ``Missing_Count`` and ``Missing_Percent``, sorted by
        missing count in descending order.
    """
    print("üîç MISSING DATA ANALYSIS")
    print("=" * 50)

    # Per-column null counts and their share of all rows.
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = (null_counts / len(df)) * 100

    missing_df = pd.DataFrame({
        'Column': null_counts.index,
        'Missing_Count': null_counts.values,
        'Missing_Percent': null_share.values
    })

    incomplete = missing_df[missing_df['Missing_Count'] > 0]

    # Nothing to list or plot when every column is complete.
    if len(incomplete) == 0:
        print("‚úÖ No missing data found!")
        return missing_df

    worst = incomplete.head(10)

    print("üìä Columns with missing data:")
    for name, count, share in zip(worst['Column'], worst['Missing_Count'], worst['Missing_Percent']):
        print(f"   {name:<25}: {count:>7,} ({share:5.1f}%)")

    fig, (bar_ax, heat_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: percentage missing for the ten worst columns.
    bar_ax.barh(worst['Column'], worst['Missing_Percent'], color='coral', alpha=0.7)
    bar_ax.set_xlabel('Missing Data Percentage (%)')
    bar_ax.set_title('Missing Data by Column (Top 10)')
    bar_ax.grid(axis='x', alpha=0.3)

    # Right panel: row-level missingness for the key metadata columns.
    key_columns = ['title', 'abstract', 'authors', 'journal', 'publish_time', 'doi']
    present = [name for name in key_columns if name in df.columns]

    if len(present) > 1:
        null_mask = df[present].head(1000).isnull()  # Sample for visualization
        sns.heatmap(null_mask, cbar=True, yticklabels=False, cmap='viridis', ax=heat_ax)
        heat_ax.set_title('Missing Data Heatmap (Key Columns, Sample)')
        heat_ax.set_xlabel('Columns')
    else:
        heat_ax.text(0.5, 0.5, 'Insufficient columns\nfor heatmap',
                     ha='center', va='center', transform=heat_ax.transAxes, fontsize=14)
        heat_ax.set_title('Missing Data Heatmap')

    plt.tight_layout()
    plt.show()

    return missing_df

# Run the missing-data report only when the dataset was loaded
# (load_cord19_data returns None on failure).
if df is not None:
    missing_analysis = analyze_missing_data(df)

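# Part 2: Data Cleaning and Preparation

## Comprehensive data cleaning for CORD-19 metadata

### Cleaning steps:
1. Remove papers without titles (essential for analysis)
2. Parse publication dates, extract the year, and keep only papers from 1990-2024 (papers with no parseable date are kept)
3. Create derived features (abstract word count, title length and word count)
4. Standardize journal names (fill missing values, trim whitespace, title-case)
5. Derive author features (has-authors flag, author count)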

In [None]:
# Text that stands for "no value" once a cell has been rendered as a string.
_MISSING_TEXT = {'nan', 'none', ''}


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return (part / whole) * 100 if whole else 0.0


def _count_parts(series, sep=None):
    """Count the ``sep``-separated parts of each value; missing values count as 0."""

    def count(value):
        if not isinstance(value, str):
            if pd.isna(value):
                return 0
            value = str(value)
        if value.lower() in _MISSING_TEXT:
            return 0
        return len(value.split(sep))

    return series.apply(count)


def _parse_one_date(value):
    """Parse a single value to a timezone-naive Timestamp, or NaT on failure."""
    stamp = pd.to_datetime(value, errors='coerce')
    if pd.notna(stamp) and stamp.tzinfo is not None:
        stamp = stamp.tz_localize(None)
    return stamp


def _parse_publish_dates(raw):
    """Parse a date column, retrying values the bulk parse could not read."""
    parsed = pd.to_datetime(raw, errors='coerce')

    # pandas >= 2.0 infers a single format from the first value and coerces
    # everything that does not match to NaT, so year-only entries such as
    # "2020" are lost next to full dates. Retry those values one at a time.
    retry = (parsed.isna() & raw.notna()).to_numpy()
    if retry.any():
        parsed[retry] = [_parse_one_date(value) for value in raw[retry]]
    return parsed


def clean_cord19_data(df, min_year=1990, max_year=2024):
    """Clean CORD-19 metadata and add derived columns.

    Steps, each applied only when the relevant column exists:
      1. Drop rows whose ``title`` is missing or blank.
      2. Parse ``publish_time``, add ``publication_year``, and drop rows whose
         year lies outside ``[min_year, max_year]`` (rows without a parseable
         date are kept).
      3. Add ``abstract_word_count``, ``title_length`` and
         ``title_word_count``.
      4. Fill missing ``journal`` values with ``'Unknown Journal'``, then trim
         and title-case the names.
      5. Add ``has_authors`` and ``author_count`` (authors split on ``;``).

    Args:
        df: Raw metadata DataFrame; it is not modified.
        min_year: Earliest publication year to keep (inclusive).
        max_year: Latest publication year to keep (inclusive).

    Returns:
        A cleaned copy of ``df``.
    """
    print("üßπ DATA CLEANING PIPELINE")
    print("=" * 40)

    df_clean = df.copy()
    original_size = len(df_clean)

    print(f"üìä Starting with: {original_size:,} papers")

    # Step 1: Remove papers without titles
    if 'title' in df_clean.columns:
        before_title = len(df_clean)
        df_clean = df_clean.dropna(subset=['title'])
        # astype(str) keeps the blank check working when the column is not
        # string-typed (for example, a column that was entirely empty).
        has_text = df_clean['title'].astype(str).str.strip() != ''
        df_clean = df_clean[has_text].copy()
        removed_title = before_title - len(df_clean)
        print(f"üóëÔ∏è  Removed {removed_title:,} papers without titles")

    # Step 2: Handle publication dates
    if 'publish_time' in df_clean.columns:
        print("üìÖ Processing publication dates...")

        df_clean['publish_time'] = _parse_publish_dates(df_clean['publish_time'])
        df_clean['publication_year'] = df_clean['publish_time'].dt.year

        # Keep plausible years; rows without a year stay in the dataset.
        before_year_filter = len(df_clean)
        years = df_clean['publication_year']
        keep = years.between(min_year, max_year) | years.isnull()
        df_clean = df_clean[keep].copy()
        removed_years = before_year_filter - len(df_clean)
        print(f"üóëÔ∏è  Removed {removed_years:,} papers with invalid years")

        year_range = f"{df_clean['publication_year'].min():.0f} - {df_clean['publication_year'].max():.0f}"
        print(f"üìÖ Publication year range: {year_range}")

    # Step 3: Create derived features
    print("üîß Creating derived features...")

    if 'abstract' in df_clean.columns:
        df_clean['abstract_word_count'] = _count_parts(df_clean['abstract'])
        avg_abstract_len = df_clean['abstract_word_count'].mean()
        print(f"üìù Average abstract length: {avg_abstract_len:.0f} words")

    if 'title' in df_clean.columns:
        titles = df_clean['title'].astype(str)
        df_clean['title_length'] = titles.apply(len)
        df_clean['title_word_count'] = titles.apply(lambda text: len(text.split()))
        avg_title_len = df_clean['title_length'].mean()
        print(f"üìù Average title length: {avg_title_len:.0f} characters")

    # Step 4: Clean journal names
    if 'journal' in df_clean.columns:
        journals = df_clean['journal'].fillna('Unknown Journal')
        df_clean['journal'] = journals.str.strip().str.title()

        unique_journals = df_clean['journal'].nunique()
        print(f"üì∞ Found {unique_journals:,} unique journals")

    # Step 5: Handle author information
    if 'authors' in df_clean.columns:
        df_clean['has_authors'] = df_clean['authors'].notna()
        df_clean['author_count'] = _count_parts(df_clean['authors'], ';')
        papers_with_authors = df_clean['has_authors'].sum()
        print(f"üë• Papers with author info: {papers_with_authors:,} ({_percent(papers_with_authors, len(df_clean)):.1f}%)")

    final_size = len(df_clean)
    removed_total = original_size - final_size

    print("\n‚úÖ Cleaning completed!")
    # _percent guards the empty-input case, which used to divide by zero.
    print(f"üìä Final dataset: {final_size:,} papers ({removed_total:,} removed, {_percent(removed_total, original_size):.1f}%)")

    return df_clean

# Apply cleaning. df_clean is always defined (None when loading failed) so the
# `df_clean is not None` guards in the later cells never raise NameError.
df_clean = clean_cord19_data(df) if df is not None else None
if df_clean is not None:
    print(f"\nüéØ Ready for analysis with {len(df_clean):,} papers!")


## Data Quality Assessment

Let's verify the quality of our cleaned dataset:

In [None]:
def assess_data_quality(df):
    """Print a quality report for a cleaned metadata DataFrame.

    The report covers the total row count, completeness of the key fields,
    the five most common publication years, the three most common journals,
    and abstract-length statistics. Sections whose column is absent are
    skipped. Nothing is returned.

    Args:
        df: Cleaned metadata DataFrame.
    """
    print("üèÜ DATA QUALITY ASSESSMENT")
    print("=" * 40)

    total_papers = len(df)
    print(f"üìÑ Total papers: {total_papers:,}")

    # Completeness of each key field, flagged by threshold (>90, >70, else).
    print("\nüìä Completeness Assessment:")
    for field in ('title', 'abstract', 'authors', 'journal', 'publication_year'):
        if field not in df.columns:
            continue

        filled = df[field].notna().sum()
        share = (filled / total_papers) * 100

        if share > 90:
            status = "‚úÖ"
        elif share > 70:
            status = "‚ö†Ô∏è"
        else:
            status = "‚ùå"

        print(f"   {status} {field:<18}: {share:5.1f}% complete ({filled:,}/{total_papers:,})")

    # Most common publication years.
    if 'publication_year' in df.columns:
        print("\nüìÖ Top publication years:")
        for year, papers in df['publication_year'].value_counts().head(5).items():
            print(f"   üìà {year:.0f}: {papers:,} papers")

    # Most common journals.
    if 'journal' in df.columns:
        print("\nüì∞ Top journals:")
        for journal, papers in df['journal'].value_counts().head(3).items():
            print(f"   üìñ {journal}: {papers:,} papers")

    # Abstract length statistics.
    if 'abstract_word_count' in df.columns:
        word_counts = df['abstract_word_count']
        print("\nüìù Abstract statistics:")
        print(f"   Average length: {word_counts.mean():.0f} words")
        print(f"   Median length: {word_counts.median():.0f} words")

        # Papers with substantial abstracts (>50 words)
        substantial = (word_counts > 50).sum()
        print(f"   Papers with >50 words: {substantial:,} ({substantial/total_papers*100:.1f}%)")

# Print the quality report for the cleaned dataset, if there is one.
if df_clean is not None:
    assess_data_quality(df_clean)

# Part 3: Data Analysis and Visualization

Now for the main analysis! We'll explore:

1. **Publication trends over time** - How has COVID-19 research evolved?
2. **Journal analysis** - Which venues publish the most research?  
3. **Text analysis** - What topics and terms are most common?
4. **Statistical summaries** - Key insights and patterns

In [None]:
def analyze_publication_trends(df):
    """Summarize and plot the number of papers published per year.

    Prints the covered year range, the peak year and the counts from 2020
    onwards, then draws the full timeline next to either the 2020+ trend
    (when it spans more than one year) or the five most recent years.

    Args:
        df: DataFrame with a ``publication_year`` column.

    Returns:
        Series of paper counts indexed by year in ascending order. The Series
        is empty (and nothing is plotted) when no row has a year. ``None``
        when the ``publication_year`` column is missing.
    """
    print("üìà PUBLICATION TRENDS ANALYSIS")
    print("=" * 40)

    if 'publication_year' not in df.columns:
        print("‚ùå No publication year data available")
        return None

    # Calculate yearly statistics
    yearly_counts = df['publication_year'].value_counts().sort_index()

    # idxmax() raises on an empty Series, so stop before the summary and the
    # plots when no row has a publication year.
    if yearly_counts.empty:
        print("‚ùå No publication year data available")
        return yearly_counts

    peak_year = yearly_counts.idxmax()
    peak_count = yearly_counts.max()

    print("üìä Publication timeline:")
    print(f"   Years covered: {yearly_counts.index.min():.0f} - {yearly_counts.index.max():.0f}")
    print(f"   Peak year: {peak_year:.0f} ({peak_count:,} papers)")
    print(f"   Total years: {len(yearly_counts)}")

    # Recent trends (2020 onwards - COVID era); empty when nothing qualifies.
    covid_era = yearly_counts[yearly_counts.index >= 2020]
    if len(covid_era) > 0:
        print("\nü¶† COVID-19 era trends (2020+):")
        for year, count in covid_era.items():
            print(f"   {year:.0f}: {count:,} papers")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Full timeline
    axes[0].bar(yearly_counts.index, yearly_counts.values, color='steelblue', alpha=0.7, width=0.8)
    axes[0].set_title('COVID-19 Research Publications by Year', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Publication Year')
    axes[0].set_ylabel('Number of Papers')
    axes[0].grid(axis='y', alpha=0.3)
    axes[0].tick_params(axis='x', rotation=45)

    # Mark the peak year on the timeline.
    axes[0].annotate(f'Peak: {peak_count:,}',
                     xy=(peak_year, peak_count),
                     xytext=(peak_year, peak_count * 1.1),
                     arrowprops=dict(arrowstyle='->', color='red'),
                     ha='center', fontweight='bold')

    # COVID era focus (needs at least two years to draw a trend line)
    if len(covid_era) > 1:
        axes[1].plot(covid_era.index, covid_era.values, marker='o', linewidth=3, markersize=8, color='darkred')
        axes[1].fill_between(covid_era.index, covid_era.values, alpha=0.3, color='darkred')
        axes[1].set_title('COVID-19 Era Research Trend', fontsize=14, fontweight='bold')
        axes[1].set_xlabel('Year')
        axes[1].set_ylabel('Papers Published')
        axes[1].grid(True, alpha=0.3)
    else:
        # Otherwise show the most recent years (at most five).
        recent_years = yearly_counts.tail(5)
        axes[1].bar(recent_years.index, recent_years.values, color='darkgreen', alpha=0.7)
        axes[1].set_title('Recent Publication Trend', fontsize=14, fontweight='bold')
        axes[1].set_xlabel('Year')
        axes[1].set_ylabel('Papers Published')
        axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    return yearly_counts

# Run the publication-year analysis on the cleaned dataset, if there is one.
if df_clean is not None:
    yearly_analysis = analyze_publication_trends(df_clean)

## Journal Analysis

Let's identify the top journals publishing COVID-19 research and understand the distribution:

In [None]:
def analyze_journals(df, top_n=15):
    """Analyze journal publication patterns.

    Prints the number of distinct journals, the ``top_n`` journals with their
    share of all papers and the share held by the top 10 and top 50 journals,
    then plots the top journals and the distribution of papers per journal.

    Args:
        df: DataFrame with a ``journal`` column.
        top_n: Number of journals to list and plot.

    Returns:
        Series of paper counts per journal in descending order. The Series is
        empty (and nothing is plotted) when no row has a journal. ``None``
        when the ``journal`` column is missing.
    """
    print(f"üì∞ JOURNAL ANALYSIS (TOP {top_n})")
    print("=" * 50)

    if 'journal' not in df.columns:
        print("‚ùå No journal data available")
        return None

    # Calculate journal statistics
    journal_counts = df['journal'].value_counts()

    # With no journal values the averages below would divide by zero and the
    # plots would have nothing to draw.
    if journal_counts.empty:
        print("‚ùå No journal data available")
        return journal_counts

    total_papers = len(df)
    unique_journals = len(journal_counts)

    print("üìä Journal Overview:")
    print(f"   Total unique journals: {unique_journals:,}")
    print(f"   Papers per journal (avg): {total_papers/unique_journals:.1f}")

    # Top journals
    top_journals = journal_counts.head(top_n)
    print(f"\nüèÜ Top {top_n} journals by publication count:")

    for i, (journal, count) in enumerate(top_journals.items(), 1):
        percentage = (count / total_papers) * 100
        print(f"{i:2d}. {journal:<50} {count:>6,} ({percentage:5.1f}%)")

    # Concentration analysis
    top_10_share = (journal_counts.head(10).sum() / total_papers) * 100
    top_50_share = (journal_counts.head(50).sum() / total_papers) * 100

    print("\nüìà Publication concentration:")
    print(f"   Top 10 journals: {top_10_share:.1f}% of all papers")
    print(f"   Top 50 journals: {top_50_share:.1f}% of all papers")

    # Visualization
    fig, axes = plt.subplots(2, 1, figsize=(16, 12))

    # Horizontal bar chart of top journals; long names are cut to 50 characters.
    positions = range(len(top_journals))
    axes[0].barh(positions, top_journals.values, color='coral', alpha=0.7)
    axes[0].set_yticks(positions)
    axes[0].set_yticklabels([j[:50] + '...' if len(j) > 50 else j for j in top_journals.index])
    axes[0].set_xlabel('Number of Papers')
    axes[0].set_title(f'Top {top_n} Journals Publishing COVID-19 Research', fontsize=14, fontweight='bold')
    axes[0].grid(axis='x', alpha=0.3)

    # Add value labels just past the end of each bar.
    max_count = top_journals.values.max()
    for i, count in enumerate(top_journals.values):
        axes[0].text(count + max_count * 0.01, i, f'{count:,}',
                     va='center', fontweight='bold')

    # Distribution analysis - log scale
    journal_sizes = journal_counts.values
    axes[1].hist(journal_sizes, bins=50, color='lightblue', alpha=0.7, edgecolor='black')
    axes[1].set_xlabel('Papers per Journal')
    axes[1].set_ylabel('Number of Journals')
    axes[1].set_title('Distribution of Papers per Journal', fontsize=14, fontweight='bold')
    axes[1].set_yscale('log')
    axes[1].grid(True, alpha=0.3)

    # Add statistics to the plot
    mean_papers = journal_sizes.mean()
    median_papers = np.median(journal_sizes)
    axes[1].axvline(mean_papers, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_papers:.1f}')
    axes[1].axvline(median_papers, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_papers:.1f}')
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    return journal_counts

# Run the journal analysis on the cleaned dataset, if there is one.
if df_clean is not None:
    journal_analysis = analyze_journals(df_clean, top_n=15)


## Text Analysis

Now let's analyze the content of papers through title and abstract text mining:

In [None]:
def analyze_text_content(df, top_words=20):
    """Analyze title vocabulary and plot text statistics.

    Prints corpus statistics and the ``top_words`` most frequent title words,
    then draws a 2x2 figure: word-frequency bars, a word cloud, the
    distribution of abstract lengths and a box plot of title lengths.

    Args:
        df: DataFrame with a ``title`` column; ``abstract_word_count`` and
            ``title_length`` are plotted when present.
        top_words: Number of most frequent words to report.

    Returns:
        ``(word_freq, top_words_list)`` where ``word_freq`` is a ``Counter``
        of the filtered title words and ``top_words_list`` holds its
        ``top_words`` most common ``(word, count)`` pairs. ``None`` when the
        ``title`` column is missing.
    """
    print("üî§ TEXT CONTENT ANALYSIS")
    print("=" * 40)

    if 'title' not in df.columns:
        print("‚ùå No title data available")
        return None

    # One long string containing every non-null title.
    title_column = df['title']
    all_titles = ' '.join(title_column.dropna().astype(str))

    print("üìù Text corpus statistics:")
    print(f"   Total papers with titles: {title_column.notna().sum():,}")
    print(f"   Total characters in titles: {len(all_titles):,}")
    print(f"   Total words in titles: {len(all_titles.split()):,}")

    print("\nüîç Performing word frequency analysis...")

    # Common English function words that carry no topical meaning.
    stop_words = {
        'the', 'and', 'for', 'are', 'with', 'this', 'that', 'from', 'they', 'been',
        'have', 'has', 'had', 'was', 'were', 'will', 'would', 'could', 'should',
        'can', 'may', 'might', 'must', 'shall', 'not', 'but', 'what', 'when',
        'where', 'who', 'how', 'why', 'which', 'than', 'then', 'now', 'here',
        'there', 'more', 'most', 'much', 'many', 'some', 'any', 'all', 'both',
        'each', 'few', 'other', 'such', 'only', 'own', 'same', 'also', 'just',
        'being', 'over', 'through', 'during', 'before', 'after', 'above', 'below',
        'between', 'among', 'into', 'within', 'without', 'under', 'again', 'once'
    }

    # Lower-case alphabetic tokens, keeping those longer than three letters
    # that are not stop words.
    filtered_words = [
        token
        for token in re.findall(r'\b[a-zA-Z]{3,}\b', all_titles.lower())
        if len(token) > 3 and token not in stop_words
    ]

    print(f"   Words after filtering: {len(filtered_words):,}")

    word_freq = Counter(filtered_words)
    top_words_list = word_freq.most_common(top_words)

    print(f"\nüî§ Top {top_words} words in paper titles:")
    for rank, (word, occurrences) in enumerate(top_words_list, 1):
        share = (occurrences / len(filtered_words)) * 100
        print(f"{rank:2d}. {word:<15} {occurrences:>6,} occurrences ({share:4.1f}%)")

    # 2x2 figure: frequency bars, word cloud, abstract lengths, title lengths.
    fig, axes = plt.subplots(2, 2, figsize=(18, 12))
    (bar_ax, cloud_ax), (abstract_ax, title_ax) = axes

    # Word frequency bar chart
    words_df = pd.DataFrame(top_words_list, columns=['Word', 'Frequency'])
    bar_ax.bar(words_df['Word'], words_df['Frequency'], color='lightgreen', alpha=0.7)
    bar_ax.set_title(f'Top {top_words} Most Frequent Words in Titles', fontweight='bold')
    bar_ax.set_xlabel('Words')
    bar_ax.set_ylabel('Frequency')
    bar_ax.tick_params(axis='x', rotation=45)
    bar_ax.grid(axis='y', alpha=0.3)

    # Word cloud; any failure is shown as text in the panel instead of raising.
    try:
        cloud = WordCloud(width=800, height=400,
                          background_color='white',
                          max_words=100,
                          collocations=False).generate(' '.join(filtered_words))

        cloud_ax.imshow(cloud, interpolation='bilinear')
        cloud_ax.axis('off')
        cloud_ax.set_title('Word Cloud of Paper Titles', fontweight='bold')

    except ImportError:
        cloud_ax.text(0.5, 0.5, 'WordCloud not available\nInstall: pip install wordcloud',
                      ha='center', va='center', transform=cloud_ax.transAxes, fontsize=12)
        cloud_ax.set_title('Word Cloud (Not Available)')
    except Exception as e:
        cloud_ax.text(0.5, 0.5, f'Error creating word cloud:\n{str(e)}',
                      ha='center', va='center', transform=cloud_ax.transAxes, fontsize=10)

    # Abstract length histogram (non-empty abstracts only) with mean/median.
    if 'abstract_word_count' not in df.columns:
        abstract_ax.text(0.5, 0.5, 'Abstract length data\nnot available',
                         ha='center', va='center', transform=abstract_ax.transAxes, fontsize=12)
        abstract_ax.set_title('Abstract Length Distribution')
    else:
        abstract_lengths = df['abstract_word_count']

        abstract_ax.hist(abstract_lengths[abstract_lengths > 0], bins=50,
                         color='lightblue', alpha=0.7, edgecolor='black')
        abstract_ax.set_xlabel('Abstract Length (words)')
        abstract_ax.set_ylabel('Number of Papers')
        abstract_ax.set_title('Distribution of Abstract Lengths', fontweight='bold')
        abstract_ax.grid(True, alpha=0.3)

        # Reference lines are computed over every row, including empty abstracts.
        markers = (
            ('Mean', abstract_lengths.mean(), 'red'),
            ('Median', abstract_lengths.median(), 'orange'),
        )
        for label, value, colour in markers:
            abstract_ax.axvline(value, color=colour, linestyle='--', label=f'{label}: {value:.0f}')
        abstract_ax.legend()

    # Title length box plot with a small statistics box.
    if 'title_length' not in df.columns:
        title_ax.text(0.5, 0.5, 'Title length data\nnot available',
                      ha='center', va='center', transform=title_ax.transAxes, fontsize=12)
        title_ax.set_title('Title Length Distribution')
    else:
        title_lengths = df['title_length']

        title_ax.boxplot(title_lengths, patch_artist=True,
                         boxprops=dict(facecolor='lightcoral', alpha=0.7))
        title_ax.set_ylabel('Title Length (characters)')
        title_ax.set_title('Distribution of Title Lengths', fontweight='bold')
        title_ax.grid(True, alpha=0.3)

        summary = f'Mean: {title_lengths.mean():.0f}\nMedian: {title_lengths.median():.0f}'
        title_ax.text(0.02, 0.98, summary,
                      transform=title_ax.transAxes, fontsize=10,
                      verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    plt.tight_layout()
    plt.show()

    return word_freq, top_words_list

if df_clean is not None:
    # analyze_text_content returns None (not a tuple) when there is no 'title'
    # column, so only unpack a real result.
    text_result = analyze_text_content(df_clean, top_words=20)
    text_analysis, top_words_result = text_result if text_result is not None else (None, None)

## Advanced Text Analysis

Let's dive deeper into content patterns and research themes:

In [None]:
def advanced_text_analysis(df):
    """Perform advanced text analysis including COVID-specific terms.

    For each theme (a fixed list of terms) this counts how often the terms
    occur across all titles and how many papers mention at least one of them,
    prints both numbers and plots them as two bar charts. Matching is
    case-insensitive plain substring matching.

    Args:
        df: DataFrame with a ``title`` column.

    Returns:
        Dict mapping each theme name to ``{'mentions': int, 'papers': int}``,
        or ``None`` when the ``title`` column is missing.
    """
    print("üî¨ ADVANCED TEXT ANALYSIS")
    print("=" * 30)

    if 'title' not in df.columns:
        return None

    # Define COVID-related terms to search for
    covid_terms = {
        'virus_terms': ['covid', 'coronavirus', 'sars-cov-2', 'pandemic', 'epidemic', 'viral'],
        'medical_terms': ['treatment', 'vaccine', 'therapy', 'clinical', 'patient', 'hospital'],
        'research_terms': ['study', 'analysis', 'research', 'investigation', 'assessment', 'review'],
        'impact_terms': ['impact', 'effect', 'outcome', 'mortality', 'morbidity', 'risk'],
        'social_terms': ['social', 'economic', 'psychological', 'mental', 'lockdown', 'isolation']
    }

    # Combine all titles for analysis
    titles = df['title']
    all_titles_lower = ' '.join(titles.dropna().astype(str).str.lower())

    print("üéØ COVID-19 research theme analysis:")

    theme_counts = {}
    for theme, terms in covid_terms.items():
        theme_name = theme.replace('_', ' ').title()
        total_mentions = sum(all_titles_lower.count(term) for term in terms)

        # OR the per-term matches together so that a title containing several
        # terms of one theme counts as a single paper. regex=False treats each
        # term as literal text.
        term_hits = [
            titles.str.contains(term, case=False, na=False, regex=False).to_numpy(dtype=bool)
            for term in terms
        ]
        papers_with_theme = int(np.logical_or.reduce(term_hits).sum())

        theme_counts[theme_name] = {
            'mentions': total_mentions,
            'papers': papers_with_theme
        }

        print(f"   {theme_name:<15}: {total_mentions:>4} mentions, {papers_with_theme:>4} papers")

    # Visualize theme distribution
    themes = list(theme_counts.keys())
    mentions = [theme_counts[theme]['mentions'] for theme in themes]
    papers = [theme_counts[theme]['papers'] for theme in themes]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Theme mentions
    bars1 = ax1.bar(themes, mentions, color='skyblue', alpha=0.7)
    ax1.set_title('COVID-19 Research Theme Mentions', fontweight='bold')
    ax1.set_ylabel('Total Mentions')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', alpha=0.3)

    # Add value labels slightly above each bar.
    for bar, value in zip(bars1, mentions):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(mentions)*0.01,
                 f'{value}', ha='center', va='bottom', fontweight='bold')

    # Papers with themes
    bars2 = ax2.bar(themes, papers, color='lightcoral', alpha=0.7)
    ax2.set_title('Papers Addressing Each Theme', fontweight='bold')
    ax2.set_ylabel('Number of Papers')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(axis='y', alpha=0.3)

    # Add value labels slightly above each bar.
    for bar, value in zip(bars2, papers):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(papers)*0.01,
                 f'{value}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    return theme_counts

# Run the theme analysis on the cleaned dataset, if there is one.
if df_clean is not None:
    advanced_analysis = advanced_text_analysis(df_clean)

### Create a word cloud from paper titles

In [None]:
def create_title_wordcloud(df):
    """Render a word cloud built from all paper titles.

    Args:
        df: DataFrame with a ``title`` column.

    Returns:
        The generated ``WordCloud`` object, or ``None`` when the ``title``
        column is missing, contains no text, or an ImportError is raised
        while building the cloud.
    """
    print("\n‚òÅÔ∏è CREATING TITLE WORD CLOUD")
    print("=" * 30)

    if 'title' not in df.columns:
        print("‚ùå No title data available")
        return None

    # Combine all titles
    all_titles = ' '.join(df['title'].dropna().astype(str))

    # Without any text there are no words to draw; WordCloud.generate is
    # expected to raise in that case (TODO confirm), so stop early.
    if not all_titles.strip():
        print("‚ùå No title data available")
        return None

    try:
        # Create word cloud
        wordcloud = WordCloud(width=1200, height=600,
                              background_color='white',
                              max_words=100,
                              collocations=False).generate(all_titles)
    except ImportError:
        print("‚ùå WordCloud library not available. Install with: pip install wordcloud")
        return None

    # Display word cloud
    plt.figure(figsize=(15, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Paper Titles', fontsize=18, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("‚úÖ Word cloud created successfully")
    return wordcloud

### Analyze distribution of papers by source

In [None]:
def analyze_sources(df):
    """Print and plot how the papers are distributed across sources.

    The source column is the first of ``source_x``, ``source``, ``database``
    and ``origin`` that exists in ``df``.

    Args:
        df: Metadata DataFrame.

    Returns:
        Series of paper counts per source in descending order, or ``None``
        when none of the candidate columns exists.
    """
    print("\nüìö SOURCE DISTRIBUTION ANALYSIS")
    print("=" * 40)

    # Pick the first candidate column that is present.
    candidates = ('source_x', 'source', 'database', 'origin')
    source_col = next((name for name in candidates if name in df.columns), None)

    if source_col is None:
        print("‚ùå No source column found")
        return None

    source_counts = df[source_col].value_counts()
    total_rows = len(df)

    print("üìä Papers by source:")
    for source, count in source_counts.items():
        print(f"   ‚Ä¢ {source}: {count:,} papers ({(count / total_rows) * 100:.1f}%)")

    # Pie chart with one evenly spaced Set3 colour per source.
    plt.figure(figsize=(10, 8))
    palette = plt.cm.Set3(np.linspace(0, 1, len(source_counts)))
    plt.pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%',
            colors=palette, startangle=90)
    plt.title('Distribution of Papers by Source', fontsize=16, fontweight='bold')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

    return source_counts


###  Generate comprehensive summary statistics

In [None]:
def generate_summary_stats(df):
    """Print headline statistics for the dataset.

    Reports the number of papers and, for each column that is present, the
    average abstract length, the publication year range, the number of
    distinct journals and the share of papers with author information.
    Nothing is returned.

    Args:
        df: Cleaned metadata DataFrame.
    """
    print("\nüìà SUMMARY STATISTICS")
    print("=" * 30)

    columns = df.columns
    total_papers = len(df)
    print(f"üìÑ Total papers: {total_papers:,}")

    if 'abstract_word_count' in columns:
        print(f"üìù Average abstract length: {df['abstract_word_count'].mean():.0f} words")

    if 'publication_year' in columns:
        years = df['publication_year']
        first_year, last_year = years.min(), years.max()
        print(f"üìÖ Publication year range: {first_year:.0f} - {last_year:.0f}")

    if 'journal' in columns:
        print(f"üì∞ Unique journals: {df['journal'].nunique():,}")

    if 'authors' in columns:
        with_authors = df['authors'].notna().sum()
        print(f"üë• Papers with author info: {with_authors:,} ({with_authors/total_papers*100:.1f}%)")

## Main analysis workflow

In [None]:
def main():
    """Run the whole workflow: load, inspect, clean, analyze and summarize.

    Loads the full dataset from the default path and calls the analysis
    functions defined earlier in this file in order.

    Returns:
        The cleaned DataFrame, or ``None`` when the dataset could not be
        loaded.
    """
    print("ü¶† CORD-19 METADATA ANALYSIS")
    print("=" * 50)

    # Part 1: Data Loading and Basic Exploration
    print("\n" + "="*20 + " PART 1: DATA LOADING " + "="*20)
    df = load_cord19_data()
    if df is None:
        return None

    # The per-column overview exists only as an inline notebook cell, not as a
    # function, so the missing-data report is the exploration step run here.
    analyze_missing_data(df)

    # Part 2: Data Cleaning and Preparation
    print("\n" + "="*20 + " PART 2: DATA CLEANING " + "="*20)
    df_clean = clean_cord19_data(df)

    # Part 3: Data Analysis and Visualization
    print("\n" + "="*20 + " PART 3: ANALYSIS & VISUALIZATION " + "="*20)
    analyze_publication_trends(df_clean)
    analyze_journals(df_clean)
    analyze_text_content(df_clean)
    create_title_wordcloud(df_clean)
    analyze_sources(df_clean)

    # Generate summary
    generate_summary_stats(df_clean)

    print("\n‚úÖ Analysis completed!")
    return df_clean


if __name__ == "__main__":
    df_final = main()