# In [None]:  (Jupyter cell prompt left over from a notebook export)
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from datetime import datetime
import numpy as np

# Page-level settings (browser tab title/icon, wide layout, sidebar opened
# on first load), collected in one mapping and applied in a single call.
PAGE_SETTINGS = {
    "page_title": "CORD-19 Research Analysis",
    "page_icon": "📚",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**PAGE_SETTINGS)

# Load data
@st.cache_data
def load_data():
    """Load the CORD-19 metadata CSV and derive the columns the dashboard uses.

    Reads ``data/metadata.csv`` and then:

    * replaces missing abstracts with an empty string,
    * parses ``publish_time`` into datetimes (unparseable values become NaT),
    * adds ``year``, the calendar year taken from ``publish_time``,
    * adds ``abstract_word_count``, the number of whitespace-separated
      tokens in each abstract.

    Returns:
        pandas.DataFrame: the cleaned metadata table.

    If the CSV file is missing, an error message is shown in the app and the
    script run is halted with ``st.stop()`` instead of raising.
    """
    try:
        # Only the read itself can raise FileNotFoundError, so the try body
        # is limited to it; errors in the cleaning steps surface normally.
        df_cleaned = pd.read_csv('data/metadata.csv')
    except FileNotFoundError:
        st.error("Please download the metadata.csv file from Kaggle and place it in the data/ folder")
        st.stop()

    # Data cleaning. The frame was just read and is referenced nowhere else,
    # so it is cleaned directly rather than duplicating a large table first.
    df_cleaned['abstract'] = df_cleaned['abstract'].fillna('')
    df_cleaned['publish_time'] = pd.to_datetime(df_cleaned['publish_time'], errors='coerce')
    df_cleaned['year'] = df_cleaned['publish_time'].dt.year
    # str() guards against non-string cells that survive CSV type inference.
    df_cleaned['abstract_word_count'] = df_cleaned['abstract'].apply(lambda x: len(str(x).split()))
    return df_cleaned


df = load_data()

# Sidebar
st.sidebar.title("CORD-19 Analysis Dashboard")
st.sidebar.markdown("Explore COVID-19 research papers metadata")

# Filters
st.sidebar.header("Filters")

# Rows whose publish_time could not be parsed have a NaN year; if that is
# every row, int(NaN) would raise, so stop with a readable message instead.
valid_years = df['year'].dropna()
if valid_years.empty:
    st.sidebar.error("No valid publication dates found in the dataset")
    st.stop()

min_year = int(valid_years.min())
max_year = int(valid_years.max())

# Preferred default window is 2020-2021, clamped to the years actually
# present so the initial selection always lies inside the data's span.
default_start = min(max(2020, min_year), max_year)
default_end = min(max(2021, min_year), max_year)

year_range = st.sidebar.slider(
    "Select Year Range",
    min_value=min_year,
    max_value=max_year,
    value=(default_start, default_end)
)

# Lower bound on abstract length, in words: range 0-500, initially 50.
# Positional arguments after the label are min_value, max_value, value.
min_word_count = st.sidebar.slider("Minimum Abstract Word Count", 0, 500, 50)

# Distinct, non-missing source labels in order of first appearance; the
# first three are pre-selected.
available_sources = df['source_x'].dropna().unique()
source_filter = st.sidebar.multiselect(
    "Select Sources",
    options=available_sources,
    default=available_sources[:3],
)

# Apply filters
# Build one boolean row mask: year inside the inclusive slider window and
# abstract at least as long as the chosen minimum. Rows with a missing year
# compare False and are therefore excluded.
first_year, last_year = year_range
row_mask = (
    df['year'].between(first_year, last_year)
    & df['abstract_word_count'].ge(min_word_count)
)

# An empty source selection means "do not restrict by source".
if source_filter:
    row_mask = row_mask & df['source_x'].isin(source_filter)

filtered_df = df[row_mask]

# Main content
st.title("📊 CORD-19 Research Dataset Analysis")
st.markdown("""
This dashboard provides insights into the COVID-19 Open Research Dataset (CORD-19) metadata.
Explore publications over time, top journals, and word frequency in titles.
""")

# Key metrics
# The mean of an empty selection is NaN; show a placeholder rather than
# rendering "nan words" when the filters match no papers.
mean_word_count = filtered_df['abstract_word_count'].mean()
if pd.isna(mean_word_count):
    average_length_label = "N/A"
else:
    average_length_label = f"{mean_word_count:.1f} words"

col1, col2, col3, col4 = st.columns(4)
with col1:
    st.metric("Total Papers", len(filtered_df))
with col2:
    st.metric("Unique Journals", filtered_df['journal'].nunique())
with col3:
    st.metric("Average Abstract Length", average_length_label)
with col4:
    st.metric("Time Range", f"{year_range[0]} - {year_range[1]}")

# Tabs
tab1, tab2, tab3, tab4 = st.tabs(["Overview", "Publications", "Word Analysis", "Data Sample"])

# Overview tab: publications per year and the ten most frequent journals.
with tab1:
    st.header("Dataset Overview")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Publications by Year")
        yearly_counts = filtered_df['year'].value_counts().sort_index()
        if yearly_counts.empty:
            # Nothing to plot when the filters match no dated papers.
            st.warning("No publications available for the selected filters")
        else:
            # 'year' is float when any publish_time was missing; cast so the
            # tick labels read "2020" rather than "2020.0".
            yearly_counts.index = yearly_counts.index.astype(int)
            fig, ax = plt.subplots(figsize=(10, 6))
            yearly_counts.plot(kind='bar', ax=ax, color='skyblue')
            ax.set_title('Publications by Year')
            ax.set_xlabel('Year')
            ax.set_ylabel('Number of Publications')
            # Rotate labels on this axes explicitly instead of relying on
            # pyplot's implicit "current axes".
            ax.tick_params(axis='x', labelrotation=45)
            st.pyplot(fig)
            # The script reruns on every interaction; close the figure so
            # pyplot does not keep accumulating open figures.
            plt.close(fig)

    with col2:
        st.subheader("Top Journals")
        top_journals = filtered_df['journal'].value_counts().head(10)
        if top_journals.empty:
            st.warning("No journals available for the selected filters")
        else:
            fig, ax = plt.subplots(figsize=(10, 6))
            top_journals.plot(kind='bar', ax=ax, color='lightgreen')
            ax.set_title('Top 10 Journals')
            ax.set_xlabel('Journal')
            ax.set_ylabel('Number of Publications')
            ax.tick_params(axis='x', labelrotation=45)
            st.pyplot(fig)
            plt.close(fig)

# Publications tab: share of papers per source and abstract-length histogram.
with tab2:
    st.header("Publication Analysis")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Source Distribution")
        source_counts = filtered_df['source_x'].value_counts().head(10)
        if source_counts.empty:
            # Avoid drawing an empty pie when the filters match nothing.
            st.warning("No sources available for the selected filters")
        else:
            fig, ax = plt.subplots(figsize=(8, 8))
            ax.pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%')
            ax.set_title('Top 10 Sources')
            st.pyplot(fig)
            # The script reruns on every interaction; close the figure so
            # pyplot does not keep accumulating open figures.
            plt.close(fig)

    with col2:
        st.subheader("Abstract Length Distribution")
        if filtered_df.empty:
            st.warning("No abstracts available for the selected filters")
        else:
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.hist(filtered_df['abstract_word_count'], bins=50, color='orange', alpha=0.7)
            ax.set_title('Distribution of Abstract Word Count')
            ax.set_xlabel('Word Count')
            ax.set_ylabel('Frequency')
            st.pyplot(fig)
            plt.close(fig)

# Word Analysis tab: word cloud built from the titles of the filtered papers.
with tab3:
    st.header("Word Analysis")

    # Word cloud
    st.subheader("Word Cloud of Paper Titles")
    # Join every non-missing title into one text for the word cloud.
    all_titles = ' '.join(filtered_df['title'].dropna().astype(str))
    if all_titles.strip():
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title('Word Cloud of Paper Titles')
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so
        # pyplot does not keep accumulating open figures.
        plt.close(fig)
    else:
        st.warning("No titles available for the selected filters")

# Data Sample tab: preview of the first rows plus a CSV export of the
# complete filtered selection.
with tab4:
    st.header("Data Sample")
    st.subheader("Filtered Dataset Preview")

    # Show sample data: first 20 rows, restricted to the preview columns.
    preview_columns = ['title', 'journal', 'publish_time', 'source_x', 'abstract_word_count']
    preview_rows = filtered_df[preview_columns].head(20)
    st.dataframe(preview_rows, height=400)

    # Download button: serialises every filtered row, without the index.
    csv = filtered_df.to_csv(index=False)
    download_options = {
        "label": "Download filtered data as CSV",
        "data": csv,
        "file_name": "filtered_cord19_data.csv",
        "mime": "text/csv",
    }
    st.download_button(**download_options)

# Footer
# Horizontal rule followed by an attribution link to the dataset.
FOOTER_RULE = "---"
FOOTER_SOURCE = (
    "\n"
    "**Data Source**: [CORD-19 Research Dataset on Kaggle](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge)"
    "\n"
)
st.markdown(FOOTER_RULE)
st.markdown(FOOTER_SOURCE)