import the necessary libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

load into a dataframe

In [None]:
# Load a small sample of the metadata file into a DataFrame.
df = pd.read_csv(
    # Raw string: backslashes are already literal, so they must not be doubled.
    r"C:\Users\user\Downloads\metadata.csv.zip",  # <-- Path to your dataset
    nrows=500,         # Load only the first 500 rows
    low_memory=False,  # Parse the file in one pass to avoid mixed-dtype guessing
)

explore the structure

In [None]:
# Quick structural overview of the loaded frame.
print(df.shape)   # (rows, columns)
# info() writes dtypes and non-null counts to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print(df.head())  # Preview the first rows

look at missing values and summary statistics

In [None]:
# Missing-value tally, followed by descriptive statistics.
null_counts = df.isnull().sum()       # nulls per column
print(null_counts)
summary = df.describe(include='all')  # include='all' covers non-numeric columns too
print(summary)

# Data cleaning and preparation

handle missing values

In [None]:
# Rank the columns by how many values they are missing, worst first.
null_counts_by_column = df.isnull().sum()
missing_values = null_counts_by_column.sort_values(ascending=False)
print(missing_values.head())  # the columns with the most missing values

clean date columns

In [None]:
# Parse publish_time as datetimes; values that cannot be parsed become NaT
# because of errors='coerce'.
parsed_publish_time = pd.to_datetime(df['publish_time'], errors='coerce')
df['publish_time'] = parsed_publish_time

# Split the parsed timestamp into separate calendar-part columns.
for part in ('year', 'month', 'day'):
    df[part] = getattr(parsed_publish_time.dt, part)

save a cleaned version

In [None]:
# Snapshot the prepared frame and persist it without the index column.
df_clean = df.copy(deep=True)  # deep=True is the default: an independent copy
df_clean.to_csv(path_or_buf="metadata_clean.csv", index=False)

# Data analysis and visualisation

publications per year

In [None]:
# Bar chart of how many publications fall in each year.
year_counts = df_clean['year'].value_counts().sort_index()  # counts ordered by year
year_axes = year_counts.plot(kind='bar', title='Publications by Year')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('Number of Publications')
plt.show()

top journals

In [None]:
# Horizontal bar chart of the ten journals with the most publications.
top_journals = df_clean['journal'].value_counts().head(10)
top_journals.plot(kind='barh', title='Top Journals')
plt.xlabel('Number of Publications')
plt.ylabel('Journal')
plt.show()  # render explicitly so the chart also appears outside an inline notebook

frequent words in titles

In [None]:
import re
from collections import Counter

# Join all non-null titles into one lower-cased string; astype(str) guards
# against any non-string values in the column.
title_text = " ".join(df_clean['title'].dropna().astype(str)).lower()

# Tokenise on word characters so trailing punctuation does not split counts
# ("virus" vs "virus,"); hyphens/apostrophes inside a word are kept ("covid-19").
words = re.findall(r"\w+(?:[-']\w+)*", title_text)
common_words = Counter(words).most_common(20)  # 20 most frequent tokens
print(common_words)

visualization

In [None]:
from wordcloud import WordCloud

# Word cloud built from every non-null title.
all_titles = " ".join(df_clean['title'].dropna())
wc = WordCloud(width=800, height=400).generate(all_titles)

plt.imshow(wc, interpolation='bilinear')
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()


# Streamlit app

basic layout

In [None]:
import matplotlib.pyplot as plt
import streamlit as st

# Page header: the app title followed by a one-line description.
app_title = "CORD-19 Data Explorer"
app_tagline = "Explore COVID-19 research publications"
st.title(app_title)
st.write(app_tagline)


interaction elements

In [None]:
# Slider bounds come from the data; cast to plain ints for the widget.
years = df_clean['year'].dropna().astype(int).unique()
min_year, max_year = int(years.min()), int(years.max())

# Clamp the preferred 2020-2021 default into the available range: a default
# outside [min_year, max_year] makes st.slider raise.
default_range = (
    min(max(2020, min_year), max_year),
    min(max(2021, min_year), max_year),
)
year_range = st.slider("Select Year Range", min_year, max_year, default_range)

# Keep rows whose year lies inside the selected range (both ends inclusive);
# rows without a year never match and drop out.
selected_years = df_clean['year'].between(year_range[0], year_range[1])
filtered = df_clean[selected_years]
st.write(filtered.head())


show charts in the app

In [None]:
# Publications per year for the current selection, ordered by year.
filtered_year_counts = filtered['year'].value_counts().sort_index()
st.bar_chart(filtered_year_counts)