Import Libraries


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Configure seaborn once, up front, with the "whitegrid" style so the plots
# drawn after this call share it.
sns.set(style="whitegrid")


Load the sample dataset

In [None]:
# Read the reduced sample of the metadata (not the full dataset) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="metadata_sample.csv")

# Sanity check: report the (rows, columns) shape, then preview the first rows.
print("Shape:", df.shape)
df.head(n=5)


Basic exploration

In [None]:
# Data info: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Missing values: null count per column, limited to the first 15 columns.
print(df.isnull().sum().head(15))

# Summary statistics (rendered as the cell's output).
df.describe()


Data cleaning

In [None]:
# Convert publish_time to datetime first. With errors='coerce', values that
# cannot be parsed become NaT instead of raising, so they can be removed
# together with the values that were missing to begin with.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')

# Drop rows missing important fields. Running this after the conversion also
# removes rows whose date was present but unparseable; previously those rows
# survived as NaT and forced the year column to float. The explicit copy makes
# df an independent frame before new columns are assigned to it.
df = df.dropna(subset=['title', 'publish_time']).copy()

# Extract year as an integer. No NaT remains at this point, so the cast is safe
# and the year is stored as e.g. 2020 rather than 2020.0.
df['year'] = df['publish_time'].dt.year.astype(int)

# Add abstract word count; a missing abstract is treated as an empty string
# and therefore counts as 0 words.
df['abstract_word_count'] = df['abstract'].fillna("").apply(lambda x: len(x.split()))

# Preview the derived columns.
df[['title', 'year', 'abstract_word_count']].head()


Publications by year

In [None]:
# Number of publications per year, ordered chronologically.
# value_counts() ignores missing years by default.
year_counts = df['year'].value_counts().sort_index()

# The year column is float when it contains missing values, which would label
# the bars "2020.0". The counted index holds no missing values, so it can be
# cast to int purely for display; year_counts itself is left untouched.
year_labels = year_counts.index.astype(int)

plt.figure(figsize=(8,5))
sns.barplot(x=year_labels, y=year_counts.values, color="skyblue")
plt.title("Publications by Year")
plt.xlabel("Year")
plt.ylabel("Number of Publications")
plt.xticks(rotation=45)
plt.show()


Top journals

In [None]:
# The ten journals with the most rows in the dataset, most frequent first.
top_journals = df['journal'].value_counts().iloc[:10]

# Horizontal bars: journal names on the y-axis, publication counts on the x-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index, color="orange", ax=ax)
ax.set_title("Top Journals Publishing COVID-19 Papers")
ax.set_xlabel("Number of Publications")
ax.set_ylabel("Journal")
plt.show()


Word frequency in titles

In [None]:
# Lower-case every non-missing title, glue them together with spaces and split
# the result into word tokens (runs of \w characters).
words = re.findall(r'\w+', " ".join(df['title'].dropna()).lower())

# Tally the tokens and keep the 15 most frequent (word, count) pairs.
word_tally = Counter(words)
common_words = word_tally.most_common(15)
common_words


Publications by source

In [None]:
# The ten most frequent values of the source_x column, most frequent first.
source_counts = df['source_x'].value_counts().iloc[:10]

# Horizontal bars: source names on the y-axis, publication counts on the x-axis.
plt.figure(figsize=(8, 5))
source_ax = sns.barplot(x=source_counts.values, y=source_counts.index, color="green")
source_ax.set_title("Top Sources of Publications")
source_ax.set_xlabel("Number of Publications")
source_ax.set_ylabel("Source")
plt.show()


Save cleaned dataset

In [None]:
# Write the cleaned frame to disk for the Streamlit app; index=False keeps the
# row index out of the file.
df.to_csv(path_or_buf="metadata_cleaned.csv", index=False)
print("Saved as metadata_cleaned.csv")
