# In [None]:
# Analysis of CORD-19 Metadata
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

# Load dataset
df = pd.read_csv("../data/metadata.csv")

# Basic info
print("Shape:", df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
print(df.head())

# Parse publication dates and derive the publication year.
# errors="coerce" turns entries that cannot be parsed into NaT, and the
# derived year is NaN for those rows.
# NOTE(review): if publish_time mixes formats (e.g. year-only and full dates),
# pandas >= 2.0 infers one format from the first value and coerces the rest to
# NaT -- confirm the share of NaT rows is acceptable.
parsed_dates = pd.to_datetime(df['publish_time'], errors="coerce")
df['publish_time'] = parsed_dates
df['year'] = parsed_dates.dt.year

# Publications by Year
# Rows without a usable date carry NaN in 'year', which forces the column to
# float; value_counts() drops those rows by default, and casting the remaining
# index to int keeps the tick labels as "2020" rather than "2020.0".
year_counts = df['year'].value_counts().sort_index()
year_counts.index = year_counts.index.astype(int)
plt.figure(figsize=(8,5))
sns.barplot(x=year_counts.index, y=year_counts.values)
plt.title("Publications by Year")
plt.xlabel("Year")
plt.ylabel("Number of publications")
# Rotate the year labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.show()

# Top Journals
# Ten most frequent values of the 'journal' column, plotted with the journal
# names on the y-axis and their counts on the x-axis.
top_journals = df['journal'].value_counts().iloc[:10]
plt.figure(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index)
plt.title("Top 10 Journals")
plt.show()

# Word frequency in titles
# Tokenise each title on its own rather than whitespace-splitting one giant
# joined string: astype(str) guards against non-string cells, and the regex
# keeps hyphenated or apostrophised terms (e.g. "covid-19") whole while
# dropping surrounding punctuation, so "virus" and "virus:" count together.
# Common function words ("of", "the", ...) are not filtered out here.
title_tokens = (
    df['title']
    .dropna()
    .astype(str)
    .str.lower()
    .str.findall(r"\w+(?:[-']\w+)*")
)
# Flatten to a plain list of strings; titles that yield no tokens add nothing.
words = [word for tokens in title_tokens for word in tokens]
common_words = Counter(words).most_common(20)
print("Most common words in titles:", common_words)

# Wordcloud
# Build an 800x400 cloud on a white background from the space-joined title
# words, then display it as an image with the axes hidden.
wordcloud = WordCloud(background_color="white", width=800, height=400)
wordcloud.generate(" ".join(words))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
