📊 CORD-19 Dataset Analysis — This notebook performs a basic analysis of the CORD-19 dataset (metadata.csv).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud


In [None]:
# Load the CORD-19 metadata CSV (adjust the path if the file lives elsewhere).
# NOTE(review): the file name spells "resaerch" -- left untouched on the
# assumption that it matches the file on disk; confirm before renaming either.
df = pd.read_csv("CORD-19-resaerch-challenge-metadata.csv")

# Quick overview: dimensions, per-column dtypes / non-null counts, and the
# missing-value totals of the first ten columns.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print(df.isnull().sum().head(10))

# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Clean the metadata in one chain:
#   1. keep only rows that have both a publish_time and a title;
#   2. parse publish_time as datetime -- values that cannot be parsed are
#      coerced to NaT (those rows are kept, so their year is missing);
#   3. derive the publication year from the parsed date;
#   4. count whitespace-separated words per abstract (0 when it is missing).
df = (
    df.dropna(subset=["publish_time", "title"])
    .assign(
        publish_time=lambda frame: pd.to_datetime(frame["publish_time"], errors="coerce"),
        year=lambda frame: frame["publish_time"].dt.year,
        abstract_word_count=lambda frame: frame["abstract"]
        .fillna("")
        .apply(lambda text: len(text.split())),
    )
)
df.head()

In [None]:
# Publications per year
# Tally the papers for each publication year (missing years are ignored by
# value_counts) and order the result chronologically.
year_counts = df["year"].value_counts().sort_index()

# Draw the bar chart on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(year_counts.index, year_counts.values)
ax.set_title("Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
plt.show()


In [None]:
# Top journals
# pyplot is already available as `plt` from the notebook's import cell, so it
# is not imported a second time here.

# The ten journals with the most papers; value_counts() skips missing journal
# names and returns the counts in descending order.
top_journals = df["journal"].value_counts().head(10)

# Journal names go on the y-axis, which yields horizontal bars.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(y=top_journals.index, x=top_journals.values, ax=ax)
ax.set_title("Top Journals Publishing COVID-19 Research")
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Journal")
plt.show()

In [None]:
# Word cloud from titles
# Concatenate every non-missing title into one space-separated string.
# Titles are cast to str first: str.join raises TypeError on any non-string
# item, which can occur if read_csv inferred a numeric value for a title.
text = " ".join(df["title"].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

# Show the rendered cloud as an image, with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Most Common Words in Paper Titles")
plt.show()