In [None]:
# analysis.ipynb (or analysis.py)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("metadata.csv")

# Quick structural overview: dimensions, column dtypes / non-null counts, and
# the first few rows.
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.head())

# Missing values (per-column null counts, first 20 columns only)
print("Missing values:\n", df.isnull().sum().head(20))
# Parse publish_time into datetimes; entries that cannot be parsed are
# coerced to NaT instead of raising.
parsed_dates = pd.to_datetime(df["publish_time"], errors="coerce")
df["publish_time"] = parsed_dates

# Publication year taken from the parsed dates
df["year"] = parsed_dates.dt.year

# Keep only rows that have both a title and an abstract
required_columns = ["title", "abstract"]
df = df.dropna(subset=required_columns)


def _word_count(text):
    # Number of whitespace-separated tokens in the text.
    return len(str(text).split())


# Word count of each abstract
df["abstract_word_count"] = df["abstract"].apply(_word_count)
# 1. Publications by year
# Rows whose publish_time is missing/unparseable carry a NaN year, and a
# column holding NaN is stored as float. Drop those rows and cast to int so
# the x-axis reads "2020" rather than "2020.0". The counts are unaffected:
# value_counts() already ignores NaN by default.
year_counts = df["year"].dropna().astype(int).value_counts().sort_index()
plt.figure(figsize=(8,5))
sns.barplot(x=year_counts.index, y=year_counts.values, palette="viridis")
plt.title("Publications by Year")
# Plain arrays carry no axis names, so label the axes explicitly.
plt.xlabel("Year")
plt.ylabel("Number of publications")
plt.xticks(rotation=45)
plt.show()

# 2. Top journals
# The ten most frequent values of the "journal" column, drawn as horizontal
# bars (journal names on the y-axis, paper counts on the x-axis).
journal_frequencies = df["journal"].value_counts()
top_journals = journal_frequencies.iloc[:10]

plt.figure(figsize=(8, 5))
journal_ax = sns.barplot(
    x=top_journals.values,
    y=top_journals.index,
    palette="magma",
)
journal_ax.set_title("Top Journals Publishing COVID-19 Papers")
plt.show()

# 3. Word frequency in titles
from collections import Counter
import re

# A token is a run of word characters, optionally joined by internal hyphens
# (so "covid-19" stays whole). Splitting on whitespace alone would leave
# punctuation glued to words ("virus," vs "virus") and fragment the counts.
_TITLE_TOKEN_RE = re.compile(r"\w+(?:-\w+)*")

# astype(str) guards against non-string titles (e.g. purely numeric ones).
all_words = [
    token
    for title in df["title"].dropna().astype(str).str.lower()
    for token in _TITLE_TOKEN_RE.findall(title)
]
# Skip very short tokens (3 characters or fewer) as a crude stop-word filter.
common_words = Counter(w for w in all_words if len(w) > 3).most_common(15)

# zip(*[]) cannot be unpacked into two names, so handle the case where no
# word qualifies instead of failing with a ValueError.
words, counts = zip(*common_words) if common_words else ((), ())
if words:
    plt.figure(figsize=(8,5))
    sns.barplot(x=list(counts), y=list(words))
    plt.title("Most Frequent Words in Paper Titles")
    plt.show()
else:
    print("No title words available to plot.")

# 4. Distribution by source_x
# The ten most frequent values of the "source_x" column, drawn as horizontal
# bars (source names on the y-axis, paper counts on the x-axis).
source_frequencies = df["source_x"].value_counts()
source_counts = source_frequencies.iloc[:10]

plt.figure(figsize=(8, 5))
source_ax = sns.barplot(
    x=source_counts.values,
    y=source_counts.index,
)
source_ax.set_title("Paper Counts by Source")
plt.show()
