# CORD-19 Metadata Analysis
Author: Lawrence Njoroge  


This notebook performs:
- Data loading & exploration
- Data cleaning & preparation
- Data analysis & visualization
- (Optional) Streamlit app for interactivity

In [None]:
# ---- Import required libraries ----
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import re

## Part 1: Data Loading and Basic Exploration

In [None]:
# Read the metadata CSV in a single pass (low_memory=False disables chunked
# parsing), then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="metadata.csv", low_memory=False)
df.head(n=5)

In [None]:
# Quick structural overview of the raw data: dimensions, column dtypes,
# summary statistics, and missing-value counts.
print("Dataset shape:", df.shape)

# info() writes its report straight to stdout, so it needs no print().
df.info()

# A notebook cell only renders the value of its *last* expression. A bare
# `df.describe()` in the middle of the cell is computed and then thrown away,
# so print it explicitly to make the summary statistics visible.
print(df.describe())

# Missing-value count per column (first 20 columns); shown as the cell output.
df.isnull().sum().head(20)

## Part 2: Data Cleaning and Preparation

In [None]:
# Keep only the rows that have every field the analysis relies on.
df_clean = df.dropna(subset=["title", "abstract", "publish_time"]).copy()

# errors="coerce" converts any value that cannot be parsed as a date into NaT.
# Those NaT values appear *after* the dropna above, so they must be removed in
# a second pass; otherwise rows without a usable date stay in the cleaned
# frame and force the derived "year" column to float with NaN entries.
df_clean["publish_time"] = pd.to_datetime(df_clean["publish_time"], errors="coerce")
df_clean = df_clean.dropna(subset=["publish_time"]).copy()

# With every remaining date valid, the year can be stored as a plain integer
# (e.g. 2020 rather than 2020.0), which also gives clean axis ticks when plotted.
df_clean["year"] = df_clean["publish_time"].dt.year.astype(int)

# Number of whitespace-separated tokens in each abstract.
df_clean["abstract_word_count"] = df_clean["abstract"].apply(lambda x: len(str(x).split()))
df_clean.head()

## Part 3: Data Analysis and Visualization

In [None]:
# Publication trend: number of papers per publication year, ordered by year.
papers_per_year = df_clean["year"].value_counts().sort_index()

# Draw on an explicit Axes object instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 5))
papers_per_year.plot(kind="line", marker="o", color="blue", ax=ax)
ax.set_title("Number of Publications Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
ax.grid(True)
plt.show()

In [None]:
# The ten journals with the most papers in the cleaned data set.
top_journals = df_clean["journal"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 5))
top_journals.plot(kind="bar", color="green", ax=ax)
ax.set_title("Top 10 Journals Publishing COVID-19 Research")
ax.set_xlabel("Journal")
ax.set_ylabel("Number of Papers")
# Tilt the tick labels and right-align them so each label ends under its bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.show()

In [None]:
# Build one lower-cased string out of every non-missing title.
non_null_titles = df_clean["title"].dropna()
titles_text = " ".join(non_null_titles).lower()

# Tokenise into runs of word characters and tally how often each one occurs.
word_pattern = re.compile(r"\b\w+\b")
words = word_pattern.findall(titles_text)
word_counts = Counter(words)

# The 20 most frequent title words as (word, count) pairs; shown as cell output.
common_words = word_counts.most_common(20)
common_words

In [None]:
# Configure the cloud first, then generate it from the combined title text.
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(titles_text)

# Render the cloud as an image without axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Paper Titles")
plt.show()

In [None]:
# Number of papers for each distinct value of the "source_x" column.
source_counts = df_clean["source_x"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
source_counts.plot.barh(color="purple", ax=ax)
ax.set_title("Distribution of Papers by Source")
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Source")
plt.show()