## Part 1: Data Loading and Basic Exploration
Load the metadata file (`metadata.csv`) into a pandas DataFrame, then examine the first few rows and the data structure.

In [None]:
import pandas as pd
# Read the metadata file into `df`. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, so column dtypes are inferred from
# the whole column (at the cost of higher memory use while parsing).
df = pd.read_csv(filepath_or_buffer="metadata.csv", low_memory=False)
# Preview the first five rows.
print(df.head(n=5))

Checking the DataFrame dimensions (rows, columns)

In [None]:
# DataFrame.shape is a (row_count, column_count) tuple.
n_rows, n_cols = df.shape
print("Shape (rows, columns):", (n_rows, n_cols))

Identifying data types of each column

In [None]:
# Header line followed by the dtype pandas inferred for every column.
print("Column data types:", df.dtypes, sep="\n")

Checking for missing values in important columns

In [None]:
# Columns checked for completeness below.
important_cols = ["title", "abstract", "publish_time", "journal", "authors"]
# Number of null entries in each of those columns.
missing_in_important = df[important_cols].isna().sum()
print("Missing values in important columns:")
print(missing_in_important)

Generating basic statistics for numerical columns

In [None]:
# Summary statistics as produced by DataFrame.describe() with its defaults.
numeric_summary = df.describe()
print("Basic statistics for numerical columns:")
print(numeric_summary)

## Part 2: Data Cleaning and Preparation

Identifying columns with many missing values

In [None]:
# Null count per column, ordered from most to least incomplete.
null_totals = df.isna().sum()
missing_counts = null_totals.sort_values(ascending=False)
print("Missing values per column:")
# Show only the 20 columns with the most missing values.
print(missing_counts.iloc[:20])

Dropping columns with more than 80% missing values, then dropping rows that have neither a title nor an abstract

In [None]:
# Fraction of rows a column may be missing before the column is discarded.
threshold = 0.8
# `missing_counts` holds per-column null counts (computed in the previous
# cell), so turn the fractional threshold into an absolute row count.
max_missing_rows = threshold * len(df)
cols_to_drop = missing_counts.loc[missing_counts.gt(max_missing_rows)].index
df_cleaned = df.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} columns due to many missing values.")
# how="all": a row is removed only when BOTH title and abstract are missing.
df_cleaned = df_cleaned.dropna(subset=["title", "abstract"], how="all")

Saving the cleaned version of the dataset to `metadata_cleaned.csv`

In [None]:
# Write the current state of df_cleaned to disk; index=False leaves the row
# index out of the file.
# NOTE: this snapshot is taken before the later cells parse `publish_time`
# and add the `year` column, so neither change is reflected in the file.
df_cleaned.to_csv(path_or_buf="metadata_cleaned.csv", index=False)
print("Cleaned dataset saved as metadata_cleaned.csv")

Converting the `publish_time` column to datetime format

In [None]:
# Parse publish_time into datetimes. errors="coerce" turns values that cannot
# be parsed into NaT instead of raising.
publish_dates = pd.to_datetime(df_cleaned["publish_time"], errors="coerce")
df_cleaned["publish_time"] = publish_dates
# Preview the converted column.
print(df_cleaned["publish_time"].head(n=5))

Extracting year from publication date for time-based analysis

In [None]:
# Derive the publication year. publish_time may contain NaT (the preceding
# cell parses it with errors="coerce"), so the nullable "Int64" dtype is used
# to keep the years as integers while still representing missing values.
df_cleaned["year"] = df_cleaned["publish_time"].dt.year.astype("Int64")

# Spot-check a few random rows. A fixed random_state makes the preview
# reproducible across Restart & Run All, and capping the sample size avoids
# the ValueError that sample(5) raises when fewer than 5 rows remain.
preview_size = min(5, len(df_cleaned))
print(df_cleaned[["publish_time", "year"]].sample(n=preview_size, random_state=42))

## Part 3: Data Analysis and Visualization
Count papers by publication year

In [None]:
# Tally papers per publication year, then order the tally chronologically
# (by year) rather than by frequency.
year_counts = df_cleaned["year"].value_counts()
papers_per_year = year_counts.sort_index()
print(papers_per_year)

Identifying top journals publishing COVID-19 research

In [None]:
# value_counts() orders journals from most to least frequent, so the first
# ten entries are the ten most common journals.
journal_counts = df_cleaned["journal"].value_counts()
top_journals = journal_counts.iloc[:10]
print(top_journals)

Plotting the number of publications over time

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Keep only the years that have at least one paper.
papers_per_year = papers_per_year.loc[papers_per_year.gt(0)]

# Vertical bar chart: one bar per publication year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=papers_per_year.index, y=papers_per_year.values, ax=ax)
ax.set_title("Number of Publications Over Time", fontsize=16)
ax.set_xlabel("Publication Year")
ax.set_ylabel("Number of Papers")
# Tilt the year labels so they do not overlap.
plt.xticks(rotation=45)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

A bar chart of top publishing journals

In [None]:
# Ten most frequent journals (value_counts() sorts by descending count).
top_journals = df_cleaned["journal"].value_counts().iloc[:10]

# Horizontal bar chart: journal names on the y-axis, paper counts on the x-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_journals.values, y=top_journals.index, ax=ax)
ax.set_title("Top 10 Journals Publishing COVID-19 Research", fontsize=16)
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Journal")
ax.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

A word cloud of paper titles

In [None]:
from wordcloud import WordCloud
import re
# Join all non-missing titles into one string.
all_titles = " ".join(df_cleaned["title"].dropna().astype(str))
# Clean text: replace every run of non-letter characters with a single space.
# Substituting "" instead would fuse the words on either side of punctuation
# or line breaks (e.g. "host-pathogen" -> "hostpathogen"), producing bogus
# tokens in the cloud.
all_titles = re.sub(r"[^a-zA-Z ]+", " ", all_titles)
# Generate a word cloud limited to the 100 most prominent words.
wordcloud = WordCloud(width=1200, height=600,
                      background_color='white',
                      max_words=100
                      ).generate(all_titles)

# Render the word cloud as an image without axes.
plt.figure(figsize=(15,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Paper Titles", fontsize=18)
plt.show()

Plotting the distribution of paper counts by source

In [None]:
# Ten most frequent values of `source_x` (value_counts() sorts by descending count).
source_counts = df_cleaned["source_x"].value_counts().iloc[:10]

# Horizontal bar chart: sources on the y-axis, paper counts on the x-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=source_counts.values, y=source_counts.index, ax=ax)
ax.set_title("Distribution of Papers by Source", fontsize=16)
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Source")
ax.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()