In [None]:
import pandas as pd

# Load the metadata.csv file.
# low_memory=False makes pandas infer each column's dtype from the whole
# column instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# and gives stable dtypes across runs (at the cost of more memory while parsing).
df = pd.read_csv('metadata.csv', low_memory=False)

# Show the first few rows (last expression, rendered by the notebook)
df.head()


  df = pd.read_csv('metadata.csv')


In [None]:
# How many rows and columns?
print("Shape:", df.shape)

# What are the column names?
print("Columns:", df.columns.tolist())

# What types of data are in each column?
# info() writes its report to stdout itself, so no print is needed.
df.info()

# How many missing values in each column?
# Printed explicitly: a notebook cell only auto-displays its final
# expression, so a bare expression in the middle of the cell is discarded.
print("Missing values per column:\n", df.isnull().sum())

# Look at some titles and abstracts (last expression, rendered by the notebook)
df[['title', 'abstract']].head(10)



In [None]:
# Null count per column, restricted to columns that actually have gaps
# and ordered from most to fewest missing entries.
null_counts = df.isna().sum()
missing_values = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print("Missing values:\n", missing_values)


In [None]:
# Peek at the first ten rows of a handful of columns
preview_columns = ['title', 'abstract', 'publish_time', 'journal']
df[preview_columns].head(10)


In [None]:
# Parse publish_time into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(df['publish_time'], errors='coerce')
df['publish_time'] = parsed_dates

# Derive the publication year from the parsed dates
df['year'] = df['publish_time'].dt.year

# Preview the date column next to the derived year
df[['publish_time', 'year']].head()


In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Word count per abstract; missing abstracts are treated as empty text,
# so they get a count of 0.
df['abstract_word_count'] = df['abstract'].fillna('').map(count_words)

# Preview the abstracts alongside their counts
df[['abstract', 'abstract_word_count']].head()


In [None]:
# Fraction of null entries per column (mean of the boolean null mask)
missing_ratio = df.isna().mean()

# Keep only the columns where more than half of the values are missing
high_missing = missing_ratio.loc[missing_ratio.gt(0.5)]
print("Columns with >50% missing values:\n", high_missing)


In [None]:
# Build the cleaned frame in one chain:
#   1. drop the columns flagged as mostly missing,
#   2. drop rows lacking a title or a publish_time.
df_cleaned = (
    df
    .drop(columns=high_missing.index)
    .dropna(subset=['title', 'publish_time'])
)

# Confirm cleanup
print("Cleaned shape:", df_cleaned.shape)


In [None]:
# Replace missing journal names with the placeholder 'Unknown'
journal_filled = df_cleaned['journal'].fillna('Unknown')
df_cleaned['journal'] = journal_filled


In [None]:
# Number of papers per publication year, in chronological order.
# (The identical cell that followed this one has been merged into it.)
print(df_cleaned['year'].value_counts().sort_index())


In [None]:
# Summary statistics for the abstract word counts
word_count_summary = df_cleaned[['abstract_word_count']].describe()
print(word_count_summary)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count papers by year.
# .dt.year yields floats when the date column contains NaT, which would label
# the bars "2020.0". Rows without a parsed publish_time were dropped when
# df_cleaned was built, so casting the year to int here is lossless.
year_counts = df_cleaned['year'].astype(int).value_counts().sort_index()

# Plot one bar per year, in chronological order
plt.figure(figsize=(10,6))
sns.barplot(x=year_counts.index, y=year_counts.values, palette='Blues')
plt.title('Number of Publications by Year')
plt.xlabel('Year')
plt.ylabel('Number of Papers')
plt.xticks(rotation=45)  # tilt labels so adjacent years do not overlap
plt.tight_layout()
plt.show()


In [None]:
# Count top 10 journals.
# Rows whose journal was missing carry the placeholder 'Unknown'; it is not a
# real journal, so it is excluded from the ranking. (Any remaining NaN values
# are ignored by value_counts by default.)
journal_names = df_cleaned['journal']
top_journals = journal_names[journal_names != 'Unknown'].value_counts().head(10)

# Plot horizontal bars: journal on the y-axis, paper count on the x-axis
plt.figure(figsize=(10,6))
sns.barplot(y=top_journals.index, x=top_journals.values, palette='Greens')
plt.title('Top 10 Journals Publishing COVID-19 Research')
plt.xlabel('Number of Papers')
plt.ylabel('Journal')
plt.tight_layout()
plt.show()


In [None]:
from collections import Counter
import re

# Lower-case every title and join them into one space-separated string
titles = df_cleaned['title'].dropna().str.lower().str.cat(sep=' ')

# Extract runs of four or more letters a-z, then keep the 20 most frequent
title_tokens = re.findall(r'\b[a-z]{4,}\b', titles)
common_words = Counter(title_tokens).most_common(20)

# Split the (word, count) pairs into two parallel tuples
words, counts = zip(*common_words)

# Bar chart of word frequencies
plt.figure(figsize=(10,6))
ax = sns.barplot(x=list(words), y=list(counts), palette='Purples')
ax.set_title('Most Common Words in Paper Titles')
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Ten most frequent values of the source_x column
source_counts = df_cleaned['source_x'].value_counts().head(10)

# Horizontal bar chart: source on the y-axis, paper count on the x-axis
plt.figure(figsize=(10,6))
ax = sns.barplot(y=source_counts.index, x=source_counts.values, palette='Oranges')
ax.set_title('Top Sources of COVID-19 Papers')
ax.set_xlabel('Number of Papers')
ax.set_ylabel('Source')
plt.tight_layout()
plt.show()
