# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

# Global plot defaults: seaborn's "whitegrid" theme, and a default figure
# size of 12 x 8 (matplotlib interprets figsize in inches).
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Load the data.
# low_memory=False makes pandas parse the file in a single pass rather than
# in internal chunks, so each column's dtype is inferred from all its values.
df = pd.read_csv('metadata.csv', low_memory=False)

# Basic exploration: overall size, per-column dtypes, and the ten columns
# with the most missing values (largest count first).
print("Dataset shape:", df.shape)
print("\nData types:", df.dtypes, sep="\n")
missing_per_column = df.isnull().sum()
most_missing = missing_per_column.sort_values(ascending=False).head(10)
print("\nMissing values:", most_missing, sep="\n")

# Data cleaning
def _abstract_word_count(abstract):
    """Return the number of whitespace-separated tokens, or 0 if missing."""
    if pd.isnull(abstract):
        return 0
    return len(str(abstract).split())


# Work on a copy so the raw frame `df` is left untouched.
df_clean = df.copy()

# Parse publication dates. errors='coerce' turns values that cannot be parsed
# into NaT instead of raising; those rows end up with a missing year.
# NOTE(review): if publish_time mixes formats (e.g. year-only and full dates),
# newer pandas versions may coerce the minority format to NaT -- confirm
# against the data.
parsed_publish_time = pd.to_datetime(df_clean['publish_time'], errors='coerce')
df_clean['publish_time'] = parsed_publish_time
df_clean['year'] = parsed_publish_time.dt.year

# Word count per abstract; missing abstracts count as 0 words.
df_clean['abstract_word_count'] = df_clean['abstract'].apply(_abstract_word_count)

# Papers with no journal recorded get an explicit placeholder label.
df_clean['journal'] = df_clean['journal'].fillna('Unknown Journal')

# Analysis
# 1. Publications over time
# value_counts() leaves out rows whose year is missing; sort_index() then
# orders the remaining years chronologically for the line plot.
papers_per_year = df_clean['year'].value_counts()
yearly_counts = papers_per_year.sort_index()

fig_years, ax_years = plt.subplots()
ax_years.plot(
    yearly_counts.index,
    yearly_counts.values,
    marker='o',
    linewidth=2,
)
ax_years.set_xlabel('Year')
ax_years.set_ylabel('Number of Publications')
ax_years.set_title('COVID-19 Publications Over Time')
ax_years.grid(True)
fig_years.savefig('publications_over_time.png')
plt.show()

# 2. Top journals
# The ten most frequent journal names, in descending order of paper count.
# NOTE: the cleaning step fills missing journals with 'Unknown Journal', so
# that placeholder is counted here and may appear as one of the bars.
top_journals = df_clean['journal'].value_counts().head(10)
plt.figure()
plt.barh(top_journals.index, top_journals.values)
# barh draws the first category at the bottom; flip the y-axis so the
# ranking reads top-to-bottom with the most frequent journal first.
plt.gca().invert_yaxis()
plt.xlabel('Number of Publications')
plt.title('Top 10 Journals by COVID-19 Publications')
# Long journal names would otherwise be clipped at the left edge.
plt.tight_layout()
plt.savefig('top_journals.png')
plt.show()

# 3. Word cloud of titles
# Join every non-missing title into one string, then delete each character
# that is not an ASCII letter or whitespace. Digits and punctuation are
# removed outright (not replaced by a space).
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
available_titles = df_clean['title'].dropna()
text = non_letter_pattern.sub('', ' '.join(available_titles.values))

wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(text)

fig_cloud, ax_cloud = plt.subplots()
ax_cloud.imshow(wordcloud, interpolation='bilinear')
ax_cloud.axis('off')  # the axes carry no meaning for a word cloud image
ax_cloud.set_title('Word Cloud of Paper Titles')
fig_cloud.savefig('title_wordcloud.png')
plt.show()

# 4. Source distribution
# Ten most frequent values of `source_x`. The percentages drawn on the pie
# are shares of these ten slices, since only they are passed to the chart.
papers_per_source = df_clean['source_x'].value_counts()
source_counts = papers_per_source.head(10)

fig_sources, ax_sources = plt.subplots()
ax_sources.pie(
    source_counts.values,
    labels=source_counts.index,
    autopct='%1.1f%%',
)
ax_sources.set_title('Top 10 Sources of Papers')
fig_sources.savefig('source_distribution.png')
plt.show()

# Additional analysis: Abstract word count distribution
# Zero-word rows (missing or empty abstracts) are left out of the histogram.
abstract_lengths = df_clean['abstract_word_count']
nonempty_lengths = abstract_lengths[abstract_lengths > 0]

fig_lengths, ax_lengths = plt.subplots()
ax_lengths.hist(nonempty_lengths, bins=50)
ax_lengths.set_xlabel('Word Count')
ax_lengths.set_ylabel('Frequency')
ax_lengths.set_title('Distribution of Abstract Word Counts')
fig_lengths.savefig('abstract_wordcount.png')
plt.show()