# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from datetime import datetime
import re

# Load the metadata table from disk
df = pd.read_csv('../data/metadata.csv')

# First look: overall size, column names, and a sample of rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFirst 5 rows:", df.head(), sep="\n")

# Per-column dtypes and the number of missing entries in each column
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

# Summary statistics as reported by DataFrame.describe()
print("\nNumerical columns statistics:", df.describe(), sep="\n")

# Data cleaning
# Build the cleaned frame as a copy of `df` in a single ordered pipeline.
# assign() evaluates its keyword arguments in order, so later steps can
# read the columns produced by earlier ones.
df_cleaned = df.assign(
    # Missing abstracts become empty strings
    abstract=lambda frame: frame['abstract'].fillna(''),
    # Parse publish_time; values that cannot be parsed become NaT
    publish_time=lambda frame: pd.to_datetime(
        frame['publish_time'], errors='coerce'
    ),
    # Calendar year of the parsed publication date
    year=lambda frame: frame['publish_time'].dt.year,
    # Number of whitespace-separated tokens in each abstract
    abstract_word_count=lambda frame: frame['abstract'].apply(
        lambda text: len(str(text).split())
    ),
)

# Analysis and visualizations
# Publications over time
#
# `year` is float64 whenever any publish_time failed to parse (NaT -> NaN),
# which would label the bars "2020.0" instead of "2020". Drop the unparsed
# rows and cast to int so the tick labels read as plain years. The counts are
# unchanged because value_counts() already ignores NaN.
year_counts = (
    df_cleaned['year']
    .dropna()
    .astype(int)
    .value_counts()
    .sort_index()  # chronological order rather than by frequency
)

plt.figure(figsize=(12, 6))
year_counts.plot(kind='bar')
plt.title('Publications by Year')
plt.xlabel('Year')
plt.ylabel('Number of Publications')
plt.xticks(rotation=45)
plt.tight_layout()
# Save before show(): some backends clear the figure once it is displayed
plt.savefig('../publications_by_year.png')
plt.show()

# Top journals
# value_counts() sorts by descending frequency, so the first ten entries
# are the ten journals with the most papers.
top_journals = df_cleaned['journal'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(12, 6))
top_journals.plot.bar(ax=ax, rot=45)  # rot=45 tilts the x tick labels
ax.set_title('Top 10 Journals by Publication Count')
ax.set_xlabel('Journal')
ax.set_ylabel('Number of Publications')
fig.tight_layout()
fig.savefig('../top_journals.png')
plt.show()

# Word cloud for titles
# Join every non-missing title into one space-separated string
all_titles = ' '.join(df_cleaned['title'].dropna().astype(str).tolist())

# Configure the cloud first, then feed it the text; generate() returns the
# same WordCloud instance it was called on.
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(all_titles)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Paper Titles')
plt.axis('off')  # the image has no meaningful axes
plt.tight_layout()
plt.savefig('../title_wordcloud.png')
plt.show()

# Source distribution
#
# Show the ten most common sources and fold every remaining source into a
# single "Other" slice. Without that slice the pie percentages are computed
# over the top ten only, so each share is overstated whenever more than ten
# distinct sources exist.
source_counts = df_cleaned['source_x'].value_counts()
top_sources = source_counts.head(10)
remainder = int(source_counts.iloc[10:].sum())
if remainder > 0:
    top_sources = pd.concat(
        [top_sources, pd.Series({'Other': remainder}, name=top_sources.name)]
    )

plt.figure(figsize=(10, 6))
top_sources.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Papers by Source')
plt.ylabel('')  # suppress the series name pandas would print as a y label
plt.tight_layout()
plt.savefig('../source_distribution.png')
plt.show()