In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# wordcloud is an optional dependency: record whether it could be imported so
# the word cloud section further down can be skipped instead of crashing.
try:
    from wordcloud import WordCloud
except ImportError:
    WORDCLOUD_AVAILABLE = False
    print("⚠️ WordCloud not installed. Skipping word cloud visualization.")
else:
    WORDCLOUD_AVAILABLE = True

# Read the metadata sample into a DataFrame.
df = pd.read_csv("metadata_sample.csv")

# Show the dimensions and the first few rows as a sanity check.
print("Dataset shape:", df.shape)
display(df.head())

# Discard rows that lack a title or a publication date, then derive the
# publication year. Dates that cannot be parsed are coerced to NaT, so their
# year ends up missing rather than raising.
required_columns = ["title", "publish_time"]
df = df.dropna(subset=required_columns)
publish_dates = pd.to_datetime(df["publish_time"], errors="coerce")
df["year"] = publish_dates.dt.year

# --- Analysis ---
# 1. Publications per year
from matplotlib.ticker import MaxNLocator

# publish_time values that failed to parse leave a missing year, which also
# turns the whole column into floats. Drop the missing entries and cast back
# to int so the bars are keyed by whole years (2020, not 2020.0).
year_counts = df["year"].dropna().astype(int).value_counts().sort_index()
plt.figure(figsize=(8, 5))
plt.bar(year_counts.index, year_counts.values)
# Years are discrete: restrict tick positions to integers so the axis cannot
# end up with fractional-year ticks.
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.title("Publications by Year")
plt.xlabel("Year")
plt.ylabel("Number of Papers")
plt.show()

# 2. Top Journals
# value_counts() orders journals from most to least frequent (missing journal
# names are not counted), so the first ten entries are the ten most common.
journal_counts = df["journal"].value_counts()
top_journals = journal_counts.iloc[:10]

# Horizontal bars: journal names on the y axis, paper counts on the x axis.
plt.figure(figsize=(8, 5))
sns.barplot(x=top_journals.values, y=top_journals.index)
plt.title("Top Journals")
plt.xlabel("Number of Papers")
plt.show()

# 3. Word Cloud (optional)
# Only attempted when the optional wordcloud package was imported successfully.
if WORDCLOUD_AVAILABLE:
    # Coerce every title to str: an object column can contain non-string
    # values, and str.join raises TypeError on anything that is not a str.
    text = " ".join(df["title"].dropna().astype(str))
    try:
        wordcloud = WordCloud(
            width=800, height=400, background_color="white"
        ).generate(text)
    except ValueError as err:
        # generate() raises ValueError when the text yields no words to draw
        # (e.g. no rows survived the earlier filtering). Report and move on
        # instead of aborting the rest of the analysis.
        print(f"Skipping word cloud plot: {err}")
    else:
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title("Most Common Words in Titles")
        plt.show()
else:
    print("Skipping word cloud plot because WordCloud is not installed.")
