In [8]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

print("CC10 - REAL Machine Learning: Topic Discovery with K-Means")
print("=" * 60)

# ========================================
# STEP 1: FETCH UNLABELED PAPERS
# ========================================
# Query the arXiv Atom API once per year range and flatten every <entry>
# into a dict; the collected rows become the DataFrame `df`.

base_url = "http://export.arxiv.org/api/query?"

# Namespace prefix carried by every element of the Atom feed.
ATOM_NS = "{http://www.w3.org/2005/Atom}"
# Child elements an entry must provide (with text) to yield a usable row.
REQUIRED_TAGS = ("id", "title", "published", "summary")
# Abort a stalled HTTP request instead of blocking the notebook forever.
REQUEST_TIMEOUT_SECONDS = 30
# Pause between consecutive API calls; arXiv asks clients for ~3 seconds.
REQUEST_DELAY_SECONDS = 3

year_ranges = [
    ("2015", "2016"),
    ("2018", "2019"),
    ("2022", "2023"),
    ("2024", "2024"),
]

papers = []

for request_number, (start_year, end_year) in enumerate(year_ranges):
    # Throttle before every request except the first, so the pause is kept
    # even when the previous iteration bailed out early via `continue`.
    if request_number:
        time.sleep(REQUEST_DELAY_SECONDS)

    # NOTE(review): the arXiv manual documents submittedDate bounds as
    # YYYYMMDDTTTT; confirm the 8-digit form used here filters as intended.
    params = {
        "search_query": f"cat:cs.* AND submittedDate:[{start_year}0101 TO {end_year}1231]",
        "start": 0,
        "max_results": 250,
        "sortBy": "relevance",
        "sortOrder": "descending",
    }

    print(f"Fetching papers from {start_year}-{end_year}...")

    # A connection error or timeout skips this range rather than killing
    # the whole run, mirroring how a non-200 status is treated below.
    try:
        response = requests.get(
            base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        continue

    if response.status_code != 200:
        print(f"Request failed: {response.status_code}")
        continue

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as exc:
        print(f"Could not parse response: {exc}")
        continue

    for entry in root.findall(f"{ATOM_NS}entry"):
        # findtext() returns None for a missing child and "" for an empty
        # one; either way the entry cannot produce a complete row.
        fields = {
            tag: entry.findtext(f"{ATOM_NS}{tag}") for tag in REQUIRED_TAGS
        }
        if not all(fields.values()):
            continue

        paper = {
            # The id is a URL; its last path segment is the arXiv identifier.
            "arxiv_id": fields["id"].split("/")[-1],
            "title": fields["title"].strip().replace("\n", " "),
            # The first four characters of the timestamp are the year.
            "year": int(fields["published"][:4]),
            "abstract": fields["summary"].strip().replace("\n", " "),
        }

        # Drop <category> elements without a `term` so join() never sees None.
        categories = [
            cat.get("term")
            for cat in entry.findall(f"{ATOM_NS}category")
            if cat.get("term")
        ]
        paper["categories"] = ", ".join(categories)

        papers.append(paper)

df = pd.DataFrame(papers)
print(f"Total papers fetched: {len(df)}")

# Every later step indexes columns that only exist when at least one paper
# was collected; fail here with a clear message instead of a KeyError.
if df.empty:
    raise RuntimeError("No papers were fetched from arXiv; cannot continue.")

# ========================================
# STEP 2: TEXT PREPROCESSING
# ========================================
# One lowercase document per paper: "<title>. <abstract>".

title_and_abstract = df["title"].str.cat(df["abstract"], sep=". ")
df["text"] = title_and_abstract.str.lower()

# ========================================
# STEP 3: TF-IDF
# ========================================
# Vocabulary is capped at 200 terms; English stop words are removed, and
# terms found in fewer than 2 documents or in more than 80% are ignored.

tfidf_settings = {
    "max_features": 200,
    "stop_words": "english",
    "min_df": 2,
    "max_df": 0.8,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

documents = df["text"]
tfidf_matrix = vectorizer.fit_transform(documents)

# ========================================
# STEP 4: OPTIMAL K
# ========================================
# Fit K-Means for every candidate k, record inertia and silhouette score,
# persist the sweep to CSV, then pick the k with the highest silhouette.

# silhouette_score needs 2 <= n_clusters <= n_samples - 1, so the sweep's
# upper bound (30) is capped by the number of documents actually available.
n_samples = tfidf_matrix.shape[0]
K_range = range(2, min(30, n_samples - 1) + 1)
if not K_range:
    raise ValueError(
        f"Need at least 3 documents to evaluate clusterings, got {n_samples}"
    )

silhouette_scores = []
inertias = []

for k in K_range:
    model = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = model.fit_predict(tfidf_matrix)
    score = silhouette_score(tfidf_matrix, labels)
    inertia = model.inertia_
    inertias.append(inertia)
    print(f"k={k}: Silhouette={score:.3f}, Inertia={inertia:.2f}")
    silhouette_scores.append(score)

# Keep the DataFrame itself: to_csv() returns None when given a path, so
# chaining it onto the constructor would leave `cluster_analysis` as None.
cluster_analysis = pd.DataFrame({
    'k': list(K_range),
    'inertia': inertias,
    'silhouette': silhouette_scores
})
cluster_analysis.to_csv("cluster_analysis.csv", index=False)

# Derive the choice from the sweep instead of a hard-coded value so the
# reported "optimal" k always matches the data that was just fetched.
# idxmax() returns the first maximum, i.e. the smallest k on a tie.
best_row = cluster_analysis["silhouette"].idxmax()
best_k = int(cluster_analysis.loc[best_row, "k"])
print(f"Optimal number of clusters: {best_k}")

# ========================================
# STEP 5: FINAL K-MEANS
# ========================================
# Refit with the chosen number of clusters and tag each paper with its label.

kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)
df["cluster"] = cluster_labels

# ========================================
# STEP 6: INTERPRET CLUSTERS
# ========================================
# For every centroid keep the 10 highest-weighted vocabulary terms, ordered
# from strongest to weakest.

feature_names = vectorizer.get_feature_names_out()

cluster_topics = {
    cluster_id: [
        feature_names[term_idx]
        for term_idx in centroid.argsort()[::-1][:10]
    ]
    for cluster_id, centroid in enumerate(kmeans.cluster_centers_)
}


def _topic_label(cluster_id):
    """Build a "First-Second" label from a cluster's two strongest terms."""
    top_terms = cluster_topics[cluster_id]
    return f"{top_terms[0].title()}-{top_terms[1].title()}"


df["topic"] = df["cluster"].map(_topic_label)

# ========================================
# STEP 7: SIMULATE CITATIONS
# ========================================
# Citation counts are synthetic: 250 per year of age plus a term that cycles
# with the row position (0, 10, ..., 990), clamped to the range [50, 3000].

current_year = 2025
df["age"] = current_year - df["year"]

age_component = df["age"] * 250
position_component = (df.index % 100) * 10
raw_citations = age_component + position_component
df["citations"] = raw_citations.clip(50, 3000)

# Sequential 1-based identifier in row order.
row_count = len(df)
df["paper_id"] = range(1, row_count + 1)

# ========================================
# STEP 8: PCA
# ========================================
# Project the TF-IDF vectors onto two principal components; the sparse
# matrix is converted to a dense array before being handed to PCA.

dense_tfidf = tfidf_matrix.toarray()
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(dense_tfidf)

# Column 0 is the first component, column 1 the second.
df["pca_x"] = coords[:, 0]
df["pca_y"] = coords[:, 1]

# ========================================
# SAVE FILES
# ========================================
# Write the selected columns, in this order, without the index.

export_columns = [
    "paper_id",
    "title",
    "topic",
    "citations",
    "year",
    "cluster",
    "arxiv_id",
    "pca_x",
    "pca_y",
]
export_frame = df[export_columns]
export_frame.to_csv("ml_papers_real.csv", index=False)

print("Saved ml_papers_real.csv")


CC10 - REAL Machine Learning: Topic Discovery with K-Means
Fetching papers from 2015-2016...
Fetching papers from 2018-2019...
Fetching papers from 2022-2023...
Fetching papers from 2024-2024...
Total papers fetched: 1000
k=2: Silhouette=0.019, Inertia=885.66
k=3: Silhouette=0.018, Inertia=871.18
k=4: Silhouette=0.024, Inertia=857.66
k=5: Silhouette=0.022, Inertia=847.28
k=6: Silhouette=0.025, Inertia=837.03
k=7: Silhouette=0.030, Inertia=828.03
k=8: Silhouette=0.030, Inertia=818.82
k=9: Silhouette=0.036, Inertia=809.95
k=10: Silhouette=0.034, Inertia=804.55
k=11: Silhouette=0.040, Inertia=795.10
k=12: Silhouette=0.036, Inertia=792.92
k=13: Silhouette=0.042, Inertia=782.03
k=14: Silhouette=0.042, Inertia=778.67
k=15: Silhouette=0.042, Inertia=771.40
k=16: Silhouette=0.042, Inertia=767.39
k=17: Silhouette=0.044, Inertia=763.46
k=18: Silhouette=0.046, Inertia=755.71
k=19: Silhouette=0.047, Inertia=752.75
k=20: Silhouette=0.046, Inertia=751.14
k=21: Silhouette=0.049, Inertia=743.51
k=22: 