In [None]:
!pip -q install feedparser scipy

In [None]:
# ================================================================
#  arXiv   L L M / G P T 関連論文　投稿動向比較
#          Period A : 2025-04-04 – 2025-05-03
#          Period B : 2024-04-04 – 2024-05-03
# ================================================================
!pip -q install feedparser python-dateutil

import math, sys, datetime as dt, itertools, requests, feedparser
from urllib.parse import quote
from dateutil import parser as dtp
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 120

# ---------- 1.  query parameters --------------------------------
# Search keywords; each one is looked up with the `all:` field prefix.
KEYWORDS = [
    'GPT',
    'LLM',
    '"Generative AI"',
]
# arXiv subject categories the search is restricted to.
CATEGORIES = [
    'cs.CL',
    'cs.AI',
]
# Page size: number of entries requested per API call.
MAX_PER_CALL = 2000

# Comparison windows as (first day, last day), both days included.
PERIOD_A = dt.date(2025, 4, 4), dt.date(2025, 5, 3)
PERIOD_B = dt.date(2024, 4, 4), dt.date(2024, 5, 3)

# ---------- 2.  helper functions --------------------------------
def build_query(start: dt.date, end: dt.date,
                keywords=None, categories=None) -> str:
    """
    Build the URL-encoded ``search_query`` string for the arXiv API.

    The query has the form
    ``(<keywords OR-ed>) AND (<categories OR-ed>) AND submittedDate:[a TO b]``.

    Parameters
    ----------
    start, end : datetime.date
        First and last day of the submission window; both days are included.
    keywords : iterable of str, optional
        Search terms, each prefixed with ``all:``. Defaults to ``KEYWORDS``.
    categories : iterable of str, optional
        arXiv categories, each prefixed with ``cat:``. Defaults to
        ``CATEGORIES``.

    Returns
    -------
    str
        The percent-encoded query, with spaces and punctuation escaped as well.
    """
    keywords = KEYWORDS if keywords is None else keywords
    categories = CATEGORIES if categories is None else categories

    clauses = []
    kw_part = ' OR '.join(f'all:{k}' for k in keywords)
    if kw_part:                       # skip the group instead of emitting "()"
        clauses.append(f'({kw_part})')
    cat_part = ' OR '.join(f'cat:{c}' for c in categories)
    if cat_part:
        clauses.append(f'({cat_part})')

    # The [a TO b] range includes both end points, so the window must stop at
    # the last second of `end`; using the next day's 00:00:00 would also admit
    # a paper stamped exactly at midnight of the following day.
    clauses.append(f'submittedDate:[{start.strftime("%Y%m%d")}000000 TO '
                   f'{end.strftime("%Y%m%d")}235959]')

    raw = ' AND '.join(clauses)
    return quote(raw, safe='')       # encode everything, including spaces

def harvest(start: dt.date, end: dt.date):
    """
    Download every paper matching the query for the given period.

    Pages through the arXiv API ``MAX_PER_CALL`` entries at a time, sorted by
    ascending submission date, and collects the ``published`` date of each
    entry.

    Parameters
    ----------
    start, end : datetime.date
        Bounds of the submission window passed on to ``build_query``.

    Returns
    -------
    list of datetime.date
        One date per returned entry, in the order the API delivered them.

    Raises
    ------
    RuntimeError
        If the API answers with an HTTP error status. Without this check an
        error page looks like "no more results" and the data set would be
        silently truncated.
    """
    import time                      # local import; only needed for paging

    q = build_query(start, end)
    idx, dates = 0, []

    print(f'  fetching {start} – {end} … ', end=''); sys.stdout.flush()
    while True:
        url = (f'https://export.arxiv.org/api/query?'
               f'search_query={q}&start={idx}&max_results={MAX_PER_CALL}&'
               f'sortBy=submittedDate&sortOrder=ascending')
        feed = feedparser.parse(url)

        # `status` is only present for HTTP fetches; redirects (3xx) are fine.
        status = feed.get('status', 200)
        if status >= 400:
            raise RuntimeError(
                f'arXiv API request failed with HTTP status {status} '
                f'(start={idx})')

        if not feed.entries:
            break
        dates += [dtp.parse(e.published).date() for e in feed.entries]
        idx   += MAX_PER_CALL

        # Stop as soon as the reported total has been covered, which saves
        # the extra request that would only return an empty page.
        total = feed.feed.get('opensearch_totalresults')
        if total is not None and str(total).isdigit() and idx >= int(total):
            break

        # arXiv asks API clients to wait about 3 s between successive calls.
        time.sleep(3)
    print(f'done  ({len(dates)} entries)')
    return dates

def to_daily_series(date_list, start: dt.date, end: dt.date) -> pd.Series:
    """
    Turn a list of dates into a per-day count series.

    The result is indexed by every calendar day from `start` to `end`
    (both included); days without any entry get a count of 0.
    """
    stamps = pd.to_datetime(date_list)
    # One unit per entry, then add the units up within each calendar day.
    ones = pd.Series(1, index=stamps)
    per_day = ones.resample('D').sum()

    calendar = pd.date_range(start, end, freq='D')
    return per_day.reindex(calendar, fill_value=0)

# ---------- 3.  data download & aggregation ---------------------
# Fetch the publication dates for both windows (network access).
dates_A = harvest(*PERIOD_A)
dates_B = harvest(*PERIOD_B)

# Daily counts over each window; days without submissions count as 0.
ser_A = to_daily_series(dates_A, *PERIOD_A)
ser_B = to_daily_series(dates_B, *PERIOD_B)

# Each period is normalised by its own length, so the rates stay correct
# even when the two windows do not span the same number of days.
n_days_A, n_days_B = len(ser_A), len(ser_B)
n_days = n_days_A                 # 30 days with the periods defined above

# Plain ints so the `d` format below does not depend on the Series dtype.
S_A, S_B = int(ser_A.sum()), int(ser_B.sum())
lam_A, lam_B = S_A / n_days_A, S_B / n_days_B

print('\n=== descriptive statistics ===')
print(f'S_A = {S_A:4d}  ⇒  λ̂_A = {lam_A:6.2f} /day')
print(f'S_B = {S_B:4d}  ⇒  λ̂_B = {lam_B:6.2f} /day')

# ---------- 4.  likelihood-ratio test (Poisson rates) -----------
# Exposure of each period in days. The pooled rate and the expected counts
# are weighted by it, so the test also holds for windows of unequal length
# (for equal lengths this reduces to (S_A + S_B) / (2 * n_days)).
days_A, days_B = len(ser_A), len(ser_B)
lam_pool = (S_A + S_B) / (days_A + days_B)      # common rate under H0

def safe_term(S, expected=None):
    """
    Return ``S * log(S / expected)``, defined as 0 when ``S`` is 0.

    Treating the ``0 * log 0`` case as 0 avoids a math domain error.
    `expected` is the count expected under the null hypothesis; when it is
    omitted, ``n_days * lam_pool`` is used.
    """
    if S == 0:
        return 0.0
    if expected is None:
        expected = n_days * lam_pool
    return S * math.log(S / expected)

# G = 2 * sum(observed * log(observed / expected)); the linear terms cancel
# because observed and expected totals are equal. G is non-negative in exact
# arithmetic, so the clamp only removes tiny negative rounding residue.
G = max(0.0, 2.0 * (safe_term(S_A, days_A * lam_pool)
                    + safe_term(S_B, days_B * lam_pool)))
p_val = stats.chi2.sf(G, df=1)

print('\n=== LRT  Poisson rates ===')
print(f'G = {G:8.3f},   p-value = {p_val:.3e}')
print('→', '差が **有意あり**' if p_val < 0.05 else '差は **有意ではない**')

# ---------- 5.  quick visualisation ------------------------------
# Plot both periods against the day offset within the period. Plotting on
# each series' own calendar index would place the curves a year apart on the
# x-axis instead of overlaying them for comparison.
plt.figure(figsize=(10, 4))
plt.plot(range(len(ser_B)), ser_B.values, label='2024-04-04 – 05-03', lw=2)
plt.plot(range(len(ser_A)), ser_A.values, label='2025-04-04 – 05-03', lw=2)
plt.xlabel('days since period start')
plt.ylabel('daily submissions')
plt.grid(alpha=.3); plt.legend()
plt.tight_layout(); plt.show()
