Check that `urls_dic` in the `RST_Preprocessed_SBS` collection contains links whose category is "Cases" (print the first one found).

In [None]:
from pymongo import MongoClient

MONGO_URI = "MONGO_URI"
DB_NAME = "copyright"
SOURCE_COL = "RST_Preprocessed_SBS"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
src = db[SOURCE_COL]

# Sanity check: print the first urls_dic entry whose category is "Cases",
# then stop scanning.
found = False

# Only the fields printed below are fetched (plus the default _id).
cursor = src.find(
    {"urls_dic": {"$exists": True}},
    {"pdf": 1, "opinion_id": 1, "urls_dic": 1},
    no_cursor_timeout=True,
)

# The cursor was opened with no_cursor_timeout=True, so it has to be closed
# explicitly; the context manager closes it on every exit path.
with cursor:
    for doc in cursor:
        urls = doc.get("urls_dic")
        if not isinstance(urls, list):
            continue

        # Entries that are not dicts are skipped instead of raising
        # AttributeError on .get().
        hit = next(
            (
                u
                for u in urls
                if isinstance(u, dict) and u.get("category") == "Cases"
            ),
            None,
        )
        if hit is None:
            continue

        print("✅ FOUND ONE CASE LINK")
        print("PDF:", doc.get("pdf"))
        print("Opinion ID:", doc.get("opinion_id"))
        print("Raw text:", hit.get("raw_text"))
        print("Link:", hit.get("link"))
        print("Confidence:", hit.get("confidence"))
        found = True
        break

if not found:
    print("❌ No 'Cases' category found at all.")


✅ FOUND ONE CASE LINK
PDF: cp01.pdf
Opinion ID: 0
Raw text: Washingtonian Pub. Co. v. Pearson, 140 F.2d 465
Link: https://advance.lexis.com/api/document?id=urn:contentItem:3RTW-Y5P0-003B-T310-00000-00&idtype=PID&context=1516831
Confidence: 0.85


Create a new collection called `case_link` that stores all the "Cases" links.

In [None]:
from pymongo import MongoClient, UpdateOne

MONGO_URI = "MONGO_URI"
DB_NAME = "copyright"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]

src = db["RST_Preprocessed_SBS"]
tgt = db["case_link"]

# Rebuild case_link from scratch. NOTE: drop() discards any existing
# case_link collection; the source collection is only read from.
tgt.drop()
# One document per distinct link; the upserts below key on this field.
tgt.create_index("link", unique=True)

BATCH_SIZE = 2000
ops = []
processed_docs = 0
processed_hits = 0


def _case_link_upsert(entry, source_id):
    """Build the upsert for one urls_dic entry.

    Returns an UpdateOne that adds ``source_id`` (and the stripped
    ``raw_text`` citation, when it is a non-empty string) to the case_link
    document for the entry's link. Returns None when the entry is not a
    dict, is not in the "Cases" category, or has no usable link.
    """
    # Non-dict entries are skipped rather than crashing the rebuild after
    # the target collection has already been dropped.
    if not isinstance(entry, dict) or entry.get("category") != "Cases":
        return None

    link = entry.get("link")
    if not isinstance(link, str) or not link.strip():
        return None
    link = link.strip()

    additions = {"source_ids": source_id}

    # Only add a citation when it has a value (keeps None and empty
    # strings out of the array).
    citation = entry.get("raw_text")
    if isinstance(citation, str) and citation.strip():
        additions["citations"] = citation.strip()

    return UpdateOne(
        {"link": link},
        {"$setOnInsert": {"link": link}, "$addToSet": additions},
        upsert=True,
    )


cursor = src.find(
    {"urls_dic": {"$exists": True}},
    {"urls_dic": 1},  # _id comes by default
    no_cursor_timeout=True,
)

# The cursor was opened with no_cursor_timeout=True, so it has to be closed
# explicitly; the context manager closes it on every exit path.
with cursor:
    for doc in cursor:
        source_id = doc["_id"]
        urls = doc.get("urls_dic")

        if not isinstance(urls, list):
            continue

        for u in urls:
            op = _case_link_upsert(u, source_id)
            if op is None:
                continue

            ops.append(op)
            processed_hits += 1

            # Flush in batches so the pending list stays bounded.
            if len(ops) >= BATCH_SIZE:
                tgt.bulk_write(ops, ordered=False)
                ops = []

        # Counts only documents whose urls_dic is a list.
        processed_docs += 1

    # Write whatever is left over from the last partial batch.
    if ops:
        tgt.bulk_write(ops, ordered=False)

print("Processed source docs:", processed_docs)
print("Processed case-link hits:", processed_hits)
print("Distinct links stored:", tgt.count_documents({}))
print("Sample:", tgt.find_one({}, {"_id": 0, "link": 1, "source_ids": {"$slice": 3}, "citations": {"$slice": 5}}))


Processed source docs: 2183
Processed case-link hits: 142221
Distinct links stored: 26894
Sample: {'link': 'https://advance.lexis.com/api/document?id=urn:contentItem:3RTW-Y5P0-003B-T310-00000-00&idtype=PID&context=1516831', 'citations': ['Washingtonian Pub. Co. v. Pearson, 140 F.2d 465'], 'source_ids': [ObjectId('6811b8d1af06ea2cad500b94')]}


In [None]:
import re
from pymongo import MongoClient, UpdateOne

# --- target database and collection ---
MONGO_URI = "MONGO_URI"
DB_NAME = "copyright"
COLLECTION = "case_link"
# --------------------------------------

# Captures the "urn:contentItem:..." identifier embedded in a link
# (matched case-insensitively).
URN_RE = re.compile(r"(urn:contentItem:[A-Z0-9\-]+)", re.IGNORECASE)

client = MongoClient(MONGO_URI)
col = client[DB_NAME][COLLECTION]

# Documents that have a string "link" but no "urn" field yet.
pending = col.find({"urn": {"$exists": False}, "link": {"$type": "string"}})

ops = []
for doc in pending:
    match = URN_RE.search(doc["link"])
    if match is not None:
        # Store the matched identifier on the same document.
        ops.append(
            UpdateOne(
                {"_id": doc["_id"]},
                {"$set": {"urn": match.group(1)}}
            )
        )

# Send all queued updates in a single bulk request.
if not ops:
    print("Nothing to update.")
else:
    result = col.bulk_write(ops)
    print("Modified:", result.modified_count)


Modified: 26894


Check whether there are duplicate URNs. <br>
If so, how many?

In [2]:
from collections import Counter
# Pull the "urn" of every document in `col` that has one
# (`col` is the collection bound by an earlier cell).
urn_cursor = col.find({"urn": {"$exists": True}}, {"urn": 1})
urns = [record["urn"] for record in urn_cursor]

# Tally how often each urn occurs, then keep only those seen more than once.
counter = Counter(urns)
duplicates = {
    urn_value: occurrences
    for urn_value, occurrences in counter.items()
    if occurrences > 1
}

# The printed label is Chinese for "number of duplicated URNs".
print("重複的 URN 數量:", len(duplicates))


重複的 URN 數量: 2156


Check how many times each duplicated URN appears.

In [3]:
# One line per duplicated urn: "<urn> → <number of documents carrying it>".
for urn, count in duplicates.items():
    print(f"{urn} → {count}")


urn:contentItem:3RTW-Y5P0-003B-T310-00000-00 → 5
urn:contentItem:3RVD-55F0-0054-704P-00000-00 → 3
urn:contentItem:3S4N-82K0-0054-714M-00000-00 → 9
urn:contentItem:3S4P-34S0-0039-S4Y3-00000-00 → 6
urn:contentItem:3S4P-2610-0039-S41N-00000-00 → 4
urn:contentItem:3S4P-1XY0-0039-S3P9-00000-00 → 3
urn:contentItem:3S4N-WS80-0039-S4BN-00000-00 → 2
urn:contentItem:3S4N-VFG0-0039-S313-00000-00 → 7
urn:contentItem:3S4N-TG10-0039-S0JC-00000-00 → 6
urn:contentItem:3S4N-T5H0-0039-S506-00000-00 → 3
urn:contentItem:3S4N-T220-0039-S4KP-00000-00 → 3
urn:contentItem:3S4N-SCR0-0039-S39F-00000-00 → 4
urn:contentItem:3S4X-0GT0-0039-W0C2-00000-00 → 3
urn:contentItem:3S4X-09R0-0039-W022-00000-00 → 7
urn:contentItem:3S4N-N380-0039-S0WF-00000-00 → 5
urn:contentItem:3S4N-MHT0-0039-S55G-00000-00 → 6
urn:contentItem:3S4N-MD80-0039-S4W7-00000-00 → 13
urn:contentItem:3S4W-YG20-0039-W2KK-00000-00 → 15
urn:contentItem:3S4N-JBR0-0039-S23H-00000-00 → 2
urn:contentItem:3S4N-GWG0-0039-S0CJ-00000-00 → 8
urn:contentItem:3S

Keep each URN only once and write the merged data into a new collection, `case_urn`.

In [None]:
DB_NAME = "copyright"
SRC = "case_link"
DST = "case_urn"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]


def _flatten(array_of_arrays):
    """Return a $reduce expression that concatenates the arrays in the given field.

    Null elements are treated as empty arrays. Concatenation keeps every
    element, so repeated values are not removed.
    """
    return {
        "$reduce": {
            "input": array_of_arrays,
            "initialValue": [],
            "in": {"$concatArrays": ["$$value", {"$ifNull": ["$$this", []]}]}
        }
    }


# Only documents with a non-empty string urn take part.
match_stage = {"$match": {"urn": {"$type": "string", "$ne": ""}}}

# One group per urn: keep each contributing document under "link", gather
# the per-document arrays for flattening, and count the documents.
group_stage = {"$group": {
    "_id": "$urn",
    "link": {"$push": {
        "_id": "$_id",
        "link": "$link",
        "citations": "$citations",
        "source_ids": "$source_ids"
    }},
    "citations_all": {"$push": "$citations"},
    "source_ids_all": {"$push": "$source_ids"},
    "n_docs": {"$sum": 1}
}}

# Expose the group key as "urn" and flatten the gathered arrays.
project_stage = {"$project": {
    "_id": 0,
    "urn": "$_id",
    "n_docs": 1,
    "link": 1,
    "citations": _flatten("$citations_all"),
    "source_ids": _flatten("$source_ids_all")
}}

# The final $out stage writes the result into the DST collection.
pipeline = [match_stage, group_stage, project_stage, {"$out": DST}]

db[SRC].aggregate(pipeline, allowDiskUse=True)
print(f"Done. Wrote collection: {DST}")


Done. Wrote collection: case_urn
