In [1]:
import requests
import pandas as pd
import io
from urllib.parse import quote # <-- Import the 'quote' function

# Record attributes requested through the payload's "_source" key.
_LENS_SOURCE_FIELDS = [
    "abstract", "agent", "applicant",
    "application_reference.date", "application_reference.doc_number",
    "application_reference.kind", "application_reference.jurisdiction",
    "assistant_examiner",
    "cited_by.patent_count", "cited_by.patent.lens_id", "cites_patent",
    "claim", "class_cpc", "class_ipcr", "class_national",
    "date_published", "doc_key", "doc_number",
    "earliest_priority_claim_date", "examiner",
    "family.extended.id", "family.extended.size",
    "family.simple", "family.simple.id", "family.simple.size",
    "has_abstract", "has_claim", "has_description", "has_docdb",
    "has_examiner", "has_full_text", "has_title",
    "inventor", "jurisdiction", "kind", "legal_status",
    "lens_internal.legacy_pub_key",
    "npl_citation_count", "owner_all", "patent_citation_count",
    "primary_examiner", "priority_claim", "publication_type",
    "record_lens_id",
    "reference_cited.npl_count", "reference_cited.npl_resolved_count",
    "reference_cited.patent_count", "reference_cited.patent.lens_id",
    "sequence", "title",
]

# Column identifiers requested through the payload's "fields" key.
_LENS_EXPORT_COLUMNS = [
    "JURISDICTION", "KIND", "DISPLAY_KEY", "LENS_ID",
    "PUBLICATION_DATE", "PUBLICATION_YEAR",
    "APPLICATION_NUMBER", "APPLICATION_DATE",
    "PRIORITY_NUMBERS", "EARLIEST_PRIORITY_DATE",
    "TITLE", "ABSTRACT", "APPLICANTS", "INVENTORS", "OWNERS", "URL",
    "PUBLICATION_TYPE", "HAS_FULL_TEXT",
    "CITES_PATENT_COUNT", "CITED_BY_PATENT_COUNT",
    "SIMPLE_FAMILY_SIZE", "SIMPLE_FAMILY_MEMBER_LENS_IDS",
    "SIMPLE_FAMILY_MEMBER_JURISDICTIONS",
    "EXTENDED_FAMILY_SIZE", "EXTENDED_FAMILY_MEMBER_LENS_IDS",
    "EXTENDED_FAMILY_MEMBER_JURISDICTIONS",
    "SEQUENCE_COUNT",
    "CPC_CLASSIFICATIONS", "IPCR_CLASSIFICATIONS", "US_CLASSIFICATIONS",
    "NPL_CITATION_COUNT", "NPL_RESOLVED_CITATION_COUNT",
    "NPL_RESOLVED_LENS_IDS", "NPL_RESOLVED_EXTERNAL_IDS", "NPL_CITATIONS",
    "PATENT_STATUS",
]

# JSON body that every export function in this cell POSTs unchanged.
# Only the URL query string differs from one export mode to the next.
BASE_PAYLOAD = {
    # Paging / result-window settings.
    "track_total_hits": True,
    "size": 2000,
    "from": 0,
    "_source": _LENS_SOURCE_FIELDS,
    # Highlighting options sent along with the query.
    "highlight": {
        "type": "plain",
        "pre_tags": ["<span class=\"highlight\">"],
        "post_tags": ["</span>"],
        "fields": {
            "title": {"fragment_size": 500},
            "fulltext": {},
            "claim": {},
            "description": {},
            "abstract": {},
        },
        "number_of_fragments": 3,
    },
    # Ordering: by score, descending.
    "sort": [{"_score": {"order": "desc"}}],
    "sortField": "_score",
    "sortOrder": "DESC",
    # Export settings: CSV output, requested synchronously ("async": False).
    "format": "CSV",
    "fields": _LENS_EXPORT_COLUMNS,
    "filename": "lens-export",
    "async": False,
}

def _download_lens_export(search_term: str, flags: str) -> "pd.DataFrame | None":
    """
    POST BASE_PAYLOAD to the Lens patent export endpoint and parse the CSV reply.

    Args:
        search_term: Raw query text; it is percent-encoded before being placed
            in the ``q`` parameter.
        flags: The rest of the query string (everything after ``q=...&``) that
            selects the export mode.

    Returns:
        The parsed export as a DataFrame, or None when the request fails or
        the response body cannot be parsed as CSV.
    """
    encoded_term = quote(search_term)
    url = f"https://www.lens.org/lens/export/patent?q={encoded_term}&{flags}"

    try:
        response = requests.post(url, json=BASE_PAYLOAD, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # An empty or non-CSV body makes pandas raise. Report it the same way as a
    # transport failure so callers only have to handle the None case.
    try:
        return pd.read_csv(io.StringIO(response.text))
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"An error occurred: could not parse the export as CSV: {e}")
        return None


def export_expand_by_simple_family(search_term: str) -> "pd.DataFrame | None":
    """
    Fetches patent data expanded by simple family.
    The search_term is URL-encoded to handle spaces and special characters.

    Returns the export as a DataFrame, or None if the download or parse failed.
    """
    print(f"--- Fetching 'Expand by Simple Family' for: '{search_term}' ---")
    return _download_lens_export(search_term, "st=true&e=true&f=false&l=en")


def export_group_by_simple_family(search_term: str) -> "pd.DataFrame | None":
    """
    Fetches patent data grouped by simple family.
    The search_term is URL-encoded to handle spaces and special characters.

    Returns the export as a DataFrame, or None if the download or parse failed.
    """
    print(f"\n--- Fetching 'Group by Simple Family' for: '{search_term}' ---")
    return _download_lens_export(search_term, "st=true&e=false&f=true&l=en")


def export_expand_by_extended_family(search_term: str) -> "pd.DataFrame | None":
    """
    Fetches patent data expanded by extended family.
    The search_term is URL-encoded to handle spaces and special characters.

    Returns the export as a DataFrame, or None if the download or parse failed.
    """
    print(f"\n--- Fetching 'Expand by Extended Family' for: '{search_term}' ---")
    return _download_lens_export(
        search_term, "st=true&e=false&expandByExtendedFamily=true&f=false&l=en"
    )

# --- Example Usage ---
if __name__ == "__main__":
    search_query = "Cytonics CORP"

    def _report_and_save(frame, csv_name, description):
        """Print the frame's dimensions, write it to csv_name, then confirm."""
        row_count, column_count = frame.shape
        print(f"Result: {row_count} rows, {column_count} columns.")
        frame.to_csv(csv_name, index=False)
        print(f"✅ Successfully saved {description} to '{csv_name}'")

    # 1. "Expand by Simple Family": fetch, then save if the fetch succeeded
    df_expanded_simple = export_expand_by_simple_family(search_query)
    if df_expanded_simple is not None:
        expanded_filename = "lens-export-simple-expanded.csv"
        _report_and_save(df_expanded_simple, expanded_filename, "simple expanded data")

    # 2. "Group by Simple Family"
    df_grouped_simple = export_group_by_simple_family(search_query)
    if df_grouped_simple is not None:
        grouped_filename = "lens-export-simple-grouped.csv"
        _report_and_save(df_grouped_simple, grouped_filename, "simple grouped data")

    # 3. "Expand by Extended Family"
    df_expanded_extended = export_expand_by_extended_family(search_query)
    if df_expanded_extended is not None:
        extended_filename = "lens-export-extended-family-expanded.csv"
        _report_and_save(df_expanded_extended, extended_filename, "extended family data")

--- Fetching 'Expand by Simple Family' for: 'Cytonics CORP' ---
Result: 88 rows, 37 columns.
✅ Successfully saved simple expanded data to 'lens-export-simple-expanded.csv'

--- Fetching 'Group by Simple Family' for: 'Cytonics CORP' ---
Result: 10 rows, 37 columns.
✅ Successfully saved simple grouped data to 'lens-export-simple-grouped.csv'

--- Fetching 'Expand by Extended Family' for: 'Cytonics CORP' ---
Result: 90 rows, 37 columns.
✅ Successfully saved extended family data to 'lens-export-extended-family-expanded.csv'


In [2]:
import os
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from tqdm import tqdm

# --- Configuration ---
# "KEY" is only a placeholder. setdefault() is used so that a real key already
# exported in the environment is not overwritten by it.
# NOTE(review): avoid committing a real API key in this cell.
os.environ.setdefault("GOOGLE_API_KEY", "KEY")  # Replace with your actual key

# The original search query used to generate the CSV. This is crucial for the ranking step.
SEARCH_QUERY = "Cytonics CORP"

# How many of the most relevant patents you want in the final output.
K_MOST_RELEVANT = 20

# --- Step 1: Load the data ---
print("--- Step 1: Loading patent data from CSV ---")
try:
    df = pd.read_csv("lens-export-simple-grouped.csv")
except FileNotFoundError:
    print("Error: 'lens-export-simple-grouped.csv' not found. Please run the previous script first.")
    # exit() is an interactive-shell helper injected by the `site` module and
    # is not meant for programs; raising SystemExit stops the run explicitly.
    raise SystemExit(1) from None

# Clean column names and prepare data
df.columns = df.columns.str.strip()
df_source = df[["Lens ID", "Title", "Abstract"]].dropna().reset_index(drop=True)

# dropna() removes every row missing any of the three columns. Report how many
# were removed so the reduced count below is not a surprise.
skipped_rows = len(df) - len(df_source)
if skipped_rows:
    print(f"Note: skipped {skipped_rows} rows with a missing Lens ID, Title, or Abstract.")
print(f"Loaded {len(df_source)} patents to process.")

# --- Step 2: Single-Pass Filtering and Scoring ---
print(f"\n--- Step 2: Processing patents in batches to filter and score relevance to '{SEARCH_QUERY}' ---")

def format_for_prompt(df_batch):
    """Return the rows of df_batch as a list of {column name: value} dicts for the prompt."""
    row_dicts = df_batch.to_dict(orient="records")
    return row_dicts

# Template for the combined filter-and-score request. It has two placeholders:
# {search_query} and {examples}, both filled in when the chain is invoked.
_FILTER_AND_SCORE_TEMPLATE = """
You are a highly efficient patent analyst specializing in biotech and pharmaceuticals.
Your task is to analyze a list of patents based on a given SEARCH QUERY.

Follow these two steps for each patent in the list:
1.  **Filter**: First, determine if the patent is relevant to the medical, pharmaceutical, or biotech fields. If it is NOT, ignore it completely.
2.  **Score**: If the patent IS biomedical, assign a relevance score from 1 (low relevance) to 10 (high relevance) based on how closely its title and abstract match the SEARCH QUERY and for humans.

Return a JSON list of objects. Each object must contain the 'lens_id' and 'relevance_score'.
Only include patents in your output that you identified as biomedical.

SEARCH QUERY: "{search_query}"

PATENT DATA:
{examples}
"""

# LangChain pieces: the JSON parser, the Gemini chat model (temperature 0),
# and the prompt, piped together as prompt -> model -> parser.
parser = JsonOutputParser()
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)
prompt = ChatPromptTemplate.from_template(_FILTER_AND_SCORE_TEMPLATE)
chain = prompt | llm | parser

# Run the single-pass process
batch_size = 40  # You can adjust this based on performance
all_scored_patents = []

for i in tqdm(range(0, len(df_source), batch_size), desc="Processing Batches (Filter & Score)"):
    batch_df = df_source.iloc[i:i+batch_size]
    examples = format_for_prompt(batch_df)

    try:
        # Expected shape: [{'lens_id': '...', 'relevance_score': 8}, ...]
        scored_batch = chain.invoke({"search_query": SEARCH_QUERY, "examples": examples})
    except Exception as e:
        # Best effort: one failed batch must not abort the whole run.
        print(f"\nWarning: Error processing a batch, skipping. Details: {e}")
        continue

    # The model's reply is not guaranteed to have the requested shape. Extending
    # the list with a dict would add its keys instead of records, so anything
    # that is not a list is rejected.
    if not isinstance(scored_batch, list):
        if scored_batch:
            print("\nWarning: Batch result was not a JSON list, skipping.")
        continue

    # Keep only items that carry both required keys.
    all_scored_patents.extend(
        item for item in scored_batch
        if isinstance(item, dict) and "lens_id" in item and "relevance_score" in item
    )

print(f"\nProcessing complete. Found and scored {len(all_scored_patents)} biomedical patents.")

# --- Step 3: Rank Locally and Save the Top K Results ---
if all_scored_patents:
    print(f"\n--- Step 3: Sorting results and selecting the top {K_MOST_RELEVANT} patents ---")

    # Convert the list of scored patents into a DataFrame and normalize it:
    # IDs become stripped strings, scores become numbers (unparseable -> NaN).
    scored_df = pd.DataFrame(all_scored_patents)
    scored_df = scored_df.assign(
        lens_id=scored_df["lens_id"].astype(str).str.strip(),
        relevance_score=pd.to_numeric(scored_df["relevance_score"], errors="coerce"),
    )
    scored_df = scored_df.dropna(subset=["relevance_score"])

    # Drop IDs that are not in the input so they cannot take a top-K slot.
    scored_df = scored_df[scored_df["lens_id"].isin(df_source["Lens ID"])]

    # Sort best-first, then keep one row per patent (its highest score).
    # Duplicate IDs would otherwise multiply rows in the merge below.
    ranked_df = (
        scored_df.sort_values(by="relevance_score", ascending=False, kind="stable")
        .drop_duplicates(subset="lens_id", keep="first")
    )

    # Take the top K and attach title/abstract. An inner merge keeps the order
    # of the left (ranked) keys, so the output is already in rank order.
    top_k = ranked_df.head(K_MOST_RELEVANT)[["lens_id", "relevance_score"]]
    final_df = top_k.merge(df_source, left_on="lens_id", right_on="Lens ID", how="inner")
    final_df = final_df[["Lens ID", "Title", "Abstract", "relevance_score"]].reset_index(drop=True)

    # Save the result
    output_filename = "filtered_relevant_patents.csv"
    final_df.to_csv(output_filename, index=False)

    print("\n--- Final Results ---")
    print(final_df)
    print(f"\n✅ Success! Top {K_MOST_RELEVANT} ranked patents saved to '{output_filename}'")
else:
    print("\nNo biomedical patents were found during processing.")

  from .autonotebook import tqdm as notebook_tqdm


--- Step 1: Loading patent data from CSV ---
Loaded 7 patents to process.

--- Step 2: Processing patents in batches to filter and score relevance to 'Cytonics CORP' ---


Processing Batches (Filter & Score): 100%|██████████| 1/1 [00:03<00:00,  3.42s/it]



Processing complete. Found and scored 5 biomedical patents.

--- Step 3: Sorting results and selecting the top 20 patents ---

--- Final Results ---
               Lens ID                                              Title  \
0  014-920-934-512-545  SYSTEMS, COMPOSITIONS, AND METHODS FOR TRANSPL...   
1  103-948-931-946-410  Therapeutic variant alpha-2-macroglobulin comp...   
2  176-912-377-055-867  Systems, compositions, and methods for transpl...   
3  086-963-205-553-664  Method for diagnosing and treating acute joint...   
4  103-256-101-079-605  METHODS FOR DIAGNOSING AND TREATING PAIN IN TH...   

                                            Abstract  relevance_score  
0  Systems and methods for purification and conce...                6  
1  A2M polypeptide compositions containing a non-...                6  
2  Systems and methods for purification and conce...                6  
3  The present invention provides methods, reagen...                6  
4  The present invention pr

In [3]:
import pandas as pd
import requests
import io
from tqdm import tqdm # Optional: for a nice progress bar
import time

# ==============================================================================
#  API FUNCTION
# ==============================================================================

import time

def fetch_lens_minimal_data(lens_id: str) -> dict:
    """
    Fetches key patent data from Lens.org for a given Lens ID,
    with a retry mechanism for handling 429 rate-limiting errors.

    Args:
        lens_id: Lens ID to look up; it is percent-encoded into the query.

    Returns:
        A dict with the keys "Jurisdiction", "Application Number",
        "Document Type" and "Legal Status". On success the values come from
        the first CSV row ("N/A" where a column is absent). Otherwise all four
        values carry the same status text: "Not Found" (no data row),
        "Error (<status>)" (HTTP error, including 429 after the last retry),
        or "Error (API)" (network failure or unparseable response body).
    """
    # Local import: this cell's own import list does not include it.
    from urllib.parse import quote

    fields = ("Jurisdiction", "Application Number", "Document Type", "Legal Status")

    def uniform(value: str) -> dict:
        """Build a result in which every field carries the same status text."""
        return dict.fromkeys(fields, value)

    encoded_id = quote(str(lens_id), safe="")
    url = f"https://www.lens.org/lens/export/patent?q=lens_id%3A%22{encoded_id}%22&st=true"

    # --- Retry Logic ---
    max_retries = 4
    base_delay_seconds = 4  # Waits are 4, 16, 64 seconds (base ** attempt number)

    for attempt in range(max_retries):
        try:
            response = requests.post(url, timeout=20)
            # Turns 4xx/5xx responses (including 429) into HTTPError
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else "unknown"
            if status != 429:
                # Other HTTP errors (e.g., 404, 500) are not retried
                print(f"❌ Unhandled HTTP Error for {lens_id}: {e}")
                return uniform(f"Error ({status})")
            if attempt >= max_retries - 1:
                print(f"❌ Max retries reached for {lens_id}. Giving up.")
                return uniform("Error (429)")
            # Deterministic exponential backoff (no jitter is applied)
            wait_time = base_delay_seconds ** (attempt + 1)
            print(f"⏳ Rate limit hit for {lens_id}. Retrying in {wait_time} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(wait_time)
            continue
        except requests.exceptions.RequestException as e:
            # Network errors (e.g., timeout, connection error) are not retried
            print(f"❌ Network Error fetching data for {lens_id}: {e}")
            return uniform("Error (API)")

        # --- Success Case ---
        body = response.text
        # A header line alone (or nothing at all) means there is no data row.
        if not body or len(body.splitlines()) <= 1:
            return uniform("Not Found")

        try:
            df = pd.read_csv(io.StringIO(body))
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
            # A malformed body must not crash the caller's loop.
            print(f"❌ Could not parse the export for {lens_id}: {e}")
            return uniform("Error (API)")

        if df.empty:
            return uniform("Not Found")

        # Strip header whitespace, as the rest of this file does for Lens
        # exports, so the column lookups below are not defeated by padding.
        df.columns = df.columns.str.strip()
        row = df.iloc[0]
        return {field: row.get(field, "N/A") for field in fields}

    # Safety net: every path above returns, so this is not normally reached.
    return uniform("Error (API)")

# ==============================================================================
#  MAIN SCRIPT LOGIC
# ==============================================================================

print("--- Step 1: Loading local CSV files ---")
group_df = pd.read_csv("lens-export-simple-grouped.csv")
extend_df = pd.read_csv("lens-export-simple-expanded.csv")
relevant_df = pd.read_csv("filtered_relevant_patents.csv")

# Clean column names by removing leading/trailing whitespace
# (all three frames, so the "Lens ID" lookups below behave the same on each)
group_df.columns = group_df.columns.str.strip()
extend_df.columns = extend_df.columns.str.strip()
relevant_df.columns = relevant_df.columns.str.strip()
print("CSVs loaded and columns cleaned.")

print("\n--- Step 2: Preparing relevant Lens IDs ---")
relevant_lens_ids = set(relevant_df["Lens ID"])
print(f"Found {len(relevant_lens_ids)} unique relevant Lens IDs.")

print("\n--- Step 3: Filtering group data for relevant patents ---")
relevant_rows = group_df[group_df["Lens ID"].isin(relevant_lens_ids)].copy()
print(f"Found {len(relevant_rows)} matching patents in the 'group-by' data.")

print("\n--- Step 4: Processing patents and their family members ---")
records = []

# The four attributes recorded for every family member.
MEMBER_FIELDS = ("Jurisdiction", "Application Number", "Document Type", "Legal Status")

# Build the Lens ID -> row lookup once instead of scanning extend_df for every
# member. keep="first" selects the same row that a filter + .iloc[0] would.
local_members = (
    extend_df.drop_duplicates(subset="Lens ID", keep="first")
    .set_index("Lens ID")
    .to_dict(orient="index")
)

# Using tqdm for a progress bar
for _, row in tqdm(relevant_rows.iterrows(), total=relevant_rows.shape[0], desc="Processing Patents"):
    lens_id = row["Lens ID"]
    jurisdiction = row.get("Jurisdiction", "N/A")
    app_number = row.get("Application Number", "N/A")
    doc_type = row.get("Document Type", "N/A")
    legal_status = row.get("Legal Status", "N/A")
    family_members_raw = row.get("Simple Family Members", "")

    # Family members are ;; separated
    family_members = str(family_members_raw).split(";;") if pd.notna(family_members_raw) else []

    for member_id in family_members:
        member_id = member_id.strip()
        if not member_id:
            continue

        if lens_id == member_id:
            # The parent patent itself is listed in its family members; mark
            # that self-row with the "no family" placeholder.
            member_data = dict.fromkeys(MEMBER_FIELDS, "no family")
        elif member_id in local_members:
            # Found in the local expanded export
            local_row = local_members[member_id]
            member_data = {field: local_row.get(field, "N/A") for field in MEMBER_FIELDS}
        else:
            # Not found locally — fetch via API
            print(f"🔎 Fetching from Lens API: {member_id}")
            member_data = fetch_lens_minimal_data(member_id)

        # Save the relationship record
        records.append({
            "Lens ID": lens_id,
            "Jurisdiction": jurisdiction,
            "Document Type": doc_type,
            "Legal Status": legal_status,
            "Application Number": app_number,
            "Family Member Lens ID": member_id,
            "Member Jurisdiction": member_data["Jurisdiction"],
            "Member Application Number": member_data["Application Number"],
            "Member Document Type": member_data["Document Type"],
            "Member Legal Status": member_data["Legal Status"]
        })

print("\n--- Step 5: Saving results to CSV ---")
output_df = pd.DataFrame(records)
output_df.to_csv("simple_patent_family.csv", index=False)
print("✅ Saved: simple_patent_family.csv")
print(f"Total family member relationships traced: {len(output_df)}")

--- Step 1: Loading local CSV files ---
CSVs loaded and columns cleaned.

--- Step 2: Preparing relevant Lens IDs ---
Found 5 unique relevant Lens IDs.

--- Step 3: Filtering group data for relevant patents ---
Found 5 matching patents in the 'group-by' data.

--- Step 4: Processing patents and their family members ---


Processing Patents: 100%|██████████| 5/5 [00:00<00:00, 185.51it/s]


--- Step 5: Saving results to CSV ---
✅ Saved: simple_patent_family.csv
Total family member relationships traced: 51





In [4]:
import pandas as pd

# ---------- config ----------
INFILE = "simple_patent_family.csv"
OUTFILE = "lens_patent_data_simple_family.csv"
SEP = "||"   # change to "|" or "||" as you want
# ----------------------------

# read (keep empty cells as empty strings, and every value as text)
df = pd.read_csv(INFILE, sep=",", dtype=str, encoding="utf-8-sig", keep_default_na=False)
df.columns = df.columns.str.strip()

# normalize 'no family' (case-insensitive via the (?i) flag, trim spaces)
df = df.replace(to_replace=r'(?i)^\s*no family\s*$', value='', regex=True)

# required columns
required = [
    "Lens ID", "Jurisdiction", "Document Type", "Legal Status", "Application Number",
    "Family Member Lens ID", "Member Jurisdiction", "Member Application Number",
    "Member Document Type", "Member Legal Status"
]
missing = [c for c in required if c not in df.columns]
if missing:
    raise KeyError(f"Missing columns in input CSV: {missing}\nAvailable columns: {df.columns.tolist()}")

pat_keys = ["Lens ID", "Jurisdiction", "Document Type", "Legal Status", "Application Number"]

# Output column -> input column it is aggregated from
member_columns = {
    "Family Member Lens IDs": "Family Member Lens ID",
    "Family Member Jurisdictions": "Member Jurisdiction",
    "Family Member Application Numbers": "Member Application Number",
    "Family Member Document Types": "Member Document Type",
    "Family Member Legal Statuses": "Member Legal Status",
}

# Get all unique patents (so even patents with only self-rows will be included)
unique_pats = df[pat_keys].drop_duplicates().reset_index(drop=True)

# Real family-member rows, selected once for the whole frame. Skipped:
#  - rows where Family Member Lens ID is empty
#  - rows where Family Member Lens ID equals the parent Lens ID (self-reference)
is_member = (
    (df["Family Member Lens ID"].str.strip() != "") &
    (df["Family Member Lens ID"] != df["Lens ID"])
)
# groupby keeps the original row order within each patent; sort=False also
# keeps the patents in first-seen order.
members_by_patent = {
    lens: group for lens, group in df.loc[is_member].groupby("Lens ID", sort=False)
}

rows = []
for _, pat in unique_pats.iterrows():
    fam = members_by_patent.get(pat["Lens ID"])

    row = pat.to_dict()
    for out_col, src_col in member_columns.items():
        # A patent with no real members gets empty strings in every column.
        row[out_col] = SEP.join(fam[src_col].astype(str)) if fam is not None else ""
    rows.append(row)

final_df = pd.DataFrame(rows, columns=pat_keys + list(member_columns))

final_df.to_csv(OUTFILE, index=False)
print(f"✅ Saved grouped file to: {OUTFILE}")
print(f"Total patents exported: {len(final_df)}")

# --- Optional alignment check (prints rows where token counts differ) ---
# The five aggregated family columns all share the "Family Member " prefix.
fam_cols = [
    f"Family Member {suffix}"
    for suffix in (
        "Lens IDs",
        "Jurisdictions",
        "Application Numbers",
        "Document Types",
        "Legal Statuses",
    )
]

def token_count(cell):
    """Return how many SEP-separated tokens the cell holds; 0 if it is blank."""
    text = str(cell)
    # Empty or whitespace-only cells hold no tokens.
    return len(text.split(SEP)) if text.strip() else 0

# Collect (row index, per-column token counts) for every misaligned row.
bad_rows = []
for idx, record in final_df.iterrows():
    counts = [token_count(record[col]) for col in fam_cols]
    # Empty columns (count 0) are ignored. A row is misaligned when the
    # remaining columns do not all hold the same number of tokens.
    distinct_nonzero = {count for count in counts if count != 0}
    if len(distinct_nonzero) > 1:
        bad_rows.append((idx, counts))

if not bad_rows:
    print("✅ All family columns aligned (token counts match per row).")
else:
    print("\n⚠️ Alignment warnings (rows where family-column token counts disagree):")
    # Show at most the first 20 offenders, then summarize the rest.
    for idx, counts in bad_rows[:20]:
        print(f" row {idx}: counts = {counts}  (Lens ID = {final_df.at[idx, 'Lens ID']})")
    remaining = len(bad_rows) - 20
    if remaining > 0:
        print(f"...and {remaining} more")


✅ Saved grouped file to: lens_patent_data_simple_family.csv
Total patents exported: 5
✅ All family columns aligned (token counts match per row).


In [7]:
import pandas as pd
from pyvis.network import Network

# --- Load Data ---
# dtype=str keeps every value as text. Without it an all-digit application
# number column is parsed into NumPy integers, which are then used as node ids
# and labels.
df = pd.read_csv("lens_patent_data_simple_family.csv", dtype=str)

# Replace NaN with default text to prevent render issues
df = df.fillna("Unknown")

# Separator between family-member tokens; must match the one used when the
# CSV was written.
MEMBER_SEP = "||"

# Get first 20 unique application numbers (no NaN left after the fill above)
unique_apps = df["Application Number"].unique()[:20]

# Create the network
net = Network(height="800px", width="100%", bgcolor="#ffffff", directed=True, notebook=False)

# Add root node
net.add_node("patent", label="Patent Root", shape='box', color='red')

# Add first-layer nodes and family member nodes
for app_no in unique_apps:
    # If several rows share an application number, only the first one is used.
    meta = df[df["Application Number"] == app_no].iloc[0]

    # Add main application node
    hover_text = f"Jurisdiction: {meta['Jurisdiction']}<br>Type: {meta['Document Type']}<br>Status: {meta['Legal Status']}"
    net.add_node(app_no, label=app_no, title=hover_text, shape='ellipse', color='orange')
    net.add_edge("patent", app_no)

    # Family member data for this application
    member_apps = meta["Family Member Application Numbers"]
    if member_apps in ("Unknown", "no family"):
        continue  # this patent has no recorded family members

    # zip() stops at the shortest list, so a malformed row with unequal token
    # counts is truncated rather than raising an IndexError.
    members = zip(
        member_apps.split(MEMBER_SEP),
        meta["Family Member Jurisdictions"].split(MEMBER_SEP),
        meta["Family Member Document Types"].split(MEMBER_SEP),
        meta["Family Member Legal Statuses"].split(MEMBER_SEP),
    )

    for member_app, member_jurisdiction, member_doc, member_status in members:
        # Skip blank tokens (they would all collapse into one empty-id node)
        # and the main application itself if it's in the family list.
        if not member_app.strip() or member_app == app_no:
            continue

        hover_text = f"Jurisdiction: {member_jurisdiction}<br>Type: {member_doc}<br>Status: {member_status}"
        # Check if node already exists before adding
        if member_app not in net.get_nodes():
            net.add_node(member_app, label=member_app, title=hover_text, shape='ellipse', color='lightblue')
        net.add_edge(app_no, member_app)

# Write HTML manually
net.write_html("patent_family_tree_Cytonics_CORP.html")

print("✅ Tree saved as: patent_family_tree_Cytonics_CORP.html")

✅ Tree saved as: patent_family_tree_Cytonics_CORP.html
