In [1]:
# ==============================================
# FINAL COMPLETE EPO PATENT SEARCH SYSTEM
# Complete solution with all functions integrated
# ==============================================

import requests
from requests.auth import HTTPBasicAuth
import xml.etree.ElementTree as ET
import pandas as pd
import time
from datetime import datetime
import csv
# from collections import Counter

class EPOPatentSearchSystem:
    """
    Complete EPO Patent Search System with INPADOC family data, abstracts, and claims.
    Features:
    - Comprehensive patent search
    - Title, inventors, applicants extraction
    - Abstract and claims retrieval
    - INPADOC extended family data
    - || separator formatting
    - Jurisdiction mapping and family member lists
    """
    
    def __init__(self, consumer_key: str, consumer_secret: str):
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = None
        self.namespaces = {
            'ops': 'http://ops.epo.org',
            'ep': 'http://www.epo.org/exchange'
        }
    
    def get_access_token(self):
        """Request a new EPO OPS access token and cache it on the instance.

        Posts a client-credentials grant to the OPS token endpoint using HTTP
        Basic auth built from the consumer key and secret. On success the
        token is stored in ``self.access_token``.

        Returns:
            The access token string, or None when the request fails (the
            error is printed and any previously stored token is left as is).
        """
        url = "https://ops.epo.org/3.2/auth/accesstoken"
        data = {"grant_type": "client_credentials"}
        try:
            response = requests.post(
                url,
                data=data,
                auth=HTTPBasicAuth(self.consumer_key, self.consumer_secret),
                # Same timeout as every other OPS call in this class; without
                # it a stalled connection would block forever.
                timeout=30,
            )
            response.raise_for_status()
            self.access_token = response.json()["access_token"]
            return self.access_token
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            # RequestException: transport/HTTP errors; ValueError: body is not
            # JSON; KeyError/TypeError: JSON without an "access_token" entry.
            print(f"❌ Token error: {e}")
            return None
    
    def _ensure_token(self):
        """Ensure we have a valid access token."""
        if not self.access_token:
            return self.get_access_token()
        return self.access_token
    
    def _convert_to_docdb_format(self, publication_number: str):
        """Convert publication number to DOCDB format."""
        if publication_number.startswith(('US', 'EP', 'WO', 'JP', 'CN')) and len(publication_number) > 2:
            c = publication_number[:2]
            rest = publication_number[2:]
            for i, char in enumerate(rest):
                if char.isalpha():
                    n = rest[:i]
                    k = rest[i:]
                    break
            else:
                n = rest
                k = ""
            return f"{c}.{n}.{k}" if k else f"{c}.{n}"
        return publication_number
    
    def search_patents(self, search_query: str, max_results: int = 25):
        """
        Search for patents using EPO OPS API.
        
        Args:
            search_query: EPO search query string
            max_results: Maximum number of results to return
            
        Returns:
            List of patent dictionaries with basic information
        """
        if not self._ensure_token():
            return []
        
        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        url = f"https://ops.epo.org/3.2/rest-services/published-data/search"
        
        params = {
            "q": search_query,
            "Range": f"1-{max_results}"
        }
        
        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)
            if response.ok:
                root = ET.fromstring(response.content)
                
                patent_refs = root.findall(".//ops:publication-reference", self.namespaces)
                patents_data = []
                
                print(f"🔍 Found {len(patent_refs)} patents. Extracting basic data...")
                
                for i, patent_ref in enumerate(patent_refs, 1):
                    doc_id = patent_ref.find(".//ep:document-id[@document-id-type='docdb']", self.namespaces)
                    if doc_id is not None:
                        country = doc_id.find("ep:country", self.namespaces)
                        number = doc_id.find("ep:doc-number", self.namespaces)
                        kind = doc_id.find("ep:kind", self.namespaces)
                        date = doc_id.find("ep:date", self.namespaces)
                        
                        if all(x is not None and x.text for x in [country, number, kind]):
                            publication_num = f"{country.text}{number.text}{kind.text}"
                            pub_date = date.text if date is not None and date.text else ""
                            
                            # Get title if available
                            title_elem = patent_ref.find(".//ep:invention-title[@lang='en']", self.namespaces)
                            title = title_elem.text if title_elem is not None and title_elem.text else ""
                            
                            patent_data = {
                                'publication_number': publication_num,
                                'country': country.text,
                                'doc_number': number.text,
                                'kind': kind.text,
                                'publication_date': pub_date,
                                'title': title,
                                'search_index': i
                            }
                            
                            patents_data.append(patent_data)
                            
                            if i % 5 == 0:
                                print(f"  Processed {i}/{len(patent_refs)} patents...")
                
                print(f"✅ Extracted basic data for {len(patents_data)} patents")
                return patents_data
                
            else:
                print(f"❌ Search failed: {response.status_code}")
                return []
                
        except Exception as e:
            print(f"❌ Search error: {e}")
            return []
    
    def get_patent_details(self, publication_number: str, retries: int = 3):
        """
        Get detailed patent information including title, inventors, applicants, abstract, and claims.

        Three OPS endpoints are queried for the publication (biblio, abstract,
        claims). Each one is best-effort: a failure leaves the corresponding
        fields at their empty defaults and is reported on stdout.

        Args:
            publication_number: Patent publication number
            retries: Number of attempts per endpoint

        Returns:
            Dictionary with keys title, inventors, applicants,
            inventors_count, applicants_count, abstract, claims. Multi-value
            fields are joined with ' || '; claims are limited to the first 5.
            An empty dict is returned when no access token is available.
        """
        if not self._ensure_token():
            return {}

        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        docdb_format = self._convert_to_docdb_format(publication_number)
        base_url = f"https://ops.epo.org/3.2/rest-services/published-data/publication/docdb/{docdb_format}"

        # Start from the defaults so every key exists whatever fails below.
        result = {
            'title': "",
            'inventors': "",
            'applicants': "",
            'inventors_count': 0,
            'applicants_count': 0,
            'abstract': "",
            'claims': "",
        }

        biblio_root = self._fetch_ops_xml(
            f"{base_url}/biblio", headers, "Biblio", publication_number, retries)
        if biblio_root is not None:
            result.update(self._parse_biblio(biblio_root))

        # A 404 is normal for publications without an abstract / claims text.
        abstract_root = self._fetch_ops_xml(
            f"{base_url}/abstract", headers, "Abstract", publication_number, retries, missing_ok=True)
        if abstract_root is not None:
            result['abstract'] = self._parse_abstract(abstract_root)

        claims_root = self._fetch_ops_xml(
            f"{base_url}/claims", headers, "Claims", publication_number, retries, missing_ok=True)
        if claims_root is not None:
            result['claims'] = self._parse_claims(claims_root)

        return result

    def _fetch_ops_xml(self, url, headers, label, publication_number, retries, missing_ok=False):
        """GET ``url`` and return the parsed XML root, or None after ``retries`` failed attempts."""
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                if response.ok:
                    return ET.fromstring(response.content)
            except (requests.RequestException, ET.ParseError) as e:
                print(f"⚠️ {label} error for {publication_number} (attempt {attempt + 1}): {e}")
                time.sleep(1)
                continue

            if missing_ok and response.status_code == 404:
                # Nothing to retry: the document simply has no such section.
                return None
            print(f"⚠️ {label} failed for {publication_number}: {response.status_code}")
            time.sleep(1)
        return None

    def _parse_biblio(self, root):
        """Extract the English title plus inventor/applicant names from a biblio response."""
        title_elem = root.find(".//ep:invention-title[@lang='en']", self.namespaces)
        inventors = self._party_names(root, "inventor")
        applicants = self._party_names(root, "applicant")
        return {
            # Guard against an element without text so the title is never None.
            'title': title_elem.text if title_elem is not None and title_elem.text else "",
            'inventors': ' || '.join(inventors),
            'applicants': ' || '.join(applicants),
            'inventors_count': len(inventors),
            'applicants_count': len(applicants),
        }

    def _party_names(self, root, tag):
        """Return the names of all ``ep:<tag>`` parties, counting each party once."""
        elems = root.findall(f".//ep:{tag}", self.namespaces)

        # OPS biblio data appears to list every party once per data-format
        # (e.g. "epodoc" and "original"), which would double both the names
        # and the counts. Keep only the first format seen; when the attribute
        # is absent all elements are kept. NOTE(review): confirm against a
        # live response.
        formats = [e.get('data-format') for e in elems if e.get('data-format')]
        if formats:
            elems = [e for e in elems if e.get('data-format') == formats[0]]

        names = []
        for elem in elems:
            name_elem = elem.find(".//ep:name", self.namespaces)
            if name_elem is not None and name_elem.text and name_elem.text.strip():
                names.append(name_elem.text.strip())
        return names

    def _parse_abstract(self, root):
        """Return the English abstract text of an abstract response, or ''."""
        abstract_elem = root.find(".//ep:abstract[@lang='en']", self.namespaces)
        if abstract_elem is None:
            return ""

        # itertext() also picks up text nested inside inline markup, which
        # plain ``.text`` would silently drop; split/join collapses whitespace.
        paragraphs = [
            " ".join("".join(p.itertext()).split())
            for p in abstract_elem.findall(".//ep:p", self.namespaces)
        ]
        paragraphs = [text for text in paragraphs if text]
        if paragraphs:
            return ' '.join(paragraphs)
        return (abstract_elem.text or "").strip()

    def _parse_claims(self, root, limit=5):
        """Return up to ``limit`` English claims of a claims response joined with ' || '."""
        def local(elem):
            # Tag name without its "{namespace}" prefix.
            return elem.tag.rsplit('}', 1)[-1]

        def is_english(elem):
            return (elem.get('lang') or "").lower() == 'en'

        def flatten(elem):
            return " ".join("".join(elem.itertext()).split())

        # Elements are matched by local name and case-insensitive language
        # because OPS appears to deliver claims in its full-text namespace
        # with lang="EN", which an exchange-namespace / lang='en' lookup never
        # finds. NOTE(review): confirm against a live response.
        claims = []
        for container in root.iter():
            if local(container) == 'claims' and is_english(container):
                claims.extend(c for c in container.iter() if local(c) == 'claim')
        if not claims:
            # Alternative layout: the language is set on each claim itself.
            claims = [c for c in root.iter() if local(c) == 'claim' and is_english(c)]

        entries = []
        for claim in claims:
            texts = [flatten(ct) for ct in claim if local(ct) == 'claim-text']
            texts = [text for text in texts if text]
            num = claim.get('num', '')
            if num:
                if texts:
                    entries.append(f"Claim {num}: {' '.join(texts)}")
            else:
                # Unnumbered wrapper: treat every claim-text as its own claim.
                entries.extend(texts)

        return ' || '.join(entries[:limit])
    
    def get_inpadoc_family_data(self, publication_number: str, retries: int = 3):
        """
        Get INPADOC extended family data with enhanced formatting.

        Queries the OPS family endpoint for the publication and collects the
        docdb publication number and country of every family member.

        Args:
            publication_number: Patent publication number
            retries: Number of retry attempts

        Returns:
            Dictionary with INPADOC family information:
            inpadoc_family_size (members found), inpadoc_jurisdictions_count
            (distinct countries), inpadoc_jurisdiction_list (one country code
            per member, ' || ' separated) and inpadoc_family_members
            (publication numbers, ' || ' separated). An empty dict is
            returned when no token is available or every attempt fails.
        """
        if not self._ensure_token():
            return {}

        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        docdb_format = self._convert_to_docdb_format(publication_number)

        url = f"https://ops.epo.org/3.2/rest-services/family/publication/docdb/{docdb_format}"

        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                if not response.ok:
                    print(f"⚠️ Family failed for {publication_number}: {response.status_code}")
                    time.sleep(1)
                    continue
                root = ET.fromstring(response.content)
            except (requests.RequestException, ET.ParseError) as e:
                print(f"⚠️ Family error for {publication_number} (attempt {attempt + 1}): {e}")
                time.sleep(1)
                continue

            jurisdiction_list = []
            member_publications = []

            for member in root.findall(".//ops:family-member", self.namespaces):
                # publication-reference and its document-id children live in
                # the ep (exchange) namespace, unlike family-member itself.
                pub_ref = member.find(".//ep:publication-reference", self.namespaces)
                if pub_ref is None:
                    continue
                doc_id = pub_ref.find(".//ep:document-id[@document-id-type='docdb']", self.namespaces)
                if doc_id is None:
                    continue

                country = doc_id.find("ep:country", self.namespaces)
                number = doc_id.find("ep:doc-number", self.namespaces)
                kind = doc_id.find("ep:kind", self.namespaces)
                if not all(x is not None and x.text for x in [country, number, kind]):
                    continue

                # One jurisdiction entry per member, so repeated countries
                # stay visible in the joined list.
                jurisdiction_list.append(country.text)
                member_publications.append(f"{country.text}{number.text}{kind.text}")

            return {
                'inpadoc_family_size': len(member_publications),
                'inpadoc_jurisdictions_count': len(set(jurisdiction_list)),
                'inpadoc_jurisdiction_list': ' || '.join(jurisdiction_list),
                'inpadoc_family_members': ' || '.join(member_publications)
            }

        return {}
    
    def create_comprehensive_dataset(self, search_query: str, max_results: int = 25, output_filename: str = None):
        """
        Create comprehensive patent dataset with all features.
        
        Args:
            search_query: EPO search query string
            max_results: Maximum number of patents to process
            output_filename: Output CSV filename (auto-generated if None)
            
        Returns:
            pandas.DataFrame with comprehensive patent data
        """
        if output_filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_filename = f"comprehensive_patent_data_{timestamp}.csv"
        
        print(f"🚀 COMPREHENSIVE EPO PATENT DATA EXTRACTION")
        print(f"✨ Features: Titles, Inventors, Applicants, Abstracts, Claims, INPADOC Family")
        print(f"🔗 Format: || separators for all multi-value fields")
        print(f"Query: {search_query}")
        print(f"Max results: {max_results}")
        print("=" * 80)
        
        # Step 1: Basic search
        print("📝 Step 1: Basic patent search...")
        basic_patents = self.search_patents(search_query, max_results)
        
        if not basic_patents:
            print("❌ No patents found in basic search")
            return None
        
        # Step 2: Get comprehensive data for each patent
        print(f"\\n📊 Step 2: Getting comprehensive data for {len(basic_patents)} patents...")
        comprehensive_data = []
        
        for i, patent in enumerate(basic_patents, 1):
            pub_num = patent['publication_number']
            print(f"  Processing {i}/{len(basic_patents)}: {pub_num}")
            
            # Start with basic data
            patent_data = patent.copy()
            
            # Get detailed data (title, inventors, applicants, abstract, claims)
            detailed_data = self.get_patent_details(pub_num)
            patent_data.update(detailed_data)
            
            # Get INPADOC family data
            family_data = self.get_inpadoc_family_data(pub_num)
            patent_data.update(family_data)
            
            # Show progress
            abstract_status = "✓" if detailed_data.get('abstract', '') else "✗"
            claims_status = "✓" if detailed_data.get('claims', '') else "✗"
            family_size = family_data.get('inpadoc_family_size', 0)
            
            print(f"    📄 Abstract: {abstract_status} | 📋 Claims: {claims_status} | 👥 Family: {family_size} members")
            
            comprehensive_data.append(patent_data)
            
            # Rate limiting
            time.sleep(0.8)
            
            if i % 3 == 0:
                print(f"    ✅ Completed {i}/{len(basic_patents)} patents")
        
        # Step 3: Create DataFrame and export
        print(f"\\n💾 Step 3: Creating comprehensive CSV export...")
        df = pd.DataFrame(comprehensive_data)
        
        # Define complete column structure
        expected_columns = [
            'publication_number', 'country', 'doc_number', 'kind', 'publication_date',
            'title', 'abstract', 'claims',
            'inventors', 'applicants', 'inventors_count', 'applicants_count',
            'inpadoc_family_size', 'inpadoc_jurisdictions_count', 
            'inpadoc_jurisdiction_list', 'inpadoc_family_members', 'search_index'
        ]
        
        for col in expected_columns:
            if col not in df.columns:
                df[col] = ''
        
        # Reorder columns
        df = df[expected_columns]
        
        # Export to CSV
        try:
            df.to_csv(output_filename, index=False, encoding='utf-8-sig')
            print(f"✅ Successfully exported {len(df)} patents to {output_filename}")
            
            
            # INPADOC family statistics
            patents_with_family = len(df[df['inpadoc_family_size'] > 0])
            print(f"Patents with INPADOC family data: {patents_with_family}")
            
            if len(df) > 0:
                avg_abstract_length = df[df['abstract'] != '']['abstract'].str.len().mean() if len(df[df['abstract'] != '']) > 0 else 0
                avg_family_size = df['inpadoc_family_size'].mean()
                
                print(f"\\n📊 STATISTICS:")
                print(f"Average abstract length: {avg_abstract_length:.0f} characters")
                print(f"Average family size: {avg_family_size:.1f} members")
                
                # Show sample format
                print(f"\\n🎯 SAMPLE || SEPARATOR FORMAT:")
                if len(df) > 0:
                    sample = df.iloc[0]
                    print(f"Patent: {sample['publication_number']}")
                    
                    # Show sample data
                    inventors_sample = str(sample['inventors'])[:100] + "..." if len(str(sample['inventors'])) > 100 else str(sample['inventors'])
                    jurisdictions_sample = str(sample['inpadoc_jurisdiction_list'])[:100] + "..." if len(str(sample['inpadoc_jurisdiction_list'])) > 100 else str(sample['inpadoc_jurisdiction_list'])
                    
                    print(f"Inventors: {inventors_sample}")
                    print(f"Jurisdictions: {jurisdictions_sample}")
            
            return df
            
        except Exception as e:
            print(f"❌ Export error: {e}")
            return df

# ==============================================
# MAIN EXECUTION
# ==============================================

if __name__ == "__main__":
    # Local imports: only this script section needs them.
    import os
    import re

    # EPO OPS API credentials. The environment variables take precedence so
    # real secrets do not have to be written into the source; the literals
    # remain as the fallback placeholders.
    CONSUMER_KEY = os.environ.get("EPO_OPS_CONSUMER_KEY", "KEY")
    CONSUMER_SECRET = os.environ.get("EPO_OPS_CONSUMER_SECRET", "SECRET")

    # Initialize the EPO Patent Search System
    epo_system = EPOPatentSearchSystem(CONSUMER_KEY, CONSUMER_SECRET)

    # Configuration
    search_query = "Alpha-2-macroglobulin"  # Change this to your desired search
    max_results = 25

    # Queries may contain spaces, quotes, slashes or operators that are not
    # valid in file names, so reduce the query to a safe slug first.
    safe_query = re.sub(r"[^A-Za-z0-9._-]+", "_", search_query).strip("_") or "query"
    output_file = f"EPO_Patent_Dataset_{safe_query}.csv"

    # Execute comprehensive search
    print("\n🚀 EXECUTING COMPREHENSIVE SEARCH...")
    final_dataset = epo_system.create_comprehensive_dataset(
        search_query=search_query,
        max_results=max_results,
        output_filename=output_file
    )

    if final_dataset is not None:
        print("\n🎉 SUCCESS! Comprehensive patent dataset created!")
    else:
        print("\n❌ Failed to create dataset. Check search query and API connectivity.")
    


\n🚀 EXECUTING COMPREHENSIVE SEARCH...
🚀 COMPREHENSIVE EPO PATENT DATA EXTRACTION
✨ Features: Titles, Inventors, Applicants, Abstracts, Claims, INPADOC Family
🔗 Format: || separators for all multi-value fields
Query: Alpha-2-macroglobulin
Max results: 25
📝 Step 1: Basic patent search...
🔍 Found 25 patents. Extracting basic data...
  Processed 5/25 patents...
  Processed 10/25 patents...
  Processed 15/25 patents...
  Processed 20/25 patents...
  Processed 25/25 patents...
✅ Extracted basic data for 25 patents
\n📊 Step 2: Getting comprehensive data for 25 patents...
  Processing 1/25: US2025270292A1
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 22 members
  Processing 2/25: US2025205319A1
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 26 members
  Processing 3/25: NZ740279A
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 37 members
    ✅ Completed 3/25 patents
  Processing 4/25: NZ746485A
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 31 members
  Processing 5/25: US2025032683A1
    📄 Abstract: ✓ | 

In [5]:
# ==============================================
# FINAL COMPLETE EPO PATENT SEARCH SYSTEM
# Complete solution with all functions integrated
# ==============================================

import requests
from requests.auth import HTTPBasicAuth
import xml.etree.ElementTree as ET
import pandas as pd
import time
from datetime import datetime
import csv
# from collections import Counter

class EPOPatentSearchSystem:
    """
    Complete EPO Patent Search System with INPADOC family data, abstracts, and claims.
    Features:
    - Comprehensive patent search
    - Title, inventors, applicants extraction
    - Abstract and claims retrieval
    - INPADOC extended family data
    - || separator formatting
    - Jurisdiction mapping and family member lists
    """
    
    def __init__(self, consumer_key: str, consumer_secret: str):
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = None
        self.namespaces = {
            'ops': 'http://ops.epo.org',
            'ep': 'http://www.epo.org/exchange'
        }
    
    def get_access_token(self):
        """Request a new EPO OPS access token and cache it on the instance.

        Posts a client-credentials grant to the OPS token endpoint using HTTP
        Basic auth built from the consumer key and secret. On success the
        token is stored in ``self.access_token``.

        Returns:
            The access token string, or None when the request fails (the
            error is printed and any previously stored token is left as is).
        """
        url = "https://ops.epo.org/3.2/auth/accesstoken"
        data = {"grant_type": "client_credentials"}
        try:
            response = requests.post(
                url,
                data=data,
                auth=HTTPBasicAuth(self.consumer_key, self.consumer_secret),
                # Same timeout as every other OPS call in this class; without
                # it a stalled connection would block forever.
                timeout=30,
            )
            response.raise_for_status()
            self.access_token = response.json()["access_token"]
            return self.access_token
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            # RequestException: transport/HTTP errors; ValueError: body is not
            # JSON; KeyError/TypeError: JSON without an "access_token" entry.
            print(f"❌ Token error: {e}")
            return None
    
    def _ensure_token(self):
        """Ensure we have a valid access token."""
        if not self.access_token:
            return self.get_access_token()
        return self.access_token
    
    def _convert_to_docdb_format(self, publication_number: str):
        """Convert publication number to DOCDB format."""
        if publication_number.startswith(('US', 'EP', 'WO', 'JP', 'CN')) and len(publication_number) > 2:
            c = publication_number[:2]
            rest = publication_number[2:]
            for i, char in enumerate(rest):
                if char.isalpha():
                    n = rest[:i]
                    k = rest[i:]
                    break
            else:
                n = rest
                k = ""
            return f"{c}.{n}.{k}" if k else f"{c}.{n}"
        return publication_number
    
    def search_patents(self, search_query: str, max_results: int = 25):
        """
        Search for patents using EPO OPS API.
        
        Args:
            search_query: EPO search query string
            max_results: Maximum number of results to return
            
        Returns:
            List of patent dictionaries with basic information
        """
        if not self._ensure_token():
            return []
        
        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        url = f"https://ops.epo.org/3.2/rest-services/published-data/search"
        
        params = {
            "q": search_query,
            "Range": f"1-{max_results}"
        }
        
        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)
            if response.ok:
                root = ET.fromstring(response.content)
                
                patent_refs = root.findall(".//ops:publication-reference", self.namespaces)
                patents_data = []
                
                print(f"🔍 Found {len(patent_refs)} patents. Extracting basic data...")
                
                for i, patent_ref in enumerate(patent_refs, 1):
                    doc_id = patent_ref.find(".//ep:document-id[@document-id-type='docdb']", self.namespaces)
                    if doc_id is not None:
                        country = doc_id.find("ep:country", self.namespaces)
                        number = doc_id.find("ep:doc-number", self.namespaces)
                        kind = doc_id.find("ep:kind", self.namespaces)
                        date = doc_id.find("ep:date", self.namespaces)
                        
                        if all(x is not None and x.text for x in [country, number, kind]):
                            publication_num = f"{country.text}{number.text}{kind.text}"
                            pub_date = date.text if date is not None and date.text else ""
                            
                            # Get title if available
                            title_elem = patent_ref.find(".//ep:invention-title[@lang='en']", self.namespaces)
                            title = title_elem.text if title_elem is not None and title_elem.text else ""
                            
                            patent_data = {
                                'publication_number': publication_num,
                                'country': country.text,
                                'doc_number': number.text,
                                'kind': kind.text,
                                'publication_date': pub_date,
                                'title': title,
                                'search_index': i
                            }
                            
                            patents_data.append(patent_data)
                            
                            if i % 5 == 0:
                                print(f"  Processed {i}/{len(patent_refs)} patents...")
                
                print(f"✅ Extracted basic data for {len(patents_data)} patents")
                return patents_data
                
            else:
                print(f"❌ Search failed: {response.status_code}")
                return []
                
        except Exception as e:
            print(f"❌ Search error: {e}")
            return []
    
    def get_patent_details(self, publication_number: str, retries: int = 3):
        """
        Get detailed patent information including title, inventors, applicants, abstract, and claims.
        
        Args:
            publication_number: Patent publication number
            retries: Number of retry attempts
            
        Returns:
            Dictionary with detailed patent information
        """
        if not self._ensure_token():
            return {}
        
        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        docdb_format = self._convert_to_docdb_format(publication_number)
        
        result = {}
        
        # Get bibliographic data (title, inventors, applicants)
        biblio_url = f"https://ops.epo.org/3.2/rest-services/published-data/publication/docdb/{docdb_format}/biblio"
        
        for attempt in range(retries):
            try:
                response = requests.get(biblio_url, headers=headers, timeout=30)
                if response.ok:
                    root = ET.fromstring(response.content)
                    
                    # Extract title
                    title_elem = root.find(".//ep:invention-title[@lang='en']", self.namespaces)
                    result['title'] = title_elem.text if title_elem is not None else ""
                    
                    # Extract inventors
                    inventors = []
                    inventor_elems = root.findall(".//ep:inventor", self.namespaces)
                    for inv in inventor_elems:
                        name_elem = inv.find(".//ep:name", self.namespaces)
                        if name_elem is not None and name_elem.text:
                            inventors.append(name_elem.text.strip())
                    
                    # Extract applicants
                    applicants = []
                    applicant_elems = root.findall(".//ep:applicant", self.namespaces)
                    for app in applicant_elems:
                        name_elem = app.find(".//ep:name", self.namespaces)
                        if name_elem is not None and name_elem.text:
                            applicants.append(name_elem.text.strip())
                    
                    result.update({
                        'inventors': ' || '.join(inventors) if inventors else '',
                        'applicants': ' || '.join(applicants) if applicants else '',
                        'inventors_count': len(inventors),
                        'applicants_count': len(applicants)
                    })
                    break
                    
                else:
                    print(f"⚠️ Biblio failed for {publication_number}: {response.status_code}")
                    time.sleep(1)
                    
            except Exception as e:
                print(f"⚠️ Biblio error for {publication_number} (attempt {attempt + 1}): {e}")
                time.sleep(1)
        
        # Get abstract
        abstract_url = f"https://ops.epo.org/3.2/rest-services/published-data/publication/docdb/{docdb_format}/abstract"
        
        for attempt in range(retries):
            try:
                response = requests.get(abstract_url, headers=headers, timeout=30)
                if response.ok:
                    root = ET.fromstring(response.content)
                    
                    # Extract abstract
                    abstract_elem = root.find(".//ep:abstract[@lang='en']", self.namespaces)
                    if abstract_elem is not None:
                        # Get all text from abstract, including from paragraphs
                        abstract_texts = []
                        for p in abstract_elem.findall(".//ep:p", self.namespaces):
                            if p.text:
                                abstract_texts.append(p.text.strip())
                        
                        if abstract_texts:
                            result['abstract'] = ' '.join(abstract_texts)
                        elif abstract_elem.text:
                            result['abstract'] = abstract_elem.text.strip()
                        else:
                            result['abstract'] = ""
                    else:
                        result['abstract'] = ""
                    break
                    
                else:
                    result['abstract'] = ""
                    if response.status_code != 404:  # 404 is normal for missing abstracts
                        print(f"⚠️ Abstract failed for {publication_number}: {response.status_code}")
                    break
                    
            except Exception as e:
                result['abstract'] = ""
                break
        
        # Get claims
        claims_url = f"https://ops.epo.org/3.2/rest-services/published-data/publication/docdb/{docdb_format}/claims"
        
        for attempt in range(retries):
            try:
                response = requests.get(claims_url, headers=headers, timeout=30)
                if response.ok:
                    root = ET.fromstring(response.content)
                    
                    # Extract claims
                    claims_texts = []
                    claim_elems = root.findall(".//ep:claim[@lang='en']", self.namespaces)
                    for claim in claim_elems:
                        claim_num = claim.get('num', '')
                        claim_text = ""
                        
                        # Get text from paragraphs within claim
                        for p in claim.findall(".//ep:claim-text", self.namespaces):
                            if p.text:
                                claim_text += p.text.strip() + " "
                        
                        if claim_text.strip():
                            claims_texts.append(f"Claim {claim_num}: {claim_text.strip()}")
                    
                    if claims_texts:
                        result['claims'] = ' || '.join(claims_texts[:5])  # Limit to first 5 claims
                    else:
                        result['claims'] = ""
                    break
                    
                else:
                    result['claims'] = ""
                    if response.status_code != 404:  # 404 is normal for missing claims
                        print(f"⚠️ Claims failed for {publication_number}: {response.status_code}")
                    break
                    
            except Exception as e:
                result['claims'] = ""
                break
        
        # Set defaults for missing data
        default_fields = ['title', 'inventors', 'applicants', 'abstract', 'claims']
        for field in default_fields:
            if field not in result:
                result[field] = ""
        
        if 'inventors_count' not in result:
            result['inventors_count'] = 0
        if 'applicants_count' not in result:
            result['applicants_count'] = 0
        
        return result
    
    def get_inpadoc_family_data(self, publication_number: str, retries: int = 3):
        """
        Get INPADOC extended family data with enhanced formatting.
        
        Args:
            publication_number: Patent publication number
            retries: Number of retry attempts
            
        Returns:
            Dictionary with INPADOC family information
        """
        if not self._ensure_token():
            return {}
        
        headers = {"Authorization": f"Bearer {self.access_token}", "Accept": "application/xml"}
        docdb_format = self._convert_to_docdb_format(publication_number)
        
        url = f"https://ops.epo.org/3.2/rest-services/family/publication/docdb/{docdb_format}"
        
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                if response.ok:
                    root = ET.fromstring(response.content)
                    
                    # Get family information
                    family_elem = root.find(".//ops:patent-family", self.namespaces)
                    total_count = family_elem.get('total-result-count', '0') if family_elem is not None else '0'
                    
                    # Get all family members
                    family_members = root.findall(".//ops:family-member", self.namespaces)
                    
                    # Collect jurisdictions and members
                    jurisdiction_list = []
                    all_family_members = []
                    
                    for member in family_members:
                        # publication-reference is in the ep namespace
                        pub_ref = member.find(".//ep:publication-reference", self.namespaces)
                        if pub_ref is not None:
                            # document-id elements are also in ep namespace
                            doc_id = pub_ref.find(".//ep:document-id[@document-id-type='docdb']", self.namespaces)
                            if doc_id is not None:
                                country = doc_id.find("ep:country", self.namespaces)
                                number = doc_id.find("ep:doc-number", self.namespaces)
                                kind = doc_id.find("ep:kind", self.namespaces)
                                date = doc_id.find("ep:date", self.namespaces)
                                
                                if all(x is not None and x.text for x in [country, number, kind]):
                                    country_code = country.text
                                    pub_num = f"{country_code}{number.text}{kind.text}"
                                    pub_date = date.text if date is not None and date.text else ""
                                    
                                    # Add to jurisdiction list for each occurrence
                                    jurisdiction_list.append(country_code)
                                    
                                    all_family_members.append({
                                        'publication': pub_num,
                                        'country': country_code,
                                        'date': pub_date
                                    })
                    
                    # Create jurisdiction string with || separator (showing each occurrence)
                    jurisdiction_string = ' || '.join(jurisdiction_list)
                    
                    # Create family members string with || separator
                    family_members_string = ' || '.join([member['publication'] for member in all_family_members])
                    
                    return {
                        'inpadoc_family_size': len(all_family_members),
                        'inpadoc_jurisdictions_count': len(set(jurisdiction_list)),
                        'inpadoc_jurisdiction_list': jurisdiction_string,
                        'inpadoc_family_members': family_members_string
                    }
                    
                else:
                    print(f"⚠️ Family failed for {publication_number}: {response.status_code}")
                    time.sleep(1)
                    
            except Exception as e:
                print(f"⚠️ Family error for {publication_number} (attempt {attempt + 1}): {e}")
                time.sleep(1)
        
        return {}
    
    def create_comprehensive_dataset(self, search_query: str, max_results: int = 25, output_filename: str = None):
        """
        Create comprehensive patent dataset with all features.
        
        Args:
            search_query: EPO search query string
            max_results: Maximum number of patents to process
            output_filename: Output CSV filename (auto-generated if None)
            
        Returns:
            pandas.DataFrame with comprehensive patent data
        """
        if output_filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_filename = f"comprehensive_patent_data_{timestamp}.csv"
        
        print(f"🚀 COMPREHENSIVE EPO PATENT DATA EXTRACTION")
        print(f"✨ Features: Titles, Inventors, Applicants, Abstracts, Claims, INPADOC Family")
        print(f"🔗 Format: || separators for all multi-value fields")
        print(f"Query: {search_query}")
        print(f"Max results: {max_results}")
        print("=" * 80)
        
        # Step 1: Basic search
        print("📝 Step 1: Basic patent search...")
        basic_patents = self.search_patents(search_query, max_results)
        
        if not basic_patents:
            print("❌ No patents found in basic search")
            return None
        
        # Step 2: Get comprehensive data for each patent
        print(f"\\n📊 Step 2: Getting comprehensive data for {len(basic_patents)} patents...")
        comprehensive_data = []
        
        for i, patent in enumerate(basic_patents, 1):
            pub_num = patent['publication_number']
            print(f"  Processing {i}/{len(basic_patents)}: {pub_num}")
            
            # Start with basic data
            patent_data = patent.copy()
            
            # Get detailed data (title, inventors, applicants, abstract, claims)
            detailed_data = self.get_patent_details(pub_num)
            patent_data.update(detailed_data)
            
            # Get INPADOC family data
            family_data = self.get_inpadoc_family_data(pub_num)
            patent_data.update(family_data)
            
            # Show progress
            abstract_status = "✓" if detailed_data.get('abstract', '') else "✗"
            claims_status = "✓" if detailed_data.get('claims', '') else "✗"
            family_size = family_data.get('inpadoc_family_size', 0)
            
            print(f"    📄 Abstract: {abstract_status} | 📋 Claims: {claims_status} | 👥 Family: {family_size} members")
            
            comprehensive_data.append(patent_data)
            
            # Rate limiting
            time.sleep(0.8)
            
            if i % 3 == 0:
                print(f"    ✅ Completed {i}/{len(basic_patents)} patents")
        
        # Step 3: Create DataFrame and export
        print(f"\\n💾 Step 3: Creating comprehensive CSV export...")
        df = pd.DataFrame(comprehensive_data)
        
        # Define complete column structure
        expected_columns = [
            'publication_number', 'country', 'doc_number', 'kind', 'publication_date',
            'title', 'abstract', 'claims',
            'inventors', 'applicants', 'inventors_count', 'applicants_count',
            'inpadoc_family_size', 'inpadoc_jurisdictions_count', 
            'inpadoc_jurisdiction_list', 'inpadoc_family_members', 'search_index'
        ]
        
        for col in expected_columns:
            if col not in df.columns:
                df[col] = ''
        
        # Reorder columns
        df = df[expected_columns]
        
        # Export to CSV
        try:
            df.to_csv(output_filename, index=False, encoding='utf-8-sig')
            print(f"✅ Successfully exported {len(df)} patents to {output_filename}")
            
            
            # INPADOC family statistics
            patents_with_family = len(df[df['inpadoc_family_size'] > 0])
            print(f"Patents with INPADOC family data: {patents_with_family}")
            
            if len(df) > 0:
                avg_abstract_length = df[df['abstract'] != '']['abstract'].str.len().mean() if len(df[df['abstract'] != '']) > 0 else 0
                avg_family_size = df['inpadoc_family_size'].mean()
                
                print(f"\\n📊 STATISTICS:")
                print(f"Average abstract length: {avg_abstract_length:.0f} characters")
                print(f"Average family size: {avg_family_size:.1f} members")
                
                # Show sample format
                print(f"\\n🎯 SAMPLE || SEPARATOR FORMAT:")
                if len(df) > 0:
                    sample = df.iloc[0]
                    print(f"Patent: {sample['publication_number']}")
                    
                    # Show sample data
                    inventors_sample = str(sample['inventors'])[:100] + "..." if len(str(sample['inventors'])) > 100 else str(sample['inventors'])
                    jurisdictions_sample = str(sample['inpadoc_jurisdiction_list'])[:100] + "..." if len(str(sample['inpadoc_jurisdiction_list'])) > 100 else str(sample['inpadoc_jurisdiction_list'])
                    
                    print(f"Inventors: {inventors_sample}")
                    print(f"Jurisdictions: {jurisdictions_sample}")
            
            return df
            
        except Exception as e:
            print(f"❌ Export error: {e}")
            return df

# ==============================================
# MAIN EXECUTION
# ==============================================

if __name__ == "__main__":
    # EPO OPS API credentials. These are placeholders: substitute the consumer
    # key/secret of your own OPS application before running.
    CONSUMER_KEY = "KEY"
    CONSUMER_SECRET = "SECRET"

    # Initialize the EPO Patent Search System
    epo_system = EPOPatentSearchSystem(CONSUMER_KEY, CONSUMER_SECRET)

    # Configuration
    search_query = 'pa = "CYTONICS CORP"'  # Change this to your desired search
    max_results = 50
    output_file = "EPO_Patent_Dataset_CYTONICS_CORP.csv"

    # Execute comprehensive search
    print("\n🚀 EXECUTING COMPREHENSIVE SEARCH...")
    final_dataset = epo_system.create_comprehensive_dataset(
        search_query=search_query,
        max_results=max_results,
        output_filename=output_file
    )

    # create_comprehensive_dataset returns None when the search yields nothing.
    if final_dataset is not None:
        print("\n🎉 SUCCESS! Comprehensive patent dataset created!")
    else:
        print("\n❌ Failed to create dataset. Check search query and API connectivity.")
    


\n🚀 EXECUTING COMPREHENSIVE SEARCH...
🚀 COMPREHENSIVE EPO PATENT DATA EXTRACTION
✨ Features: Titles, Inventors, Applicants, Abstracts, Claims, INPADOC Family
🔗 Format: || separators for all multi-value fields
Query: pa = "CYTONICS CORP"
Max results: 50
📝 Step 1: Basic patent search...
🔍 Found 44 patents. Extracting basic data...
  Processed 5/44 patents...
  Processed 10/44 patents...
  Processed 15/44 patents...
  Processed 20/44 patents...
  Processed 25/44 patents...
  Processed 30/44 patents...
  Processed 35/44 patents...
  Processed 40/44 patents...
✅ Extracted basic data for 44 patents
\n📊 Step 2: Getting comprehensive data for 44 patents...
  Processing 1/44: US2025270292A1
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 22 members
  Processing 2/44: US2025205319A1
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 26 members
  Processing 3/44: US2023250154A1
    📄 Abstract: ✓ | 📋 Claims: ✗ | 👥 Family: 22 members
    ✅ Completed 3/44 patents
  Processing 4/44: US2021268078A1
    📄 Abstrac