In [None]:
import requests
import json
import pandas as pd
import time
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote
import csv
from typing import List, Dict, Any

class PubChemPatentScraper:
    """Collect patent identifiers linked to a PubChem substance or compound.

    Three best-effort strategies are combined by
    ``search_patents_comprehensive``: scraping the substance web page,
    querying PubChem download endpoints, and scraping the pages of compounds
    whose names match the query. Every network failure is reported with
    ``print`` and turned into an empty result rather than raised.
    """

    # Seconds to wait on any single HTTP request before giving up; without a
    # timeout a stalled connection would block the whole search forever.
    REQUEST_TIMEOUT = 30

    # One compiled pattern per patent-office prefix. They are deliberately
    # kept separate (not merged into a single alternation) because each one
    # is scanned over the text independently, which lets adjacent identifiers
    # with different prefixes both be reported.
    PATENT_PATTERNS = tuple(
        re.compile(prefix + r'-\d+[A-Z]*\d*')
        for prefix in ('US', 'EP', 'WO', 'JP', 'CN')
    )

    # Column layout of the DataFrame produced by create_patent_dataframe.
    DATAFRAME_COLUMNS = (
        'publicationnumber', 'cids', 'sids', 'refsids', 'title', 'abstract',
        'prioritydate', 'grantdate', 'inventors', 'assignees',
        'classification', 'family', 'aids', 'geneids', 'protacxns', 'taxids',
        'anatomyids',
    )

    def __init__(self):
        """Create an HTTP session with a browser-like User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov"

    @classmethod
    def _find_patent_ids(cls, text) -> List[str]:
        """Return every match of every patent pattern in *text* (with duplicates)."""
        found = []
        for pattern in cls.PATENT_PATTERNS:
            found.extend(pattern.findall(text))
        return found

    def get_substance_sid(self, query):
        """Look up the first substance ID (SID) registered under a name.

        Args:
            query: Substance name; it is URL-quoted before being sent.

        Returns:
            The first SID in the response, or ``None`` if the request fails
            or the response does not contain one.
        """
        search_url = f"{self.base_url}/rest/pug/substance/name/{quote(query)}/sids/JSON"
        try:
            response = self.session.get(search_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            sid = response.json()['IdentifierList']['SID'][0]
        except Exception as e:
            print(f"Error fetching SID for '{query}': {e}")
            return None
        print(f"Found SID: {sid}")
        return sid

    def scrape_patents_from_web(self, query):
        """Scrape patent identifiers from the substance's PubChem web page.

        Args:
            query: Substance name used to resolve the SID.

        Returns:
            Sorted list of unique patent identifiers; empty when the SID
            cannot be resolved or the page cannot be fetched.
        """
        print(f"Scraping patents for '{query}' from web interface...")

        sid = self.get_substance_sid(query)
        if not sid:
            return []

        substance_url = f"{self.base_url}/substance/{sid}"
        print(f"Accessing: {substance_url}")

        try:
            response = self.session.get(substance_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # No second attempt against ".../substance/{sid}#section=Patents":
            # a URL fragment is never transmitted to the server, so that
            # request would return exactly the page parsed here.
            return self.extract_patents_from_page(soup, sid)
        except Exception as e:
            print(f"Error scraping web page: {e}")
            return []

    def extract_patents_from_page(self, soup, sid):
        """Extract patent identifiers from a parsed HTML page.

        The whole page text is scanned for every pattern; in addition, for
        each ``<a href>`` the first match per pattern in "href + link text"
        is collected, so identifiers that only appear in URLs are found too.

        Args:
            soup: Parsed page exposing ``get_text()`` and ``find_all()``.
            sid: Identifier of the page being parsed. Unused; kept so
                existing callers keep working.

        Returns:
            Sorted list of unique patent identifiers.
        """
        patents = set(self._find_patent_ids(soup.get_text()))

        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip()
            # A match can never span the separating space, so searching the
            # joined string is equivalent to checking href and text in turn.
            combined = href + ' ' + text
            for pattern in self.PATENT_PATTERNS:
                match = pattern.search(combined)
                if match:
                    patents.add(match.group())

        # Sorted so that results are reproducible from run to run.
        patents = sorted(patents)
        print(f"Found {len(patents)} patents from web scraping")
        return patents

    def download_patent_data_directly(self, query):
        """Try PubChem download endpoints until one yields patent data.

        Args:
            query: Substance name used to resolve the SID.

        Returns:
            Sorted list of unique patent identifiers from the first endpoint
            that produced any; empty if none did.
        """
        print(f"Attempting direct patent data download for '{query}'...")

        sid = self.get_substance_sid(query)
        if not sid:
            return []

        sdq_query = (
            '{"download":"*","collection":"substance",'
            '"where":{"ands":[{"sid":"%s"}]},'
            '"order":["relevancescore,desc"],"start":1,"limit":10000}' % sid
        )
        sdq_url = f"{self.base_url}/sdq/sdqagent.cgi?infmt=json"
        # (url, expect_csv) pairs, tried in order.
        candidates = [
            (f"{self.base_url}/rest/pug_view/data/substance/{sid}/JSON?heading=Patents", False),
            (f"{sdq_url}&outfmt=csv&query={sdq_query}", True),
            (f"{sdq_url}&outfmt=json&query={sdq_query}", False),
        ]

        for url, expect_csv in candidates:
            try:
                print(f"Trying download URL: {url[:100]}...")
                response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()

                try:
                    data = response.json()
                except ValueError:
                    # Only a JSON decoding failure selects the CSV fallback;
                    # anything else is reported by the outer handler.
                    if not expect_csv:
                        print("Response was not valid JSON; skipping")
                        continue
                    print(f"Got CSV data: {len(response.text)} characters")
                    patents = self.extract_patents_from_csv(response.text)
                else:
                    print(f"Successfully got JSON data: {len(str(data))} characters")
                    patents = self.extract_patents_from_json(data)

                if patents:
                    return patents

            except Exception as e:
                print(f"Error with download URL: {e}")
                continue

        return []

    def extract_patents_from_json(self, data) -> List[str]:
        """Collect values stored under keys whose name contains "patent".

        The structure is walked through all nested dicts and lists. A string
        value under a matching key is taken as-is; for a list value each
        scalar item is stringified. Nested containers and ``None`` inside
        such a list are not stringified (they would yield bogus identifiers
        such as ``"{'id': 1}"``) but are still searched.

        Args:
            data: Decoded JSON (any nesting of dict, list and scalars).

        Returns:
            Sorted list of unique values found.
        """
        found = set()
        pending = [data]  # explicit stack instead of recursion
        while pending:
            node = pending.pop()
            if isinstance(node, dict):
                for key, value in node.items():
                    if 'patent' in str(key).lower():
                        if isinstance(value, str):
                            found.add(value)
                        elif isinstance(value, list):
                            found.update(
                                str(item) for item in value
                                if item is not None
                                and not isinstance(item, (dict, list))
                            )
                    pending.append(value)
            elif isinstance(node, list):
                pending.extend(node)
        return sorted(found)

    def extract_patents_from_csv(self, csv_text) -> List[str]:
        """Extract patent identifiers from raw CSV text.

        Args:
            csv_text: CSV content as a single string.

        Returns:
            Sorted list of unique patent identifiers.
        """
        # The patterns cannot match across a newline, so scanning the whole
        # text gives the same matches as scanning it line by line.
        return sorted(set(self._find_patent_ids(csv_text)))

    def search_patents_comprehensive(self, query) -> List[str]:
        """Run all three search strategies and merge their results.

        Args:
            query: Substance or compound name.

        Returns:
            Sorted list of unique patent identifiers from all strategies.
        """
        all_patents = []

        print("\n=== Method 1: Web Scraping ===")
        all_patents.extend(self.scrape_patents_from_web(query))

        print("\n=== Method 2: Direct Download ===")
        all_patents.extend(self.download_patent_data_directly(query))

        print("\n=== Method 3: Related Searches ===")
        all_patents.extend(self.search_related_entries(query))

        return sorted(set(all_patents))

    def search_related_entries(self, query, max_variants=2, max_cids=3):
        """Scrape patents from compounds whose name matches the query.

        Args:
            query: Name to search for.
            max_variants: How many spelling variants of *query* to try, in
                the order: as given, hyphens to spaces, spaces to hyphens,
                lower case, upper case.
            max_cids: How many compound IDs to inspect per variant.

        Returns:
            List of patent identifiers (may contain duplicates across
            compounds).
        """
        patents = []

        try:
            search_variants = [
                query,
                query.replace('-', ' '),
                query.replace(' ', '-'),
                query.lower(),
                query.upper()
            ]
            # Limit first, then drop repeats: a query without hyphens would
            # otherwise be sent twice, and no extra variant is ever added.
            variants = list(dict.fromkeys(search_variants[:max_variants]))

            for variant in variants:
                try:
                    compound_url = f"{self.base_url}/rest/pug/compound/name/{quote(variant)}/cids/JSON"
                    response = self.session.get(compound_url, timeout=self.REQUEST_TIMEOUT)
                    if response.status_code != 200:
                        continue
                    cids = response.json().get('IdentifierList', {}).get('CID', [])

                    for cid in cids[:max_cids]:
                        patents.extend(self.get_patents_for_cid(cid))
                        time.sleep(0.5)  # pause between page fetches

                except Exception as e:
                    print(f"Error searching variant '{variant}': {e}")
                    continue

        except Exception as e:
            print(f"Error in related searches: {e}")

        return patents

    def get_patents_for_cid(self, cid):
        """Scrape patent identifiers from a compound's PubChem web page.

        Args:
            cid: PubChem compound ID.

        Returns:
            Sorted list of unique patent identifiers; empty on any error.
        """
        try:
            compound_url = f"{self.base_url}/compound/{cid}"
            response = self.session.get(compound_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            patents = self.extract_patents_from_page(soup, cid)

            if patents:
                print(f"Found {len(patents)} patents for CID {cid}")

            return patents

        except Exception as e:
            print(f"Error getting patents for CID {cid}: {e}")
            return []

    def create_patent_dataframe(self, patents, query):
        """Build a DataFrame with one row per patent identifier.

        Only ``publicationnumber`` and a placeholder ``title`` are filled
        in; every other column is an empty string because populating it
        would need further API calls.

        Args:
            patents: Iterable of patent identifiers.
            query: Search term, used in the placeholder title.

        Returns:
            DataFrame with the columns of ``DATAFRAME_COLUMNS``, or a
            completely empty DataFrame when *patents* is empty.
        """
        if not patents:
            return pd.DataFrame()

        rows = []
        for patent in patents:
            row = dict.fromkeys(self.DATAFRAME_COLUMNS, '')
            row['publicationnumber'] = patent
            row['title'] = f'Patent related to {query}'  # Placeholder
            rows.append(row)

        return pd.DataFrame(rows, columns=list(self.DATAFRAME_COLUMNS))

# Usage example
def main():
    """Run an example patent search and write any results to a CSV file.

    Searches for a fixed query, prints each patent found, saves a DataFrame
    of the results to ``patents_<query>_scraped.csv`` and shows its first
    rows. When nothing is found, prints troubleshooting hints and, if the
    substance ID can be resolved, the URL to check manually.
    """
    query = "alpha-2-macroglobulin"
    scraper = PubChemPatentScraper()

    print(f"Searching for patents related to '{query}'...")
    patents = scraper.search_patents_comprehensive(query)

    # Nothing found: print the hints and stop early.
    if not patents:
        hints = (
            "\n=== NO PATENTS FOUND ===",
            "Consider:",
            "1. The substance might not have associated patents",
            "2. Patents might be behind authentication",
            "3. Different search terms might be needed",
            "4. Manual download from PubChem might be required",
            "\nTry manually visiting:",
        )
        for hint in hints:
            print(hint)

        sid = scraper.get_substance_sid(query)
        if sid:
            print(f"https://pubchem.ncbi.nlm.nih.gov/substance/{sid}#section=Patents")
        return

    print("\n=== RESULTS ===")
    print(f"Found {len(patents)} unique patents:")
    for position, patent in enumerate(patents, 1):
        print(f"{position:3d}. {patent}")

    # Tabulate the identifiers and persist them next to the script.
    df = scraper.create_patent_dataframe(patents, query)
    safe_query = query.replace(" ", "_")
    filename = f'patents_{safe_query}_scraped.csv'
    df.to_csv(filename, index=False)
    print(f"\nResults saved to '{filename}'")

    print("\nFirst few rows:")
    print(df.head())

# Run the example search only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()