# Mount Drive

In [1]:
# Mount Google Drive into the Colab runtime under /content/gdrive.
# Change the paths used in this notebook to point at the datasets downloaded from https://doi.org/10.7910/DVN/TTP7AO
# The OpenAI key can be found in the presentation. NEVER push it to GitHub.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Smoke test: write a small file at the root of the mounted Drive to confirm it is writable.
with open('/content/gdrive/My Drive/file.txt', 'w') as f:
  f.write('content')

In [3]:
# Smoke test: same write check, this time inside the hackathon project's data folder.
# open(..., 'w') fails if that folder is not present on the mounted Drive.
with open('/content/gdrive/My Drive/Hackathon: European Patent Office (EPO)/data/file.txt', 'w') as f:
  f.write('content')

# Register to API + Throttle Limiter


In [4]:
import requests
import base64
import json
from xml.etree import ElementTree as ET
import json
from time import sleep
import random

In [5]:
# Shared HTTP session reused by the request helpers defined below
# (get_main_classes, fetch_index, fetch_claims).
session = requests.Session()

In [13]:
def renew_token(cred_iterator=None):
  """Request a fresh access token from the EPO OPS authentication endpoint.

  Args:
    cred_iterator: Index (0, 1 or 2) of the credential entry to authenticate
      with. When None, one of the three entries is picked at random.

  Returns:
    The value of the "access_token" field of the JSON reply.

  Raises:
    ValueError: if the reply body is not valid JSON.
    KeyError: if the reply has no "access_token" field (e.g. rejected
      credentials).
  """
  # Compare against None explicitly: index 0 is falsy, so a truthiness test
  # would silently replace an explicit request for the first credential entry
  # with a random pick.
  if cred_iterator is None: cred_iterator = random.randint(0,2)

  #Use keys provided by OPS API
  # NOTE(review): the byte strings below are empty placeholders; presumably
  # each should hold one OPS key pair - confirm the expected format and keep
  # real keys out of version control.
  creds = [base64.b64encode(b''),
          base64.b64encode(b''),
          base64.b64encode(b'')
    ][cred_iterator]

  # Use a dedicated session and close it when done, instead of leaving a new
  # unclosed session behind on every renewal.
  with requests.Session() as auth_session:
    auth = auth_session.post('https://ops.epo.org/3.2/auth/accesstoken',
                "grant_type=client_credentials",
                headers={
                    'Authorization': f'Basic {creds.decode("utf-8")}',
                    'Content-Type': 'application/x-www-form-urlencoded',
                },
                ).content

  return json.loads(auth)["access_token"]

# Notebook-level bearer token; the request helpers below read it when building
# their Authorization header. No credential index is passed, so renew_token picks one at random.
TOKEN = renew_token()

def throttle_check(request):
  """Pause according to the throttling state reported by an OPS response.

  Reads the 'X-Throttling-Control' header, takes the text inside the
  parentheses, and maps each `service=colour:limit` entry to its colour.
  Only the 'search' and 'retrieval' services are inspected:

    black  -> sleep 120 s
    red    -> renew the global TOKEN, then sleep 90 s
    yellow -> sleep 10 s
    green  -> sleep 0.1 s

  If the header is missing or cannot be parsed, the global TOKEN is renewed
  and 'HEADER PROBLEM' is printed.

  Args:
    request: The response object returned by the HTTP call (needs a
      `.headers` mapping).

  Returns:
    None.
  """
  # The renewed token must be stored in the notebook-level TOKEN, otherwise
  # the renewal has no effect on subsequent requests.
  global TOKEN
  try:
    header = request.headers['X-Throttling-Control']
    entries = header.split('(')[1].replace(')', '').replace(' ', '').split(',')
    throttle_dic = {entry.split('=')[0]: entry.split('=')[1].split(':')[0] for entry in entries}
    watched = (throttle_dic['search'], throttle_dic['retrieval'])

    if 'black' in watched:
      print('BLACK ZONE REACHED')
      return sleep(120)
    if 'red' in watched:
      print('RED ZONE REACHED')
      TOKEN = renew_token()
      return sleep(90)
    if 'yellow' in watched:
      return sleep(10)
    if 'green' in watched:
      return sleep(0.1)
  except Exception:
    # `except Exception` (not a bare except) so that a KeyboardInterrupt
    # raised during one of the long sleeps still stops the notebook.
    TOKEN = renew_token()
    print('HEADER PROBLEM')

# Claims Parser

In [7]:
def claims_parser(doc_number, content):
  """Extract the English claims text from a decoded OPS fulltext reply.

  Only documents whose number starts with 'WO' or 'EP' are parsed. The
  claims node is looked up at
  content['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['claims'];
  it may be a single node or a list of nodes, and the first node whose
  '@lang' is 'EN' is used. Its 'claim-text' may be a single {'$': text}
  dict or a list of such dicts (joined with single spaces).

  Args:
    doc_number: Document number; its first two characters are the
      jurisdiction code.
    content: The decoded JSON reply (nested dicts).

  Returns:
    The claims text, or "" when the jurisdiction is not handled, no English
    claims are present, or parsing fails (the failure is printed).
  """
  #Check for jurisdiction code
  try:
    jurisdiction = doc_number[:2]
  except Exception as e:
    print(f'FAILED to extract code jurisdiction for {doc_number}')
    print(e)
    # Without a jurisdiction there is nothing to parse.
    return ""

  if jurisdiction not in ('WO', 'EP'):
    return ""

  try:
    claims_nodes = content['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['claims']
    if isinstance(claims_nodes, dict):
      claims_nodes = [claims_nodes]

    for node in claims_nodes:
      if node.get('@lang') != 'EN':
        continue
      claim_text = node['claim']['claim-text']
      # Return directly from each successful branch so a half-parsed value
      # can never leak out through the error path.
      if isinstance(claim_text, dict):
        return claim_text['$']
      if isinstance(claim_text, list):
        return ' '.join(claim['$'] for claim in claim_text)
      raise Exception("Parse error")

  except Exception as e:
    print(e)
    print(f'FAIL TO RETRIEVE FOR {jurisdiction}, {doc_number}, {content}')

  return ""

In [8]:
def get_main_classes(CPC='Y02W', depth=1):
  """List the CPC symbols found directly below a CPC class in the OPS scheme.

  Queries the OPS classification service for `CPC` at the given `depth`
  using the shared `session` and the notebook-level `TOKEN`.

  Args:
    CPC: CPC symbol to look up.
    depth: Value of the `depth` query parameter. Only depth 1 is parsed;
      for any other value the request is still sent but an empty list is
      returned.

  Returns:
    A list with, for each child classification item, the part of its
    symbol before the first '/'.
  """
  # HTTPS, so the bearer token is not sent in clear text.
  response = session.get(
        f'https://ops.epo.org/3.2/rest-services/classification/cpc/{CPC}?depth={depth}',
        headers={
            'Accept': "application/json",
            'Authorization': f'Bearer {TOKEN}',
        },
    )

  content = json.loads(response.content.decode("utf-8"))

  classes = []
  if depth == 1:
    children = content['ops:world-patent-data']['ops:classification-scheme']['ops:cpc']['cpc:class-scheme']['cpc:classification-item']['cpc:classification-item']['cpc:classification-item']
    # A lone child would be a dict rather than a list; iterating it would
    # yield its keys.
    if isinstance(children, dict):
      children = [children]
    classes = [child['cpc:classification-symbol']['$'].split('/')[0] for child in children]

  return classes

# Symbols (the part before '/') of the classification items returned for Y02W at depth 1.
main_classes = get_main_classes(CPC='Y02W', depth=1)

# Launching the CPC query patent search request

In [14]:
from itertools import product
from time import sleep
from tqdm.auto import tqdm
from google.colab import files

def fetch_index(year_month, range_start, range_end,  CPC='Y02W', jurisdiction = 'WO'):
    """Search OPS published data and return one page of document numbers.

    The query is `pd=<year_month> and pn=<jurisdiction> and cpc=<CPC>`,
    restricted to the result window `range_start`-`range_end`.

    Args:
        year_month: Publication date filter, e.g. '202204'.
        range_start: First result position of the page (string or int).
        range_end: Last result position of the page (string or int).
        CPC: CPC symbol to filter on.
        jurisdiction: Publication-number prefix to filter on; it is also
            prepended to every returned document number.

    Returns:
        A `(doc_numbers, result_count)` tuple: the document numbers on this
        page and the total result count reported by OPS. On a failed request
        the response body is printed and `([], 0)` is returned, so callers
        can always unpack the result.
    """
    # HTTPS, so the bearer token is not sent in clear text.
    response = session.get(
        f'https://ops.epo.org/3.2/rest-services/published-data/search?Range={range_start}-{range_end}&q=pd={year_month} and pn={jurisdiction} and cpc={CPC}',
        headers={
            'Accept': "application/json",
            'Authorization': f'Bearer {TOKEN}',
        },
    )
    throttle_check(response)

    if not response:
        print(response.content)
        # Same shape as the success path; a bare list here would make
        # `index, count = fetch_index(...)` raise.
        return [], 0

    content = json.loads(response.content.decode("utf-8"))
    search = content["ops:world-patent-data"]['ops:biblio-search']
    result_count = int(search["@total-result-count"])
    sleep(1.5)

    references = (search.get('ops:search-result') or {}).get('ops:publication-reference') or []
    # A single hit is a dict rather than a one-element list.
    if isinstance(references, dict):
        references = [references]

    return [
        jurisdiction + entry['document-id']['doc-number']['$']
        for entry in references
    ], result_count

In [None]:
def fetch_claims(doc_number):
  """Request the claims of one publication from OPS.

  Args:
    doc_number: Publication number in epodoc format, e.g. 'WO2022083428'.

  Returns:
    The raw response object from the shared `session`; the caller is
    responsible for checking the status code and decoding the body.
  """
  # HTTPS (the bearer token must not travel in clear text) and the same
  # versioned /3.2/ base path as the other OPS endpoints in this notebook.
  response = session.get(
        f'https://ops.epo.org/3.2/rest-services/published-data/publication/epodoc/{doc_number}/claims',
        headers={
            'Accept': "application/json",
            'Authorization': f'Bearer {TOKEN}',
        },
    )
  # Sleeps (and possibly renews the token) according to the throttling header.
  throttle_check(response)
  return response

In [16]:
# Quick probe: fetch the first page of WO / Y02T publications for April 2022 and
# print the raw claims reply for the first hit.
indices = fetch_index('202204', range_start="001", range_end="100", CPC='Y02T', jurisdiction='WO')
# On success fetch_index returns (doc_numbers, result_count), so [0][0] is the first document number.
DOC_NUMBER = indices[0][0]
print(DOC_NUMBER)
print(fetch_claims(DOC_NUMBER).content)

WO2022083428
b'{"ops:world-patent-data":{"@xmlns":{"ops":"http://ops.epo.org","$":"http://www.epo.org/exchange","xlink":"http://www.w3.org/1999/xlink"},"ftxt:fulltext-documents":{"ftxt:fulltext-document":{"@system":"ops.epo.org","@fulltext-format":"text-only","bibliographic-data":{"publication-reference":{"@data-format":"docdb","document-id":{"country":{"$":"WO"},"doc-number":{"$":"2022083428"},"kind":{"$":"A1"}}}},"claims":{"@lang":"ZH","claim":{"claim-text":[{"$":"\xe4\xb8\x80\xe7\xa7\x8d\xe6\xb1\xbd\xe8\xbd\xa6\xe5\x85\x85\xe7\x94\xb5\xe5\x8f\xa3\xe7\x9b\x96\xe6\x8e\xa7\xe5\x88\xb6\xe6\x96\xb9\xe6\xb3\x95\xef\xbc\x8c\xe5\xba\x94\xe7\x94\xa8\xe4\xba\x8e\xe5\x88\x86\xe5\x88\xab\xe4\xb8\x8e\xe5\xbf\xab\xe5\x85\x85\xe7\x94\xb5\xe5\x8f\xa3\xe7\x9b\x96\xe3\x80\x81\xe6\x85\xa2\xe5\x85\x85\xe7\x94\xb5\xe5\x8f\xa3\xe7\x9b\x96\xe8\xbf\x9e\xe6\x8e\xa5\xe7\x9a\x84\xe6\x8e\xa7\xe5\x88\xb6\xe5\x99\xa8\xef\xbc\x8c\xe5\x85\xb6\xe7\x89\xb9\xe5\xbe\x81\xe5\x9c\xa8\xe4\xba\x8e\xef\xbc\x8c\xe6\x89\x80\

# Mega for loop to download

In [11]:
# Bulk download: walk backwards month by month, list the matching publications
# and fetch + parse the claims of each one.
doc_numbers = []          # every document number returned by the index searches
tolerance = 3             # consecutive months with an empty index before giving up
failures = 0              # claims replies with an unexpected HTTP status
no_publications_for = 0   # current streak of months with an empty index
old_year = 2022           # last year seen; only used by the (disabled) yearly checkpoint
ds = []                   # flat list: doc_number, claims, doc_number, claims, ...
CPC = "Y02T"
jurisdiction = "WO"
TOKEN=renew_token()
#Y02A, Y02B, Y02C, Y02D, Y02E, Y02P, Y02T, Y02W
cred_iterator = 0         # rotates through the three credential entries on HTTP 403
equivalents_count = 0     # claims requests answered with HTTP 404

try:
  for year, month in tqdm(product(
      list(reversed(range(1820, 2020))),
      list(reversed(range(1, 13))),
  )):
    if old_year != year:
      old_year = year
      #TEMP
      #json.dump(ds, open(f"/content/gdrive/My Drive/Hackathon: European Patent Office (EPO)/data/ds_{jurisdiction}_{CPC}_{year}_final.json", "w"))

    # NOTE(review): the year range above stops at 2019, so this guard never fires.
    if (year == 2022 and month == 11):
      continue

    TOKEN = renew_token()
    month = f"{month:02d}"
    year = str(year)
    year_month = f"{year}{month}"

    # Tolerate an empty (falsy) result from fetch_index instead of crashing
    # on tuple unpacking.
    first_page = fetch_index(year_month, range_start="001", range_end="100", CPC=CPC, jurisdiction=jurisdiction)
    index, result_count = first_page if first_page else ([], 0)
    for i in range(1, 100):
      if result_count<(i*100+2):
        break
      next_page = fetch_index(year_month, range_start=f"{(i*100)+1}", range_end=f"{(i+1)*100}", CPC=CPC, jurisdiction=jurisdiction)
      additional_index, _ = next_page if next_page else ([], 0)
      index += additional_index
      #TEMP
      break

    if index == []:
      no_publications_for += 1
      if no_publications_for >= tolerance:
        print(f"Earliest result at {year_month}")
        break
    else:
      no_publications_for = 0
    doc_numbers += index

    #Go through all the documents to claims
    for i, doc_number in enumerate(index):
      try:
        response = fetch_claims(doc_number)
      except Exception as e:
        print('no response from server')
        print(e)
        continue

      try:
        status_code = response.status_code
        if status_code == 200:
          content = json.loads(response.content.decode("utf-8"))
        elif status_code == 403:
          # Switch to the next credential entry; this document is skipped.
          cred_iterator += 1
          TOKEN = renew_token(cred_iterator%3)
          continue
        elif status_code == 404:
          equivalents_count += 1
          continue
        else:
          # Any other status must not fall through: `content` would still
          # hold the previous document's reply and be stored under this
          # document number.
          failures += 1
          print(f'Unexpected status {status_code} for {doc_number}')
          continue

      except Exception as e:
        # Kept in the notebook namespace for inspection after the run.
        failed_response = response
        failed_document = doc_number
        print('Failed to decode')
        print(e)

        continue

      try:
        claims = claims_parser(doc_number, content)
        if claims != "":
          # NOTE(review): this extends `ds` with two separate items rather
          # than appending one [doc_number, claims] pair - confirm this is
          # the format downstream code expects.
          ds += [doc_number, claims]
      except Exception as e:
        print('Claims parser failed')
        print(e)

      #TEMP
      if i == 6:break
    #TEMP
    break
except Exception as e:
  print(e)
  #json.dump(ds, open(f"/content/gdrive/My Drive/Hackathon: European Patent Office (EPO)/data/ds_{jurisdiction}_{CPC}_complete_final_2.json", "w"))

0it [00:00, ?it/s]

KeyboardInterrupt: ignored