In [1]:
from dotenv import load_dotenv
import epo_ops
import os
import pandas as pd
import json
import requests
# parameters
import yaml
from time import sleep
from collections import Counter
import utils
import api
import construct_query as cquery
import numpy as np
import re

# Load the run configuration. Later cells read config["paths"][...] from it.
with open("config.yaml", "r", encoding="utf-8") as stream:
    config = yaml.safe_load(stream)

# OPS published-data search endpoint.
# HTTPS is used because AccessToken is handed to api.make_request together with
# this URL, and credentials should not be sent over an unencrypted connection.
url = "https://ops.epo.org/3.2/rest-services/published-data/search"

# Nested mapping of search queries; it is indexed below as
# queries[country][industry][division] -> list of query strings.
queries = cquery.construct_query(config=config)

# Set up logger: everything from DEBUG upwards goes to GET_Requests.log.
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S',
                    filename='GET_Requests.log',
                    encoding='utf-8')

# Create the helper objects that are passed to every api.make_request call.
# NOTE(review): the meaning of the three Sleeper arguments is defined in
# api.Sleeper and not visible here - presumably throttling settings; confirm.
Sleeper = api.Sleeper(150, 60*60, 30)
AccessToken = api.AccessToken()


In [8]:
# Inspect the list of generated query strings for country "EP", industry "A", division 1.
queries["EP"]["A"][1]

['(ta ALL "agriculture" OR ta ALL "forestry" OR ta ALL "fishing") AND (ta = "cereals" OR ta = "equines" OR ta = "beverage" OR ta = "aromatic" OR ta = "nuts" OR ta = "leguminous" OR ta = "roots" ) AND cpc any "G06N20/00 G06N20/10 G06N20/20 G06N3/09 G06N3/088 G06N3/092 G06N3/08" AND AP="EP"',
 '(ta ALL "agriculture" OR ta ALL "forestry" OR ta ALL "fishing") AND (ta = "buffaloes" OR ta = "production" OR ta = "tubers" OR ta = "service" OR ta = "pharmaceutical" OR ta = "agriculture" OR ta = "related" ) AND cpc any "G06N20/00 G06N20/10 G06N20/20 G06N3/09 G06N3/088 G06N3/092 G06N3/08" AND AP="EP"',
 '(ta ALL "agriculture" OR ta ALL "forestry" OR ta ALL "fishing") AND (ta = "propagation" OR ta = "pome" OR ta = "hunting" OR ta = "drug" OR ta = "stone" OR ta = "crop" OR ta = "cane" ) AND cpc any "G06N20/00 G06N20/10 G06N20/20 G06N3/09 G06N3/088 G06N3/092 G06N3/08" AND AP="EP"',
 '(ta ALL "agriculture" OR ta ALL "forestry" OR ta ALL "fishing") AND (ta = "sugar" OR ta = "subtropical" OR ta = "goat

In [7]:
# Persist the generated queries for later inspection.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding; otherwise the platform default
# encoding is used and the dump can fail or produce a non-portable file.
with open('data/ops_search_queries.json', 'w', encoding='utf-8') as f:
    json.dump(queries, f, ensure_ascii=False, indent=4)

In [2]:
keywords, industry_keywords, Div_Ind_dict = cquery.get_keywords(config=config)

# Determine where the scraping loop should resume.
# Defaults: nothing has been retrieved yet, so start from the very beginning.
start_division = 0
start_industry = "A"

retrieved_data_path = config["paths"]["retrieved_data_content"]
# Check for the file explicitly instead of relying on the exception type that
# pd.read_json raises for a missing path (it differs between pandas versions).
if os.path.isfile(retrieved_data_path):
    retreived_data = pd.read_json(retrieved_data_path)
    # An empty result file carries no resume information; keep the defaults.
    if not retreived_data.empty:
        last_division = retreived_data["division"].max()
        if pd.notna(last_division):
            last_division = int(last_division)
            # Restart one division before the last stored one (as before).
            start_division = last_division - 1
            if start_division in Div_Ind_dict:
                start_industry = Div_Ind_dict[start_division]
            else:
                # last_division - 1 is not a mapped division (e.g. a gap in
                # the numbering): fall back to the industry of the last
                # stored division instead of failing with a KeyError.
                start_industry = Div_Ind_dict[last_division]

In [4]:
# Iterate over all "EP" queries, request every result page and store it.
#
# For each query the first page (results 1-100) is requested. Its response
# reports the total number of hits, from which the number of pages is derived.
# Every page - including the first one and a trailing partial page - is passed
# to api.extract_content and written with utils.write_to_file.
# NOTE(review): OPS may cap how many results of a single search can be paged
# through; requests beyond such a cap would simply be logged as failed here -
# confirm against the OPS documentation.
PAGE_SIZE = 100  # results requested per call: 1-100, 101-200, ...

logging.info("Start iterating over queries")
logging.info(f"Starting from industry {start_industry} division {start_division} and")
# insert skipper here if all countries are to be scraped
for industry in queries["EP"].keys():
    # skip already scraped industries
    if industry < start_industry:
        continue
    for division in queries["EP"][industry].keys():
        # skip already scraped divisions
        if division < start_division:
            continue
        for index, query in enumerate(queries["EP"][industry][division]):
            idx = industry + "_" + str(division) + "_" + str(index)
            # make initial query (first page)
            r = api.make_request(url=url, query=query, range_begin=1, range_end=PAGE_SIZE, AccessToken=AccessToken, logging=logging, Sleeper=Sleeper)
            # check if request was successful
            if r.status_code != 200:
                # log failure and move on to the next query
                message = r.text.replace('\n', ' ')
                logging.warning(f"Request for {idx} failed - Reasion: {r.status_code} - {message}")
                continue
            logging.info(f"Request for {idx} successful")

            # get total number of hits and the number of pages needed to cover
            # them (ceiling division, so a partial last page is included)
            total_hits = int(r.json()["ops:world-patent-data"]["ops:biblio-search"]["@total-result-count"])
            num_requests = -(-total_hits // PAGE_SIZE)

            # always save the first page - it has already been fetched
            json_list = api.extract_content(response=r, country="EP", industry=industry, division=division, query=query, range_begin=1, range_end=PAGE_SIZE)
            utils.write_to_file(retrieved_data_path=config["paths"]["retrieved_data_content"], ls=json_list)
            logging.info(f"Request for {idx} successful - {r.status_code}")
            # sleep for 1 second
            sleep(1)

            # fetch the remaining pages (page index 1 .. num_requests - 1)
            for i in range(1, num_requests):
                range_begin = i * PAGE_SIZE + 1
                range_end = (i + 1) * PAGE_SIZE
                # make request
                r = api.make_request(url=url, query=query, range_begin=range_begin, range_end=range_end, AccessToken=AccessToken, logging=logging, Sleeper=Sleeper)
                # check if request was successful
                if r.status_code == 200:
                    # save response
                    json_list = api.extract_content(response=r, country="EP", industry=industry, division=division, query=query, range_begin=range_begin, range_end=range_end)
                    utils.write_to_file(retrieved_data_path=config["paths"]["retrieved_data_content"], ls=json_list)
                    # log success
                    logging.info(f"Request for {idx} {i} successful")
                else:
                    # log failure
                    logging.warning(f"Request for {idx} {i} failed")
                # sleep for 1 second
                sleep(1)