In [None]:
import pandas as pd
from google.cloud import bigquery


In [None]:
%load_ext google.cloud.bigquery

from IPython.display import clear_output
from tqdm import tqdm
import time

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

# Plot styling: seaborn 'darkgrid' style with fonts scaled by 1.5.
sns.set(style='darkgrid', font_scale=1.5)
#sns.set_style('darkgrid', {"xtick.minor.size": 10, "ytick.major.size": 10})
# Keep a handle on seaborn's 'deep' colour palette under a module-level name.
current_palette = sns.color_palette('deep')
#sns.palplot(current_palette)

# 1. A Word on Different Family Types

### 1.1. DOCDB family (simple family, or Espacenet patent family)

All applications of the same simple family do have the **same priorities**. The technical content of these family members is regarded as **(almost) identical**, so their publications are sometimes called “equivalent”.

This means that the applications that **share the same priorities** (Paris Convention or technical relation or others as contained in tables `TLS201_APPLN`, `TLS204_APPLN_PRIOR`, `TLS205_TECH_REL` and `TLS216_APPLN_CONTN`) will be assigned to the same family.

However, the EPO reserves the right to classify an application into a particular simple family irrespective of this general rule - the EPO does this by creating artificial priorities for an application or by ignoring certain priorities (declaring them “inactive”) for the purpose of family building.


By “priority” here, we do not mean only “Paris Convention priority”, but also other types of priorities which link one application to a “prior” application. The various types of priorities are stored in separate tables:
1. TLS201_APPLN: a PCT application in its regional/national phase contains, in its attribute INTERNAT_APPLN_ID, the APPLN_ID of its original PCT application
2. TLS204_APPLN_PRIOR contains Paris Convention priorities
3. TLS205_TECH_REL contains links between technically equivalent applications
4. TLS216_APPLN_CONTN contains various relations like continuations, divisional applications, ...


### 1.2. INPADOC family (Extended family)

All applications of the same extended family are **directly or indirectly** linked to the **same root** priority application. Usually the applications are related to the same technical invention, but their individual content may differ.

That is, the family consists of applications that **share the same priority**, directly or indirectly via other applications.
A **'priority'** in this case means a **link shown** between applications as in tables `TLS201_APPLN` (regional/national phase of a PCT application), `TLS204_APPLN_PRIOR` (Paris Convention priorities), `TLS205_TECH_REL` (patents which have been technically linked by patent examiners on the basis of similar content) and table `TLS216_APPLN_CONTN` (continuations, divisions etc.).

*For the dummy application (i.e. APPLN_ID = 0) and for artificial applications replenished because of citations (i.e. APPLN_ID > 930 000 000), the value of INPADOC_FAMILY_ID is the same as the value of APPLN_ID. Thus, each such "appln_id" has a family size of exactly one.*

### 1.3. Our Family ID

For constructing our family ID we have used the following algorithm:
1. Initialize the `‘Relation’` table with four columns (*‘Parent_ID’, ‘Parent_Authority’, ‘Prior_Set’, ‘Children_Set’*) using three patent linkage tables `‘TLS204_APPLN_PRIOR’`, `‘TLS205_TECH_REL’`, and `‘TLS216_APPLN_CONTN’` and the application authority data from the `‘TLS201_APPLN’` table. The *‘Parent_ID’* and *‘Parent_Authority’* are the same as *‘Application_ID’* and *‘Application_Authority’*, respectively. The *‘Prior_Set’* is the set of all priority filings that each parent ID points to. The *‘Children_Set’* is the set of all ‘Parent_ID’ that are from the same “Parent_Authority” and have the same “Prior_Set”.  
For the applications that are in the table `‘TLS201_APPLN’` but not in the `‘Relation’` table, add their data to the `‘Relation’` table by setting *'Prior_Set'* and *'Children_Set'* initially containing only the *‘Parent_ID’* as their member.
2. Initialize the `‘Family’` table with three columns as (*‘Application_ID’*, *‘Application_Authority’*, *'Parent_Set'*), where *‘Application_ID’* and *‘Application_Authority’* are the same as *‘appln_id’* and *‘appln_auth’* columns from `‘TLS201_APPLN’`, respectively. And initially, *'Parent_Set'* is the set containing only its *'Application_ID'* as its member.
3. While there exists a *‘Parent_Set’* in the `‘Family’` table that is updated:  
     a.	For each *‘Application_ID’*, update the parent IDs in the *'Parent_Set'* using (*'Parent_ID'*, *'Prior_Set'*) pairs in `‘Relation’` table, only if the initial *'Parent Set'* (at the beginning of step 3) is a subset of the *'Children Set'* (for those application IDs that are pointing to several priors, add all of them to the parent set). Flag the parent sets that have been changed.
4. Assign a unique family ID to each distinct *'Parent_Set'* (applications with the same parent set will be located in the same family).
5. Return the final `‘Family’` table.


# 2. BigQuery Implementation of Our Method

**Note:** This implementation differs slightly from the algorithm described above.

In [1]:
def initialization(linkage_table, appln_table, 
                   relation_table, family_table, 
                   dataset_id, project_id):
    """
    Initializing the relation and family table

    Runs two BigQuery jobs (location 'US', query cache disabled) and waits for
    each to finish. Both destination tables are overwritten (WRITE_TRUNCATE).

    Parameters
    ----------
    linkage_table : str
        Table reference inserted verbatim into the FROM clause; it must expose
        `appln_id` and `prior_appln_id`.
    appln_table : str
        Table reference inserted verbatim into the FROM clause; it must expose
        `appln_id` and `appln_auth`.
    relation_table : str
        Name (inside `dataset_id`) of the table receiving one row per
        (appln_id, prior_appln_id, appln_auth) plus `children_set`, the sorted
        distinct appln_ids that share the same (prior_appln_id, appln_auth).
        Applications without any linkage row are added pointing to themselves.
    family_table : str
        Name (inside `dataset_id`) of the table receiving one row per
        application with `prior_appln_id = appln_id` and `updated = 1`.
    dataset_id, project_id : str
        Dataset and project that hold the two destination tables.

    Returns
    -------
    None
    """
    client = bigquery.Client()
    
    ##### Initializing the relation table #####
    # Creating Job Config
    job_config = bigquery.QueryJobConfig()
    job_config.use_query_cache = False
    # Set configuration.query.writeDisposition
    job_config.write_disposition = 'WRITE_TRUNCATE'

    # Set the destination table
    table_ref = client.dataset(dataset_id, project=project_id).table(relation_table)
    job_config.destination = table_ref

    # The CTE chain must end in a SELECT, otherwise BigQuery rejects the
    # statement and nothing is written to the destination table.
    query = """
    WITH t0 AS(
        SELECT DISTINCT
                appln_id, prior_appln_id, appln_auth
        FROM (
                SELECT appln_id, prior_appln_id, appln_auth
                FROM {linkage}
                LEFT JOIN (
                        SELECT appln_id, appln_auth
                        FROM {appln}
                ) USING(appln_id)
        )
        UNION ALL (
                SELECT 
                        t2.appln_id AS appln_id, 
                        t2.appln_id AS prior_appln_id,
                        t2.appln_auth AS appln_auth
                FROM {linkage} AS t1
                RIGHT JOIN (
                        SELECT appln_id, appln_auth
                        FROM {appln}
                        ) AS t2 ON t1.appln_id=t2.appln_id
                WHERE t1.appln_id IS NULL
        )
    ), t1 AS(
        SELECT appln_id, prior_appln_id, appln_auth
        FROM t0
        GROUP BY appln_id, prior_appln_id, appln_auth
    ), a AS(
        SELECT *
        FROM t1
        LEFT JOIN(
                SELECT prior_appln_id, appln_auth, ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id) AS children_set
                FROM t1
                GROUP BY prior_appln_id, appln_auth
        ) USING(prior_appln_id, appln_auth)
    )
    SELECT *
    FROM a
    """.format(linkage=linkage_table, appln=appln_table)
    # Defining the query
    query_job = client.query(query, location='US', job_config=job_config)
    query_job.result()
    
    ##### Initializing the family table ######
    # Creating Job Config
    job_config = bigquery.QueryJobConfig()
    job_config.use_query_cache = False
    # Set configuration.query.writeDisposition
    job_config.write_disposition = 'WRITE_TRUNCATE'

    # Set the destination table
    table_ref = client.dataset(dataset_id, project=project_id).table(family_table)
    job_config.destination = table_ref

    # Every application starts as its own prior and is flagged as updated so
    # that the first propagation step considers all rows.
    query = """
    SELECT DISTINCT
            appln_id, appln_id AS prior_appln_id, appln_auth, 1 AS updated
    FROM {}
    GROUP BY appln_id, appln_auth
    """.format(appln_table)
    # Defining the query
    query_job = client.query(query, location='US', job_config=job_config)
    query_job.result()
    
    return None

In [2]:
def updating_step(relation_table, family_table, 
                  dataset_id, project_id):
    """
    Running one propagation step and overwriting the family table in place

    Each family row (appln_id, prior_appln_id) is joined to the relation table
    on `family.prior_appln_id = relation.appln_id`. `z` is the size of the
    relation row's `children_set` minus the number of its elements that also
    appear in the application's `parent_set` (its distinct current priors).
    When `z = 0` the row takes the relation row's prior, and `updated` is 1 if
    that differs from the current prior; otherwise the current prior is kept
    and `updated` is 0.

    Parameters
    ----------
    relation_table : str
        Name of the relation table inside `dataset_id`.
    family_table : str
        Name of the family table inside `dataset_id`; it is both the input and
        the destination (WRITE_TRUNCATE).
    dataset_id, project_id : str
        Dataset and project that hold both tables.

    Returns
    -------
    None
    """
    # Initializing the table names
    dest_table = family_table
    relation_table = '`{0}.{1}.{2}`'.format(project_id, dataset_id, relation_table)
    family_table = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)
    
    client = bigquery.Client()
    # Creating Job Config
    job_config = bigquery.QueryJobConfig()
    job_config.use_query_cache = False
    # Set configuration.query.writeDisposition
    job_config.write_disposition = 'WRITE_TRUNCATE'
    # Set the destination table
    table_ref = client.dataset(dataset_id, project=project_id).table(dest_table)
    job_config.destination = table_ref

    # The first CTE must be called `t0`: that is the name the `family_table`
    # CTE selects from and aggregates over.
    query="""
    WITH t0 AS(
        SELECT appln_id, prior_appln_id, appln_auth
        FROM {}
        GROUP BY appln_id, prior_appln_id, appln_auth
    ), family_table AS(
        SELECT *
        FROM t0
        LEFT JOIN(
                SELECT appln_id, ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id) AS parent_set
                FROM t0
                GROUP BY appln_id
        ) USING(appln_id)
    ), joined AS(
        SELECT 
                a.appln_id,
                a.prior_appln_id AS prior_appln_b,
                b.prior_appln_id AS prior_appln_a,
                a.parent_set,
                b.children_set,
                a.appln_auth
        FROM family_table AS a
        LEFT JOIN {} AS b ON a.prior_appln_id=b.appln_id
    )

    SELECT DISTINCT
        appln_id,
        appln_auth,
        (CASE WHEN z=0 THEN prior_appln_a ELSE prior_appln_b END) AS prior_appln_id,
        (CASE WHEN z=0 AND prior_appln_a<>prior_appln_b THEN 1 ELSE 0 END) AS updated
    FROM 
        joined AS c,
        UNNEST([(
                SELECT ARRAY_LENGTH(c.children_set) - COUNT(1) 
                FROM UNNEST(c.children_set) AS x
                JOIN UNNEST(c.parent_set) AS y
                ON x=y)]) AS z

    """.format(family_table, relation_table)
    
    # Defining the query
    query_job = client.query(query, location='US', job_config=job_config)

    query_job.result()
    
    return None

In [3]:
def termination_condition(family_table, dataset_id, project_id):
    """
    Summarising the `updated` flags of the family table

    Returns the result of the query as a DataFrame with a single row holding
    `sum_updated` (SUM of the `updated` column) and `nb_rows` (COUNT of the
    `updated` column) for `project_id.dataset_id.family_table`.
    """
    qualified_name = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)
    bq_client = bigquery.Client()
    sql = """
    SELECT SUM(updated) AS sum_updated, COUNT(updated) AS nb_rows
    FROM {}
    """.format(qualified_name)
    summary_job = bq_client.query(sql)
    return summary_job.to_dataframe()

In [4]:
def remove_cycles(family_table, dest_table, 
                  dataset_id, project_id):
    """
    Keeping only the settled rows of the family table

    The family table is collapsed to one row per (appln_id, prior_appln_id),
    taking any `appln_auth` and the minimum `updated` flag of the group; only
    groups whose minimum flag is 0 are written to `dest_table`, which is
    overwritten (WRITE_TRUNCATE). Blocks until the job has finished.
    """
    client = bigquery.Client()

    # Job configuration: no cache, overwrite the destination table
    destination_ref = client.dataset(dataset_id, project=project_id).table(dest_table)
    job_config = bigquery.QueryJobConfig()
    job_config.use_query_cache = False
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.destination = destination_ref

    # Fully qualified, back-quoted name of the source table
    source_name = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)
    sql = """
    WITH a AS(
        SELECT appln_id, ANY_VALUE(appln_auth) AS appln_auth, prior_appln_id, MIN(updated) AS updated
        FROM {}
        GROUP BY appln_id, prior_appln_id
    )
        SELECT DISTINCT *
        FROM a
        WHERE updated=0
    """.format(source_name)

    # Submit and wait for completion
    client.query(sql, location='US', job_config=job_config).result()
    return None

In [5]:
def computing_familyID(family_table, dest_table, dataset_id, project_id):
    """
    Deriving one family ID per application and writing it to `dest_table`

    Steps performed by the query on `project_id.dataset_id.family_table`:
      * `b`: for every row, count the distinct applications sharing the same
        (prior_appln_id, appln_auth); keep `prior_appln_id` when that count is
        exactly one, otherwise use the row's own `appln_id` as its prior.
      * `a`: attach to every row the sorted distinct priors of its application,
        both as an array (`parent_set`) and as a JSON string (`priors_str`).
      * `c`: group by `priors_str`; when the parent set has more than one
        element the family ID is the smallest `appln_id` of the group,
        otherwise it is the single element of the parent set.
    The distinct (appln_id, appln_auth, family_id) rows overwrite `dest_table`
    (WRITE_TRUNCATE). Blocks until the job has finished.
    """
    client = bigquery.Client()

    # Job configuration: no cache, overwrite the destination table
    destination_ref = client.dataset(dataset_id, project=project_id).table(dest_table)
    job_config = bigquery.QueryJobConfig()
    job_config.use_query_cache = False
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.destination = destination_ref

    # Fully qualified, back-quoted name of the source table
    source_name = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)
    sql = """
    WITH b AS(
        SELECT
            appln_id,
            appln_auth,
            (CASE WHEN ARRAY_LENGTH(children_set)=1 THEN prior_appln_id ELSE appln_id END) AS prior_appln_id
        FROM {0} AS a
        LEFT JOIN(
                SELECT prior_appln_id, appln_auth, ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id) AS children_set
                FROM {0}
                GROUP BY prior_appln_id, appln_auth
        ) USING(prior_appln_id, appln_auth)
    ), a AS(
        SELECT 
            *
        FROM b
        LEFT JOIN(
                SELECT 
                    appln_id, 
                    ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id) AS parent_set,
                    TO_JSON_STRING(ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id)) AS priors_str
                FROM b
                GROUP BY appln_id
        ) USING(appln_id)
    )
    
    SELECT DISTINCT *
    FROM(
        SELECT DISTINCT
            appln_id,
            appln_auth,
            family_id
        FROM(
            SELECT 
                appln_id,
                appln_auth,
                c.prior_appln_id
            FROM a
            LEFT JOIN(
                SELECT
                    priors_str,
                    ANY_VALUE(parent_set) AS parent_set,
                    (CASE WHEN ARRAY_LENGTH(ANY_VALUE(parent_set))>1 
                          THEN ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id LIMIT 1) 
                    ELSE ANY_VALUE(parent_set) END) AS prior_appln_id
                FROM a
                GROUP BY priors_str
            ) AS c USING(priors_str)
        ) AS t1, UNNEST(t1.prior_appln_id) AS family_id
    )
    """.format(source_name)

    # Submit and wait for completion
    client.query(sql, location='US', job_config=job_config).result()

In [7]:
# Finding family ID
def finding_family(linkage_table, appln_table, # The full table path for these two tables
                   family_table, relation_table, final_table, # Only table names for these tables
                   dataset_id, project_id, n_iter=None):
    """
    Building the family IDs end to end

    Runs `initialization`, then up to `n_iter` rounds of `updating_step`,
    stopping early as soon as `termination_condition` reports no row flagged
    as updated. If rows are still flagged after the last round,
    `remove_cycles` writes the settled rows to `final_table` and the family
    IDs are computed from it; otherwise they are computed directly from
    `family_table`. In both cases the result is written to `final_table`.

    Parameters
    ----------
    linkage_table, appln_table : str
        Full table references, forwarded to `initialization`.
    family_table, relation_table, final_table : str
        Table names inside `dataset_id`.
    dataset_id, project_id : str
        Dataset and project that hold the working tables.
    n_iter : int, optional
        Maximum number of update rounds (at least 1). When omitted, a
        notebook-level variable named `n_iter` is used if one is defined,
        which keeps earlier calling code working.

    Raises
    ------
    ValueError
        If no iteration cap is available or it is smaller than 1.
    """
    if n_iter is None:
        # Backward compatibility: this function used to read a global `n_iter`.
        n_iter = globals().get('n_iter')
    if n_iter is None:
        raise ValueError('n_iter must be given (maximum number of update rounds).')
    n_iter = int(n_iter)
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1, got {}.'.format(n_iter))

    start_time = time.time()
    pbar = tqdm(total=100)
    remaining = 0
    n_updates = 0
    try:
        initialization(linkage_table=linkage_table, appln_table=appln_table, 
                       relation_table=relation_table, family_table=family_table, 
                       dataset_id=dataset_id, project_id=project_id)
        pbar.update(10)

        for n_updates in range(1, n_iter + 1):
            updating_step(relation_table=relation_table, family_table=family_table, 
                          dataset_id=dataset_id, project_id=project_id)
            previous_remaining = remaining
            status = termination_condition(family_table=family_table, 
                                           dataset_id=dataset_id, 
                                           project_id=project_id)
            # SUM over an empty table is NULL; treat that as "nothing left".
            raw_remaining = status.loc[0, 'sum_updated']
            remaining = 0 if pd.isna(raw_remaining) else int(raw_remaining)
            nb_rows = int(status.loc[0, 'nb_rows'])

            clear_output(wait=True)
            pbar.update(int(80/n_iter))
            print('Remaining {:,}/{:,}\nThere are {:,} elements that has been finished in the previous step!'
                  .format(remaining, nb_rows, previous_remaining - remaining))
            if not remaining:
                break

        if remaining:
            # Still-changing rows after the last round: drop them first.
            remove_cycles(family_table=family_table, dest_table=final_table,
                          dataset_id=dataset_id, project_id=project_id)
            source_table = final_table
        else:
            # Converged: `final_table` has not been written by this run, so the
            # family IDs must be computed from the family table itself.
            source_table = family_table
        # Jump the bar to 90% regardless of how many rounds were needed.
        pbar.update(90 - pbar.n)
        computing_familyID(family_table=source_table, dest_table=final_table, 
                           dataset_id=dataset_id, project_id=project_id)
        pbar.update(10)
    finally:
        # Release the progress bar even when a BigQuery job fails.
        pbar.close()
    print('It took {:.2f} seconds in total.\nThe number of updates: {} times!'.format(time.time()-start_time, n_updates))