In [1]:
import pandas as pd
from google.cloud import bigquery
%load_ext google.cloud.bigquery

from IPython.display import clear_output
from tqdm import tqdm
import time

# 1. A Word on Different Family Types

### 1.1. DOCDB family (simple family, or Espacenet patent family)

All applications of the same simple family do have the **same priorities**. The technical content of these family members is regarded as **(almost) identical**, so their publications are sometimes called “equivalent”.

This means that the applications that **share the same priorities** (Paris Convention or technical relation or others as contained in table `TLS201_APPLN`, `TLS204_APPLN_PRIOR`, `TLS205_TECH_REL` and `TLS216_APPLN_CONTN`) will be assigned to the same family.

However, the EPO reserves the right to classify an application into a particular simple family irrespective of this general rule - the EPO does this by creating artificial priorities for an application or by ignoring certain priorities (declaring them “inactive”) for the purpose of family building.


By “priority” here, we do not mean only “Paris Convention priority”, but also other types of priorities which link one application to a “prior” application. The various types of priorities are stored in separate tables:
1. TLS201_APPLN: a PCT application in its regional/national phase contains in its attribute INTERNAT_APPLN_ID the APPLN_ID of its original PCT application
2. TLS204_APPLN_PRIOR contains Paris Convention priorities
3. TLS205_TECH_REL contains links between technically equivalent applications
4. TLS216_APPLN_CONTN contains various relations like continuations, divisional applications, ...


### 1.2. INPADOC family (Extended family)

All applications of the same extended family are **directly or indirectly** linked to the **same root** priority application. Usually the applications are related to the same technical invention, but their individual content may differ.

This means applications that **share the same priority** `directly` or `indirectly` via other applications. 
A **'priority'** in this case means a **link shown** between applications as in tables `TLS201_APPLN` (regional/national phase of a PCT application), `TLS204_APPLN_PRIOR` (Paris Convention priorities), `TLS205_TECH_REL` (patents which have been technically linked by patent examiners on the basis of similar content) and table `TLS216_APPLN_CONTN` (continuations, divisions etc.).

*For the dummy application (i.e. APPLN_ID = 0) and for artificial application replenished because of citations (i.e. APPLN_ID > 930 000 000) the value of the INPADOC_FAMILY_ID will be the same as the value of the APPLN_ID. Thus, each "appln_id" will have the family size of exactly one.*

## 1.3. Our Custom Family ID

For constructing our family ID we have used the following algorithm:
1. Initialize the `‘Relation’` table with four columns (*‘Parent_ID’, ‘Parent_Authority’, ‘Prior_Set’, ‘Children_Set'*) using three patent linkage tables `‘TLS204_APPLN_PRIOR’`, `‘TLS205_TECH_REL’`, and `‘TLS216_APPLN_CONTN’` and the application authority data from `‘TLS201_APPLN’` table. The *‘Parent_ID’* and *‘Parent_Authority’* are the same as *‘Application_ID’* and *‘Application_Authority’*, respectively. The *‘Prior_Set’* is the set of all priority filings that each parent id is pointing to. The children set is the set of all ‘Parent_ID’ that are from the same “Parent_Authority” and have the same “Prior_Set”.  
For the applications that are in the table `‘TLS201_APPLN’` but not in the `‘Relation’` table, add their data to the `‘Relation’` table by setting *'Prior_Set'* and *'Children_Set'* initially containing only the *‘Parent_ID’* as their member.
2. Initialize the `‘Family’` table with three columns as (*‘Application_ID’*, *‘Application_Authority’*, *'Parent_Set'*), where *‘Application_ID’* and *‘Application_Authority’* are the same as *‘appln_id’* and *‘appln_auth’* columns from `‘TLS201_APPLN’`, respectively. And initially, *'Parent_Set'* is the set containing only its *'Application_ID'* as its member.
3. While there exists a *‘Parent_Set’* in the `‘Family’` table that is updated:  
     a.	For each *‘Application_ID’*, update the parent IDs in the *'Parent_Set'* using (*'Parent_ID'*, *'Prior_Set'*) pairs in `‘Relation’` table, only if the initial *'Parent Set'* (at the beginning of step 3) is a subset of the *'Children Set'* (for those application IDs that are pointing to several priors, add all of them to the parent set). Flag the parent sets that have been changed.
4. Assign a unique family ID to each distinct *'Parent_Set'* (applications with the same parent set will be located in the same family).
5. Return the final `‘Family’` table.

**More information on this family ID can be found in the "Appendix B" of the paper.**

# 2. BigQuery Implementation of Our Method

**Note:** This implementation is slightly different from the above written algorithm. 

In [16]:
# Second version of Initialization Function (Main Algorithm)
def initialization(linkage_table, appln_table, 
                   relation_table, family_table, 
                   dataset_id, project_id):
    """
    Create (or overwrite) the relation table and the family table in BigQuery.

    Relation table: one row per distinct (appln_id, prior_appln_id, appln_auth)
    link found in `linkage_table` (with `appln_auth` looked up in `appln_table`),
    plus a self-link (prior_appln_id = appln_id) for every application of
    `appln_table` that has no row in `linkage_table`. Every row also carries
    `children_set`, the sorted distinct appln_ids that share the row's
    (prior_appln_id, appln_auth) pair.

    Family table: one row per (appln_id, appln_auth) of `appln_table`, with
    `prior_appln_id` equal to the application itself and `updated` equal to 1.

    Parameters
    ----------
    linkage_table, appln_table : str
        Table references that are pasted as-is into the SQL text.
    relation_table, family_table : str
        Bare names of the two destination tables inside `dataset_id`.
    dataset_id, project_id : str
        Dataset and project that hold the destination tables.

    Returns
    -------
    None
    """
    bq = bigquery.Client()

    def _overwrite_config(table_name):
        # Job settings shared by both queries: bypass the query cache and
        # replace whatever the destination table currently contains.
        cfg = bigquery.QueryJobConfig()
        #cfg.dry_run = True
        cfg.write_disposition = 'WRITE_TRUNCATE'
        cfg.use_query_cache = False
        cfg.destination = bq.dataset(dataset_id, project=project_id).table(table_name)
        return cfg

    ##### Relation table #####
    relation_sql = """
    WITH t0 AS(
        SELECT DISTINCT
                appln_id, prior_appln_id, appln_auth
        FROM (
                SELECT appln_id, prior_appln_id, appln_auth
                FROM {}
                LEFT JOIN (
                        SELECT appln_id, appln_auth
                        FROM {}
                ) USING(appln_id)
        )
        UNION ALL (
                SELECT 
                        t2.appln_id AS appln_id, 
                        t2.appln_id AS prior_appln_id,
                        t2.appln_auth AS appln_auth
                FROM {} AS t1
                RIGHT JOIN (
                        SELECT appln_id, appln_auth
                        FROM {}
                        ) AS t2 ON t1.appln_id=t2.appln_id
                WHERE t1.appln_id IS NULL
        )
    ), t1 AS(
        SELECT appln_id, prior_appln_id, appln_auth
        FROM t0
        GROUP BY appln_id, prior_appln_id, appln_auth
    )
    
    SELECT *
    FROM t1
    LEFT JOIN(
            SELECT prior_appln_id, appln_auth, ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id) AS children_set
            FROM t1
            GROUP BY prior_appln_id, appln_auth
    ) USING(prior_appln_id, appln_auth)
    """.format(linkage_table, appln_table, linkage_table, appln_table)
    # Block until the relation table has been written
    bq.query(relation_sql, location='US',
             job_config=_overwrite_config(relation_table)).result()

    ##### Family table #####
    family_sql = """
    SELECT DISTINCT
            appln_id, appln_id AS prior_appln_id, appln_auth, 1 AS updated
    FROM {}
    GROUP BY appln_id, appln_auth
    """.format(appln_table)
    # Block until the family table has been written
    bq.query(family_sql, location='US',
             job_config=_overwrite_config(family_table)).result()

    return None

In [10]:
# Second Version of Update function (Main Algorithm)
def updating_step(relation_table, family_table, 
                  dataset_id, project_id):
    """
    Run one propagation pass and overwrite the family table with the result.

    Every family row is joined to the relation rows whose `appln_id` equals the
    family row's current `prior_appln_id`. For each joined row the query
    computes `z`: the length of the relation row's `children_set` minus the
    number of matching pairs between `children_set` and the application's
    `parent_set` (its distinct current priors). When `z` is 0 the row's prior
    is replaced by the relation row's prior, and `updated` is 1 if that
    actually changed the value; otherwise the prior is kept and `updated` is 0.

    Parameters
    ----------
    relation_table, family_table : str
        Bare table names inside `dataset_id`; `family_table` is both read
        and overwritten.
    dataset_id, project_id : str
        Dataset and project that hold both tables.

    Returns
    -------
    None
    """
    # Backtick-quoted, fully-qualified names for use inside the SQL text
    qualify = '`{0}.{1}.{2}`'.format
    relation_fq = qualify(project_id, dataset_id, relation_table)
    family_fq = qualify(project_id, dataset_id, family_table)

    bq = bigquery.Client()

    # Uncached job that truncates and rewrites the family table itself
    cfg = bigquery.QueryJobConfig()
    #cfg.dry_run = True
    cfg.destination = bq.dataset(dataset_id, project=project_id).table(family_table)
    cfg.write_disposition = 'WRITE_TRUNCATE'
    cfg.use_query_cache = False

    sql = """
    WITH t1 AS(
        SELECT appln_id, prior_appln_id, appln_auth
        FROM {}
        GROUP BY appln_id, prior_appln_id, appln_auth
    ), family_table AS(
        SELECT *
        FROM t1
        LEFT JOIN(
                SELECT appln_id, ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id) AS parent_set
                FROM t1
                GROUP BY appln_id
        ) USING(appln_id)
    ), joined AS(
        SELECT 
                a.appln_id,
                a.prior_appln_id AS prior_appln_b,
                b.prior_appln_id AS prior_appln_a,
                a.parent_set,
                b.children_set,
                a.appln_auth
        FROM family_table AS a
        LEFT JOIN {} AS b ON a.prior_appln_id=b.appln_id
    )

    SELECT DISTINCT
        appln_id,
        appln_auth,
        (CASE WHEN z=0 THEN prior_appln_a ELSE prior_appln_b END) AS prior_appln_id,
        (CASE WHEN z=0 AND prior_appln_a<>prior_appln_b THEN 1 ELSE 0 END) AS updated
    FROM 
        joined AS c,
        UNNEST([(
                SELECT ARRAY_LENGTH(c.children_set) - COUNT(1) 
                FROM UNNEST(c.children_set) AS x
                JOIN UNNEST(c.parent_set) AS y
                ON x=y)]) AS z

    """.format(family_fq, relation_fq)

    # Submit and wait for the rewrite to finish
    bq.query(sql, location='US', job_config=cfg).result()

    return None

In [None]:
## Second iteration of Update function
def updating_step(relation_table, family_table, 
                  dataset_id, project_id):
    """
    Run one propagation pass and overwrite the family table with the result.

    Every family row is joined to the relation rows whose `appln_id` equals the
    family row's current `prior_appln_id`. For each joined row the query
    computes `z`: the length of the relation row's `children_set` minus the
    number of matching pairs between `children_set` and the application's
    `parent_set` (its distinct current priors). When `z` is 0 the row's prior
    is replaced by the relation row's prior, and `updated` is 1 if that
    actually changed the value; otherwise the prior is kept and `updated` is 0.

    Parameters
    ----------
    relation_table, family_table : str
        Bare table names inside `dataset_id`; `family_table` is both read
        and overwritten.
    dataset_id, project_id : str
        Dataset and project that hold both tables.

    Returns
    -------
    None
    """
    # Keep the bare family table name: it is the query destination, while the
    # SQL text needs the fully-qualified, backtick-quoted names.
    dest_family = family_table
    relation_table = '`{0}.{1}.{2}`'.format(project_id, dataset_id, relation_table)
    family_table = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)
    
    client = bigquery.Client()
    # Creating Job Config
    job_config = bigquery.QueryJobConfig()
    #job_config.dry_run = True
    job_config.use_query_cache = False
    # Set configuration.query.writeDisposition
    job_config.write_disposition = 'WRITE_TRUNCATE'
    # Set the destination table (previously referenced an undefined
    # `dest_table`, which raised NameError on every call)
    table_ref = client.dataset(dataset_id, project=project_id).table(dest_family)
    job_config.destination = table_ref

    query="""
    WITH t1 AS(
        SELECT appln_id, prior_appln_id, appln_auth
        FROM {}
        GROUP BY appln_id, prior_appln_id, appln_auth
    ), family_table AS(
        SELECT *
        FROM t1
        LEFT JOIN(
                SELECT appln_id, ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id) AS parent_set
                FROM t1
                GROUP BY appln_id
        ) USING(appln_id)
    ), joined AS(
        SELECT 
                a.appln_id,
                a.prior_appln_id AS prior_appln_b,
                b.prior_appln_id AS prior_appln_a,
                a.parent_set,
                b.children_set,
                a.appln_auth
        FROM family_table AS a
        LEFT JOIN {} AS b ON a.prior_appln_id=b.appln_id
    )

    SELECT DISTINCT
        appln_id,
        appln_auth,
        (CASE WHEN z=0 THEN prior_appln_a ELSE prior_appln_b END) AS prior_appln_id,
        (CASE WHEN z=0 AND prior_appln_a<>prior_appln_b THEN 1 ELSE 0 END) AS updated
    FROM 
        joined AS c,
        UNNEST([(
                SELECT ARRAY_LENGTH(c.children_set) - COUNT(1) 
                FROM UNNEST(c.children_set) AS x
                JOIN UNNEST(c.parent_set) AS y
                ON x=y)]) AS z

    """.format(family_table, relation_table)
    
    # Submit the query and wait until the family table has been rewritten
    query_job = client.query(query, location='US', job_config=job_config)

    query_job.result()
    
    return None

In [11]:
def termination_condition(family_table, dataset_id, project_id):
    """
    Summarise the `updated` flags of the family table.

    Parameters
    ----------
    family_table : str
        Bare table name inside `dataset_id`.
    dataset_id, project_id : str
        Dataset and project that hold the table.

    Returns
    -------
    The query result converted with `to_dataframe()`: a single row with
    `sum_updated` (SUM of `updated`) and `nb_rows` (COUNT of `updated`).
    """
    sql = """
    SELECT SUM(updated) AS sum_updated, COUNT(updated) AS nb_rows
    FROM {}
    """.format('`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table))
    return bigquery.Client().query(sql).to_dataframe()

In [12]:
def remove_cycles(family_table, dest_table, 
                  dataset_id, project_id):
    """
    Copy the settled links of the family table into `dest_table`.

    Rows are collapsed per (appln_id, prior_appln_id) pair, keeping any
    `appln_auth` and the minimum `updated` flag, and only pairs whose minimum
    flag is 0 are written. `dest_table` is overwritten.

    Parameters
    ----------
    family_table : str
        Bare name of the source table inside `dataset_id`.
    dest_table : str
        Bare name of the destination table inside `dataset_id`.
    dataset_id, project_id : str
        Dataset and project that hold both tables.

    Returns
    -------
    None
    """
    # Backtick-quoted, fully-qualified source name for the SQL text
    source_fq = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)

    bq = bigquery.Client()

    # Uncached job that truncates and rewrites the destination table
    cfg = bigquery.QueryJobConfig()
    #cfg.dry_run = True
    cfg.destination = bq.dataset(dataset_id, project=project_id).table(dest_table)
    cfg.write_disposition = 'WRITE_TRUNCATE'
    cfg.use_query_cache = False

    sql = """
    WITH a AS(
        SELECT appln_id, ANY_VALUE(appln_auth) AS appln_auth, prior_appln_id, MIN(updated) AS updated
        FROM {}
        GROUP BY appln_id, prior_appln_id
    )
        SELECT DISTINCT *
        FROM a
        WHERE updated=0
    """.format(source_fq)

    # Submit and wait for completion
    bq.query(sql, location='US', job_config=cfg).result()

    return None

In [13]:
def computing_familyID(family_table, dest_table, dataset_id, project_id):
    """
    Derive one `family_id` per application and write the result to `dest_table`.

    Steps performed by the query:
      * `b`: keep a row's `prior_appln_id` only when its
        (prior_appln_id, appln_auth) group holds exactly one distinct
        application; otherwise the application becomes its own prior.
      * `a`: attach to every row the application's sorted distinct priors
        (`parent_set`) and their JSON text (`priors_str`).
      * Applications are grouped by `priors_str`. A group whose parent set has
        more than one element gets the smallest `appln_id` of the group as
        family id; otherwise the single parent is used.
    The output columns are `appln_id`, `appln_auth` and `family_id`.

    Parameters
    ----------
    family_table : str
        Bare name of the source table inside `dataset_id`.
    dest_table : str
        Bare name of the destination table inside `dataset_id` (overwritten).
    dataset_id, project_id : str
        Dataset and project that hold both tables.
    """
    # Backtick-quoted, fully-qualified source name for the SQL text
    source_fq = '`{0}.{1}.{2}`'.format(project_id, dataset_id, family_table)

    bq = bigquery.Client()

    # Uncached job that truncates and rewrites the destination table
    cfg = bigquery.QueryJobConfig()
    #cfg.dry_run = True
    cfg.destination = bq.dataset(dataset_id, project=project_id).table(dest_table)
    cfg.write_disposition = 'WRITE_TRUNCATE'
    cfg.use_query_cache = False

    sql = """
    WITH b AS(
        SELECT
            appln_id,
            appln_auth,
            (CASE WHEN ARRAY_LENGTH(children_set)=1 THEN prior_appln_id ELSE appln_id END) AS prior_appln_id
        FROM {0} AS a
        LEFT JOIN(
                SELECT prior_appln_id, appln_auth, ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id) AS children_set
                FROM {0}
                GROUP BY prior_appln_id, appln_auth
        ) USING(prior_appln_id, appln_auth)
    ), a AS(
        SELECT 
            *
        FROM b
        LEFT JOIN(
                SELECT 
                    appln_id, 
                    ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id) AS parent_set,
                    TO_JSON_STRING(ARRAY_AGG(DISTINCT prior_appln_id ORDER BY prior_appln_id)) AS priors_str
                FROM b
                GROUP BY appln_id
        ) USING(appln_id)
    )
    
    SELECT DISTINCT *
    FROM(
        SELECT DISTINCT
            appln_id,
            appln_auth,
            family_id
        FROM(
            SELECT 
                appln_id,
                appln_auth,
                c.prior_appln_id
            FROM a
            LEFT JOIN(
                SELECT
                    priors_str,
                    ANY_VALUE(parent_set) AS parent_set,
                    (CASE WHEN ARRAY_LENGTH(ANY_VALUE(parent_set))>1 
                          THEN ARRAY_AGG(DISTINCT appln_id ORDER BY appln_id LIMIT 1) 
                    ELSE ANY_VALUE(parent_set) END) AS prior_appln_id
                FROM a
                GROUP BY priors_str
            ) AS c USING(priors_str)
        ) AS t1, UNNEST(t1.prior_appln_id) AS family_id
    )
    """.format(source_fq)

    # Submit and wait for completion
    bq.query(sql, location='US', job_config=cfg).result()

In [14]:
# Finding family ID
def finding_family(linkage_table, appln_table, # The full table path for these two table
                   family_table, relation_table, final_table, # Only table names for these table
                   dataset_id, project_id, n_iter=10):
    """
    Run the whole family-ID pipeline and write the result to `final_table`.

    The relation and family tables are initialised, then up to `n_iter`
    update passes are executed; the loop stops early as soon as a pass leaves
    no row flagged as updated. If rows are still flagged after the last pass,
    `remove_cycles` copies the settled links into `final_table` and the family
    IDs are computed from that copy. If the loop converged (or no pass was
    requested), the family IDs are computed directly from `family_table`.

    Parameters
    ----------
    linkage_table, appln_table : str
        Table references passed verbatim to `initialization`.
    family_table, relation_table, final_table : str
        Bare table names inside `dataset_id`.
    dataset_id, project_id : str
        Dataset and project that hold the working and final tables.
    n_iter : int, default 10
        Maximum number of update passes.

    Returns
    -------
    None. Progress and timing information is printed.
    """
    start_time = time.time()
    pbar = tqdm(total=100)
    # Counted explicitly so the final report also works when n_iter is 0
    n_updates = 0
    try:
        continue_cond = pd.DataFrame(dict(sum_updated=[0], nb_rows=[0]))

        initialization(linkage_table=linkage_table, appln_table=appln_table, 
                       relation_table=relation_table, family_table=family_table, 
                       dataset_id=dataset_id, project_id=project_id)
        pbar.update(10)

        for _ in range(n_iter):
            updating_step(relation_table=relation_table, family_table=family_table, 
                          dataset_id=dataset_id, project_id=project_id)
            n_updates += 1
            previous_updates = continue_cond.loc[0,'sum_updated']
            continue_cond = termination_condition(family_table=family_table, 
                                                  dataset_id=dataset_id, 
                                                  project_id=project_id)

            clear_output(wait=True)
            pbar.update(int(80/n_iter))
            print('Remaining {:,}/{:,}\nThere are {:,} elements that has been finished in the previous step!'
                  .format(continue_cond.loc[0,'sum_updated'], continue_cond.loc[0,'nb_rows'], 
                          previous_updates - continue_cond.loc[0,'sum_updated']))
            if not continue_cond.loc[0,'sum_updated']: 
                break

        if continue_cond.loc[0,'sum_updated']:
            # Some rows never settled: keep only the settled links
            remove_cycles(family_table=family_table, dest_table=final_table,
                          dataset_id=dataset_id, project_id=project_id)
            source_table = final_table
        else:
            # Converged: `final_table` has not been written, so read the
            # family table itself instead of a missing or stale table
            source_table = family_table
        # Jump the bar to 90% regardless of how many passes actually ran
        pbar.n=90
        pbar.last_printed_n =90
        computing_familyID(family_table=source_table, dest_table=final_table, 
                           dataset_id=dataset_id, project_id=project_id)
        pbar.update(10)
    finally:
        # Release the progress bar even when a BigQuery step fails
        pbar.close()
    print('It took {:.2f} seconds in total.\nThe number of updates: {} times!'.format(time.time()-start_time, n_updates))

# 3. Running the algorithm

## 3.1. Creating the linkage patent table containing three tables from PATSTAT

In [15]:
# Build the linkage table: the union of the three PATSTAT link tables
# (tls204, tls205, tls216), reduced to (appln_id, prior_appln_id) pairs.
client = bigquery.Client()
# Creating Job Config
job_config = bigquery.QueryJobConfig()
#job_config.dry_run = True
# Always execute the query instead of serving a cached result
job_config.use_query_cache = False
# Set configuration.query.writeDisposition: overwrite the destination table
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set the destination table
table_id = 'linkage_table'
dataset_id = 'custom_family'
project_id = 'usptobias'
table_ref = client.dataset(dataset_id, project=project_id).table(table_id)
job_config.destination = table_ref

# UNION ALL keeps duplicate pairs; the linked-application column of each
# source table is renamed to `prior_appln_id` so the three results line up.
query="""
SELECT appln_id, prior_appln_id
FROM `usptobias.patstat.tls204` UNION ALL
SELECT appln_id, tech_rel_appln_id AS prior_appln_id
FROM `usptobias.patstat.tls205` UNION ALL
SELECT appln_id, parent_appln_id AS prior_appln_id
FROM `usptobias.patstat.tls216`
"""

# Defining the query
query_job = client.query(query, location='US', job_config=job_config)

# Wait for the job to finish; as the cell's last expression, the returned
# row iterator is also what the cell displays
query_job.result()


<google.cloud.bigquery.table.RowIterator at 0x1bbdbc68fd0>

## 3.2. Computing the family ID

In [15]:
# Creating the family_id using only "priority dataset"
finding_family(linkage_table = '`usptobias.custom_family.linkage_table`', 
               appln_table = '`usptobias.patstat.tls201`',
               family_table = 'family_table_2', relation_table = 'relation_table_2', 
               final_table = 'family_customDef_2',
               dataset_id = 'custom_family', 
               project_id = 'usptobias', n_iter=10)

 90%|████████████████████████████████████████████████████████████████████████▉        | 90/100 [26:23<02:52, 17.21s/it]

Remaining 6,921,507/110,056,972
There are 771,938 elements that has been finished in the previous step!


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [29:37<00:00, 17.85s/it]


It took 1777.06 seconds in total.
The number of updates: 10 times!


# 4. Validating the results

There is no straight-forward method or dataset for the validation of the results of our algorithm. Thus, we try to investigate the quality of the results by randomly looking at different families with different characteristics.

## 4.1. First example: Investigating a sample family with IPC code 'F01B'

In this example we look for the applications in the "Mechanical Engineering" technical field using `TLS209_APPLN_IPC` and `TLS201_APPLN` tables in PATSTAT v2018b.
For this purpose, we look for any applications with IPC class of **F01B**. 

In the following query we look for families with more than one member that contain applications filed from 2002 through 2011 (the query filters on `appln_filing_year>=2002 AND appln_filing_year<2012`) and classified in the IPC "F01B" group. Then, we take the first resulting family as a sample.

In [6]:
%%bigquery df_
-- Sample of custom families (with more than one member) that contain
-- F01B-classified applications; the result is stored in `df_`.
-- CTE `a`: applications whose IPC symbol contains "F01B", joined to their
-- tls201 attributes and restricted to filing years 2002..2011.
WITH a AS(
  SELECT * 
  FROM( SELECT *
        FROM( SELECT appln_id, ipc_class_symbol
              FROM `usptobias.patstat.tls209`
              WHERE REGEXP_CONTAINS(ipc_class_symbol, r'.*F01B.*') ) AS t1
        INNER JOIN (  SELECT appln_id, appln_filing_year, appln_kind, appln_nr
                      FROM `usptobias.patstat.tls201` ) AS t2 USING(appln_id) )
        WHERE appln_filing_year>=2002 AND appln_filing_year<2012 
)

-- The inner joins keep only applications that have a custom family row AND at
-- least one tls204 priority row; an application appears once per
-- (IPC symbol, prior_appln_id) combination.
SELECT *
FROM(   SELECT t1.*, a.* EXCEPT(appln_id), t2.prior_appln_id
        FROM `usptobias.custom_family.custom_familyDef` AS t1
        INNER JOIN a USING(appln_id)
        INNER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)  )
-- Family size, counted over the whole custom family table; singletons dropped
INNER JOIN( SELECT family_id, COUNT(appln_id) AS cnt_family
            FROM `usptobias.custom_family.custom_familyDef`
            GROUP BY family_id
            HAVING COUNT(appln_id)>1 ) USING(family_id)
ORDER BY family_id
LIMIT 10000

In [9]:
df_.head(3)

Unnamed: 0,family_id,appln_id,appln_auth,ipc_class_symbol,appln_filing_year,appln_kind,appln_nr,prior_appln_id,cnt_family
0,17440,457938321,UA,F01B 3/00,2009,A,201012301,17440,8
1,17440,58055494,WO,F01B 3/04,2009,W,2009050440,17440,8
2,17440,336156243,EA,F01B 3/04,2009,A,201071091,17440,8


Now, we look at the family information of all applications that are in the same custom family (`=17440`) or have the same DOCDB family as `39587978`, which is the DOCDB family ID of applications in this custom family.

In [21]:
%%bigquery
-- Inspect one sample: every application that is in custom family 17440, is
-- application 17440 itself, or belongs to DOCDB family 39587978.
SELECT 
    appln_id, appln_auth, appln_nr, family_id, docdb_family_id, docdb_family_size, appln_filing_year, prior_appln_id
FROM (  SELECT *
        FROM `usptobias.custom_family.custom_familyDef`  )
INNER JOIN( SELECT 
                appln_id, appln_filing_year, appln_nr, docdb_family_id, docdb_family_size
            FROM `usptobias.patstat.tls201` ) USING(appln_id)
-- Inner join: applications without a tls204 priority row are not listed
INNER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)
WHERE family_id=17440 OR appln_id=17440 OR docdb_family_id=39587978
ORDER BY appln_auth, family_id

Unnamed: 0,appln_id,appln_auth,appln_nr,family_id,docdb_family_id,docdb_family_size,appln_filing_year,prior_appln_id
0,333354877,CN,200980113793,17440,39587978,16,2009,17440
1,336156243,EA,201071091,17440,39587978,16,2009,17440
2,267540628,EP,9726478,267540628,39587978,16,2009,17440
3,409697777,EP,13181010,409697777,39587978,16,2009,17440
4,425536209,ES,9726478,425536209,39587978,16,2009,17440
5,486085187,ES,13181010,486085187,39587978,16,2009,17440
6,331954650,IL,20814810,17440,39587978,16,2010,17440
7,334541922,JP,2011500269,334541922,39587978,16,2009,17440
8,443300671,JP,2015036329,443300671,39587978,16,2015,17440
9,331452440,KR,20107023071,17440,39587978,16,2009,17440


We can see that the algorithm is doing what we wanted to achieve: it excludes from the family those applications for which grouping by `prior_appln_id` and `appln_auth` yields a set with more than one member. For example, the applications with `appln_id=267540628` and `appln_id=409697777` both have the same application authority (which is "EP") and both point to the same priority filing (which is application ID `17440`).

## 4.2. Second Example: Looking at a sample family with at least one US application

We now look for applications filed from 2002 through 2011 (the query filters on `appln_filing_year<2012`) with IPC code "F01B", restricted to US applications.

In [22]:
%%bigquery df_
-- Same sampling query as for the first example, restricted to rows whose
-- application authority is 'US'; the result is stored in `df_`.
-- CTE `a`: applications whose IPC symbol contains "F01B", joined to their
-- tls201 attributes and restricted to filing years 2002..2011.
WITH a AS(
  SELECT * 
  FROM( SELECT *
        FROM( SELECT appln_id, ipc_class_symbol
              FROM `usptobias.patstat.tls209`
              WHERE REGEXP_CONTAINS(ipc_class_symbol, r'.*F01B.*') ) AS t1
        INNER JOIN (  SELECT appln_id, appln_filing_year, appln_kind, appln_nr
                      FROM `usptobias.patstat.tls201` ) AS t2 USING(appln_id) )
        WHERE appln_filing_year>=2002 AND appln_filing_year<2012 
)

-- The inner joins keep only applications that have a custom family row AND at
-- least one tls204 priority row.
SELECT *
FROM(   SELECT t1.*, a.* EXCEPT(appln_id), t2.prior_appln_id
        FROM `usptobias.custom_family.custom_familyDef` AS t1
        INNER JOIN a USING(appln_id)
        INNER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)  )
-- Family size, counted over the whole custom family table; singletons dropped
INNER JOIN( SELECT family_id, COUNT(appln_id) AS cnt_family
            FROM `usptobias.custom_family.custom_familyDef`
            GROUP BY family_id
            HAVING COUNT(appln_id)>1 ) USING(family_id)
WHERE appln_auth='US'
ORDER BY family_id
LIMIT 10000

In [23]:
df_.head(3)

Unnamed: 0,family_id,appln_id,appln_auth,ipc_class_symbol,appln_filing_year,appln_kind,appln_nr,prior_appln_id,cnt_family
0,17441,332856209,US,F01B 7/02,2009,A,73618309,17441,12
1,17441,332856209,US,F01B 9/06,2009,A,73618309,17441,12
2,17441,332856209,US,F01B 3/04,2009,A,73618309,17441,12


We choose family ID `17441` as a sample. We then search for all applications within this custom family ID or applications with DOCDB family ID `39563417`, which is the DOCDB family ID of the applications within this custom family ID.

In [30]:
%%bigquery
-- Inspect one sample: every application that is in custom family 17441, is
-- application 17441 itself, or belongs to DOCDB family 39563417.
SELECT 
    appln_id, appln_auth, appln_nr, family_id, docdb_family_id, docdb_family_size, appln_filing_year, 
    -- Applications without a tls204 row have a NULL prior; replace it with the text 'None'
    IFNULL(CAST(prior_appln_id AS STRING), 'None') prior_appln_id
FROM (  SELECT *
        FROM `usptobias.custom_family.custom_familyDef`  )
-- Full outer joins (instead of inner joins) keep applications that have no
-- priority row, e.g. the priority filing itself
FULL OUTER JOIN( SELECT 
                appln_id, appln_filing_year, appln_nr, docdb_family_id, docdb_family_size
            FROM `usptobias.patstat.tls201` ) USING(appln_id)
FULL OUTER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)
WHERE family_id=17441 OR appln_id=17441 OR docdb_family_id=39563417
ORDER BY appln_auth, family_id

Unnamed: 0,appln_id,appln_auth,appln_nr,family_id,docdb_family_id,docdb_family_size,appln_filing_year,prior_appln_id
0,332388962,CN,200980109539,17441,39563417,12,2009,17441.0
1,336156244,EA,201071092,17441,39563417,12,2009,17441.0
2,267541145,EP,9726995,17441,39563417,12,2009,17441.0
3,380390282,ES,9726995,17441,39563417,12,2009,17441.0
4,17441,FR,801437,17441,39563417,12,2008,
5,331954651,IL,20814910,17441,39563417,12,2010,17441.0
6,334541923,JP,2011500270,17441,39563417,12,2009,17441.0
7,332346431,KR,20107023073,17441,39563417,12,2009,17441.0
8,457938319,UA,201012300,17441,39563417,12,2009,17441.0
9,332856209,US,73618309,17441,39563417,12,2009,17441.0


We can see that all applications are from unique application authorities; thus all of them are correctly assigned, and the result agrees with the associated DOCDB family.

## 4.3. Third Example: Looking at a custom family with more than one priority filings

We now search for a sample of custom families with applications filed from 2002 through 2011 (the query filters on `appln_filing_year<2012`), an IPC code of 'F01B', with at least one "US" application, and with more than one priority filing.

In [31]:
%%bigquery df_
-- Same sampling query as for the second example, additionally keeping only
-- rows whose family ID differs from the listed priority application ID; the
-- result is stored in `df_`.
-- CTE `a`: applications whose IPC symbol contains "F01B", joined to their
-- tls201 attributes and restricted to filing years 2002..2011.
WITH a AS(
  SELECT * 
  FROM( SELECT *
        FROM( SELECT appln_id, ipc_class_symbol
              FROM `usptobias.patstat.tls209`
              WHERE REGEXP_CONTAINS(ipc_class_symbol, r'.*F01B.*') ) AS t1
        INNER JOIN (  SELECT appln_id, appln_filing_year, appln_kind, appln_nr
                      FROM `usptobias.patstat.tls201` ) AS t2 USING(appln_id) )
        WHERE appln_filing_year>=2002 AND appln_filing_year<2012 
)

-- The inner joins keep only applications that have a custom family row AND at
-- least one tls204 priority row.
SELECT *
FROM(   SELECT t1.*, a.* EXCEPT(appln_id), t2.prior_appln_id
        FROM `usptobias.custom_family.custom_familyDef` AS t1
        INNER JOIN a USING(appln_id)
        INNER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)  )
-- Family size, counted over the whole custom family table; singletons dropped
INNER JOIN( SELECT family_id, COUNT(appln_id) AS cnt_family
            FROM `usptobias.custom_family.custom_familyDef`
            GROUP BY family_id
            HAVING COUNT(appln_id)>1 ) USING(family_id)
WHERE appln_auth='US' AND family_id<>prior_appln_id
ORDER BY family_id
LIMIT 10000

In [32]:
df_.head(3)

Unnamed: 0,family_id,appln_id,appln_auth,ipc_class_symbol,appln_filing_year,appln_kind,appln_nr,prior_appln_id,cnt_family
0,72673,274357034,US,F01B 21/04,2008,A,45019108,55544732,4
1,72673,274357034,US,F01B 21/04,2008,A,45019108,55233300,4
2,85902,325208507,US,F01B 3/00,2008,A,66800008,901576706,5


We choose the first custom family ID as a sample. Thus, we search for applications with family ID `72673` or DOCDB family IDs `39831474`, `39978639`, or `40141977` (The DOCDB families of the applications in the custom family ID of `72673`)

In [38]:
%%bigquery
-- Inspect one sample: custom family 72673, its two priority applications
-- (55233300 and 55544732) and every member of the three listed DOCDB families.
SELECT 
    appln_id, appln_auth, appln_nr, family_id, docdb_family_id, docdb_family_size, appln_filing_year, 
    -- Applications without a tls204 row have a NULL prior; replace it with the text 'None'
    IFNULL(CAST(prior_appln_id AS STRING), 'None') prior_appln_id
FROM (  SELECT *
        FROM `usptobias.custom_family.custom_familyDef`  )
-- Full outer joins keep applications that have no priority row
FULL OUTER JOIN( SELECT 
                appln_id, appln_filing_year, appln_nr, docdb_family_id, docdb_family_size
            FROM `usptobias.patstat.tls201` ) USING(appln_id)
FULL OUTER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)
WHERE family_id=72673 OR appln_id=55233300 OR appln_id=55544732 OR docdb_family_id=39831474 OR docdb_family_id=39978639
OR docdb_family_id=40141977
ORDER BY appln_auth, family_id

Unnamed: 0,appln_id,appln_auth,appln_nr,family_id,docdb_family_id,docdb_family_size,appln_filing_year,prior_appln_id
0,274702168,CN,200880011231,72673,39978639,2,2008,55233300.0
1,274702168,CN,200880011231,72673,39978639,2,2008,55544732.0
2,54949232,EP,8737383,72673,39831474,3,2008,55233300.0
3,54949232,EP,8737383,72673,39831474,3,2008,55544732.0
4,55233300,JP,2007100080,55233300,39978639,2,2007,
5,55544732,JP,2007127128,55544732,40141977,1,2007,
6,274357034,US,45019108,72673,39831474,3,2008,55544732.0
7,274357034,US,45019108,72673,39831474,3,2008,55233300.0
8,72673,WO,2008000812,72673,39831474,3,2008,55233300.0
9,72673,WO,2008000812,72673,39831474,3,2008,55544732.0


We see that again the algorithm has achieved what it was meant to do. It has placed all applications with the same set of priority filings into one family (with custom family ID `72673`), while it excluded the two applications with IDs `55233300` and `55544732`, since they are both from the Japanese jurisdiction.

However, this result might be more controversial than the previous ones, since the custom family ID `72673` spans two different DOCDB families. Although this is what we expected the algorithm to do, we want to check whether the logic that we followed gives us a more meaningful result as a family definition.

Let's investigate the claims of these applications and see whether the members of this custom family ID are covering the same invention or not.

We thus looked at their publications at the Google Patents and compared their claims. The result of this comparison can be found from the following table:

| Claims/Jurisdictions | JP (\*) | JP (\*) | US | CN | EP | WO |
| --- | --- | --- | --- | --- | --- | --- |
| Publication Number | JP-4265675-B2 | JP-4321623-B2 | US-8291697-B2 | CN-101652551-B | EP-2145092-B1 | WO-2008122866-A2 |
| Application Number | 2007100080 | 2007127128 | 45019108 | 200880011231 | 08737383 | 2008000812 |
| Granted | No | Yes ** | Yes ** | Yes ** | Yes ** | Yes ** | No |
| Abstract | NA | NA | (Base) | Same | NA | Same | 
| 1st Claim | 1 | - | 1 | 1 | 1 | 1 | 
| 2nd Claim | 2 | - | 2 | 2 | 2 | 2 | 
| 3rd Claim | - | 1 | 3 | 3 | 3 | 3 | 
| 4th Claim | - | 1 | 4 | 4 | 4 | 4 |


* \* These are the primary priority applications
* \*\* These are the granted patent applications

We see that the four applications (with application numbers `45019108`, `200880011231`, `08737383`, and `2008000812`) that we assigned to the same family are indeed covering the same claims and thus the same invention. Whereas, if we have used the DOCDB family definition, we would not consider the application in China patent office in the same family as the other three applications in the U.S., EPO, and WO authorities.

## 4.4. Fourth Example: A Family with a priority filing chain of at least length 3!

As the last example, we look into a family with at least **3 levels** of priority filings and we will then compare this with the associated INPADOC and DOCDB families.  
The following query will look for the applications filed between the years 2004 and 2012 (the query filters on `appln_filing_year BETWEEN 2004 AND 2012`) that are part of a family with a priority-filing chain of at least three levels. 

In [39]:
%%bigquery df_
-- Find applications at the head of a priority chain of at least three links;
-- the result is stored in `df_`.
-- NOTE(review): `custom_alg.tls204_PTC` and `usptobias_dataset.4_family_custom_PCT`
-- are not created in the visible cells of this notebook; confirm their content.
-- CTE `t1`: one row per (application, prior) link with its family identifiers.
WITH t1 AS(
SELECT appln_id, prior_appln_id, family_id, a.appln_auth, inpadoc_family_id, docdb_family_id, appln_filing_year
FROM `usptobias.custom_alg.tls204_PTC`
LEFT JOIN `usptobias.usptobias_dataset.4_family_custom_PCT` USING(appln_id)
LEFT JOIN `usptobias.patstat.tls201` a USING(appln_id)
), t2 AS(
-- CTE `t2`: follow prior_appln_id -> appln_id three times (a -> a1 -> a2 -> a3);
-- prior_lvl_4 is the prior of the third-level application.
SELECT
  a.appln_id,
  a1.appln_id prior_lvl_1,
  a2.appln_id prior_lvl_2,
  a3.appln_id prior_lvl_3,
  a3.prior_appln_id prior_lvl_4,
  a.family_id,
  a.inpadoc_family_id,
  a.docdb_family_id
FROM t1 a
INNER JOIN t1 a1 ON a.prior_appln_id=a1.appln_id
INNER JOIN t1 a2 ON a1.prior_appln_id=a2.appln_id
INNER JOIN t1 a3 ON a2.prior_appln_id=a3.appln_id
WHERE a.appln_filing_year BETWEEN 2004 AND 2012
)

-- Keep applications that reach more than one distinct fourth-level prior
SELECT appln_id, family_id, inpadoc_family_id, docdb_family_id, COUNT(DISTINCT prior_lvl_4) AS nb_priors
FROM t2
GROUP BY appln_id, family_id, inpadoc_family_id, docdb_family_id
HAVING nb_priors>1
ORDER BY family_id, docdb_family_id, inpadoc_family_id
LIMIT 1000

In [41]:
df_.head(15)

Unnamed: 0,appln_id,family_id,inpadoc_family_id,docdb_family_id,nb_priors
0,159,159,90857,53441578,40
1,288,288,236,40091557,2
2,398,398,316,39790443,2
3,1118,1118,4909299,39530453,14
4,1573,1573,4884518,33514945,5
5,1607,1607,1151,35064622,8
6,1831,1831,1312,35929963,4
7,1939,1939,1392,32870338,3
8,2104,2104,1587073,37899332,10
9,2286,2286,1714,37963286,7


We now choose the first family ID with at least two applications with the above conditions, which is family ID `2553` with INPADOC family ID `1981` and DOCDB family IDs `38067988`.

In [46]:
%%bigquery
-- Inspect one sample: every application that is in custom family 2553, is
-- application 2553 itself, or belongs to DOCDB family 38067988 or INPADOC
-- family 1981.
SELECT 
    appln_id, appln_auth, appln_nr, family_id, docdb_family_id, inpadoc_family_id, appln_filing_year
FROM (  SELECT *
        FROM `usptobias.custom_family.custom_familyDef`  )
-- Full outer join keeps applications that are missing from either table
FULL OUTER JOIN( SELECT 
                appln_id, appln_filing_year, appln_nr, docdb_family_id, inpadoc_family_id
            FROM `usptobias.patstat.tls201` ) USING(appln_id)
-- The tls204 priority join is disabled here, so priors are not listed
#FULL OUTER JOIN `usptobias.patstat.tls204` AS t2 USING(appln_id)
WHERE family_id=2553 OR appln_id=2553 OR docdb_family_id=38067988 OR inpadoc_family_id=1981
ORDER BY appln_auth, family_id

Unnamed: 0,appln_id,appln_auth,appln_nr,family_id,docdb_family_id,inpadoc_family_id,appln_filing_year
0,472497,AP,200804518,2553,38067988,1981,2006
1,2759791,AU,2006318349,2553,38067988,1981,2006
2,5019083,CA,2631233,2553,38067988,1981,2006
3,273618140,CN,200680051879,273618140,38067988,1981,2006
4,479452046,CN,201610575891,479452046,38067988,1981,2006
5,57896195,EA,200801468,2553,38067988,1981,2006
6,2553,EP,6844611,2553,38067988,1981,2006
7,320826468,IL,19175908,2553,38067988,1981,2008
8,58061251,JP,2008542481,58061251,38067988,1981,2006
9,406064350,JP,2012267774,406064350,38067988,1981,2012


Since this is a big family with a complicated network structure, we won't look into its priority filings. We now just compare the custom family ID with the other two (INPADOC and DOCDB family IDs).  
This is a rather large family with 29 members, so we will not analyze all of them. We only consider the 12 applications that were filed at the USPTO. These applications were all filed by “Marinus Pharmaceuticals Inc”, and all of them are in the same INPADOC family. Looking at their DOCDB family IDs, we see that 9 of these applications are still placed in the same DOCDB family (docdb_family_id=‘38067988’). However, looking at their actual publications, we can see that their claims and even their titles differ from each other; thus they do not cover the same invention and should not be considered part of the same family. Our family definition takes the more conservative approach and assigns a different family ID to each of them.

# 5. Creating Twin Applications Table

As our analysis is based on investigating USPTO applications, we will create a table of twin applications, i.e., pairs of applications that share the same custom family ID, where the first application of each pair is always from the U.S. jurisdiction and the second is from any other jurisdiction.
The following query generates this table using the custom family ID computed in step 4.

In [47]:
# Build the twin-application table: every US application paired with every
# non-US application that shares its custom family ID. The query result is
# written to `custom_family.twin_appln`, replacing any previous content.
family_table='`usptobias.custom_family.family_customDef_2`'
appln_table='`usptobias.patstat.tls201`'

client = bigquery.Client()

# Destination table for the query result.
dataset_id = 'custom_family'
table_id = 'twin_appln'
# Client.dataset() is deprecated; build the reference explicitly from the
# client's default project, which is what Client.dataset() used as well.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

# Job configuration: bypass the query cache and overwrite the destination.
job_config = bigquery.QueryJobConfig()
job_config.use_query_cache = False
job_config.write_disposition = 'WRITE_TRUNCATE'
job_config.destination = table_ref

# {0} is the custom family table, {1} the application table.
# `family_appln` is the shared base (family member + filing year + grant flag);
# `us_pat` and `oth_pat` only differ in the jurisdiction filter, so both are
# derived from it instead of repeating the same join twice.
query="""
WITH family_appln AS(
    SELECT fam.appln_id, fam.appln_auth, appln.appln_filing_year, appln.granted, fam.family_id
    FROM {0} AS fam
    INNER JOIN {1} AS appln USING(appln_id)

), us_pat AS(
    SELECT appln_id, appln_auth, appln_filing_year, granted, family_id
    FROM family_appln
    WHERE appln_auth='US'

), oth_pat AS(
    SELECT appln_id, appln_auth, appln_filing_year, granted, family_id
    FROM family_appln
    WHERE appln_auth<>'US'
)

SELECT 
    a.appln_id AS appln_id_1,
    a.appln_auth AS appln_auth_1,
    a.appln_filing_year AS appln_filing_year_1,
    a.granted AS granted_1,
    b.appln_id AS appln_id_2,
    b.appln_auth AS appln_auth_2,
    b.appln_filing_year AS appln_filing_year_2,
    b.granted AS granted_2,
    b.family_id
FROM us_pat AS a
INNER JOIN oth_pat AS b USING(family_id)
""".format(family_table, appln_table)

# Submit the query and block until the destination table has been written.
query_job = client.query(query, location='US', job_config=job_config)

query_job.result()

<google.cloud.bigquery.table.RowIterator at 0x266d12932b0>