<a href="https://colab.research.google.com/github/MinKimIP/IPA-public/blob/master/uspto-tm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Obtain USPTO trademark data

In [0]:
%%capture
!pip install pyarrow
!pip install pandas --upgrade --force
!pip install xmltodict
!pip install tqdm

# Restart the kernel (or Colab runtime) after installation so that the newly installed packages are picked up.

In [0]:
import io
import os
import sys
import requests
from zipfile import ZipFile
import glob
import json
import xmltodict
import numpy as np
import pandas as pd
print(pd.__version__) # need version 1.0.0 or newer
from pandas import json_normalize
from tqdm import tqdm

1.0.1


In [0]:
# XML-read dataframe handlers

def normalise(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten every column that ``json_normalize`` can expand.

    Each column of the incoming frame is tried in turn: when its values can be
    normalised, the resulting columns (prefixed with the original column name)
    are joined onto the frame and the original column is dropped. Columns that
    cannot be normalised are left untouched. Column names are finally passed
    through ``clean_column_names``.

    Args:
        df: Frame whose columns may hold nested records.

    Returns:
        The flattened frame with a fresh 0..n-1 index and cleaned column names.
    """
    for column in df.columns:
        # The join below aligns on the index, so it must be a clean 0..n-1 range.
        df = df.reset_index(drop=True)
        try:
            flattened = json_normalize(df[column]).add_prefix(column)
            df = df.join(flattened).drop(columns=[column])
        except Exception:
            # Deliberately best-effort: a column that cannot be flattened is
            # kept as is. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop a long-running session.
            continue

    return df.pipe(clean_column_names)


def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Explode every column of *df*, one column at a time.

    List-like cells are expanded into one row per element and the index is
    reset after each column. A column that cannot be exploded is skipped.

    Args:
        df: Frame whose cells may contain list-like values.

    Returns:
        The exploded frame.
    """
    for column in df.columns:
        try:
            df = df.explode(column)
            df = df.reset_index(drop=True)
        except Exception:
            # Deliberately best-effort: leave the frame unchanged for this
            # column. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop a long-running session.
            continue
    return df


def select_or_create_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return a copy of *df* restricted to *columns*, in that order.

    Requested columns that *df* does not have are created and filled with
    NaN; the input frame itself is not modified.
    """
    result = df.copy()
    existing = df.columns
    for name in columns:
        # Existing columns are carried over; missing ones become all-NaN.
        result[name] = df[name] if name in existing else np.nan
    return result[columns]


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Strip '.', ':', '#' and '@' from column names and turn '-' into '_'.

    The column index of *df* is replaced in place and the same frame is returned.
    """
    # One translation pass per name: four characters removed, '-' mapped to '_'.
    table = str.maketrans({'.': None, ':': None, '#': None, '@': None, '-': '_'})
    df.columns = [name.translate(table) for name in df.columns]
    return df


def drop_problematic_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Drop the named columns from *df*, ignoring names that are absent.

    Args:
        df: Frame to drop columns from; it is not modified.
        columns: Column names to remove. A single string is treated as one
            column name rather than being iterated character by character.

    Returns:
        A frame without the requested columns, or *df* itself when none of
        them is present.
    """
    if isinstance(columns, str):
        columns = [columns]

    # De-duplicate while keeping order, then keep only names that exist.
    present = [name for name in dict.fromkeys(columns) if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

In [0]:
# Build the URLs of the 65 historical archive parts (numbered 01 to 65).

link_base = 'https://bulkdata.uspto.gov/data/trademark/dailyxml/applications/'
zip_name_base = 'apc18840407-20191231'

zip_files = [
    f"{link_base}{zip_name_base}-{part:02d}.zip"
    for part in range(1, 66)
]

In [0]:
# Processing every archive can take a long time, so choose the batch to handle
# in this session here, e.g. by slicing: zip_files[41:45] selects parts 42-45.

zip_files_batch = zip_files

['https://bulkdata.uspto.gov/data/trademark/dailyxml/applications/apc18840407-20191231-42.zip', 'https://bulkdata.uspto.gov/data/trademark/dailyxml/applications/apc18840407-20191231-43.zip', 'https://bulkdata.uspto.gov/data/trademark/dailyxml/applications/apc18840407-20191231-44.zip', 'https://bulkdata.uspto.gov/data/trademark/dailyxml/applications/apc18840407-20191231-45.zip']


In [0]:
def is_int(int_string: str) -> bool:
    """Return True when ``int(int_string)`` succeeds, False otherwise.

    Only the errors ``int()`` raises for unconvertible input are treated as
    "not an integer" (ValueError for bad text or NaN, TypeError for
    unsupported types such as None, OverflowError for infinities).
    """
    try:
        int(int_string)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def is_like_yyyymmdd(date_string: str) -> bool:
    """Return True when *date_string* is a plausible ``YYYYMMDD`` date.

    The value must be a string of exactly eight ASCII digits, its year must
    lie strictly between 1840 and 2050, and the month/day must form a real
    calendar date (so that parsing with the ``%Y%m%d`` format cannot fail).
    Anything else, including None, NaN and other non-string values, yields
    False.

    Args:
        date_string: Candidate date text.

    Returns:
        True for a valid date in range, False otherwise.
    """
    # Function-scope import keeps this helper self-contained.
    from datetime import date

    if not isinstance(date_string, str):
        return False
    if len(date_string) != 8:
        return False
    # Explicit digit check: int() alone also accepts forms such as '2019_231'.
    if not all(ch in '0123456789' for ch in date_string):
        return False

    yyyy = int(date_string[:4])
    mm = int(date_string[4:6])
    dd = int(date_string[6:])

    if not 1840 < yyyy < 2050:
        return False

    try:
        # Rejects month 00/13+, day 00/32+ and impossible dates like 0231.
        date(yyyy, mm, dd)
    except ValueError:
        return False
    return True


def column_exists(df: pd.DataFrame, column: str) -> bool:
    """Return True when *df* has a column labelled *column*.

    Uses a membership test on the column index, so the column's data is never
    read and no unrelated exception is silently swallowed.
    """
    return column in df.columns


# Columns written to each ``application-*.parquet`` file, in output order.
application_columns = ['action_key',
                       'serial_number',
                       'registration_number',
                       'transaction_date',
                       'filing_date',
                       'registration_date',
                       'abandonment_date',
                       'status_code',
                       'status_date',
                       'mark_identification',
                       'statementtext',
                       'mark_drawing_code',
                       'trademark_in',
                       'collective_trademark_in',
                       'service_mark_in',
                       'collective_service_mark_in',
                       'collective_membership_mark_in',
                       'certification_mark_in',
                       'cancellation_pending_in',
                       'concurrent_use_in',
                       'foreign_priority_in',
                       'change_registration_in',
                       'intent_to_use_in',
                       'intent_to_use_current_in',
                       'filed_as_use_application_in',
                       'international_registration_number',
                       'international_registration_date']

# Every application column except the two identifiers. Rows in which all of
# these are missing are dropped from the exploded part of the application
# table. Derived from ``application_columns`` so the two lists cannot drift
# apart (a hand-written copy previously fused two names via a missing comma).
subset = [name for name in application_columns
          if name not in ('action_key', 'serial_number')]

# Columns written to each ``owner-*.parquet`` file, in output order.
owner_columns = ['action_key',
                 'serial_number',
                 'ownerparty_name',
                 'ownercity',
                 'ownerstate',
                 'ownerpostcode',
                 'ownernationalitystate',
                 'ownernationalitycountry',
                 'ownerentity_statement',
                 'ownercountry']

# Ordered (old, new) substring rewrites applied to the flattened column names.
# Order matters: longer prefixes must be removed before their shorter forms.
_column_name_rewrites = (
    ('case_file_header', ''),
    ('case_filecase_file_event_statements', ''),
    ('case_filecase_file_statements', ''),
    ('case_fileclassifications', ''),
    ('case_filecase_file_owners', ''),
    ('case_file_', ''),
    ('case_file', ''),
    ('madrid_international_filing_requestsmadrid_international_filing_recordmadrid_history_events', ''),
    ('madrid_international_filing_requestsmadrid_international_filing_record', ''),
    ('foreign_applications', ''),
    ('prior_registration_applications', ''),
    ('event_statement', 'event_statement_'),
    ('classification', 'classification_'),
    ('correspondent', 'correspondent_'),
)


def _shorten_column_name(name: str) -> str:
    """Apply the ordered substring rewrites to one flattened column name."""
    for old, new in _column_name_rewrites:
        name = name.replace(old, new)
    return name


# Download each archive of the batch, parse its XML and write three parquet
# files per archive: application, classification and owner.
for zip_file in zip_files_batch:
    zip_name = os.path.basename(zip_file).replace('.zip', '')
    print(zip_name)
    response = requests.get(zip_file)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile.
    response.raise_for_status()
    with ZipFile(io.BytesIO(response.content)) as zf:
        xml_files = [x for x in zf.namelist() if x.endswith('.xml')]
        # NOTE(review): output file names depend only on ``zip_name``, so if an
        # archive holds several XML files each one overwrites the previous
        # parquet output. Confirm that every archive contains a single XML.
        for xml_file in xml_files:
            with zf.open(xml_file) as xf:
                xml = xf.read()
                xmldict = xmltodict.parse(xml)
                root = xmldict['trademark-applications-daily']['application-information']['file-segments']['action-keys']
                # Intermediate structures are deleted as soon as possible to
                # keep peak memory down.
                del xmldict
                df = pd.DataFrame.from_dict(root, orient='index').T
                del root
                df = df.pipe(explode).pipe(normalise).pipe(clean_column_names)
                df.columns = [_shorten_column_name(x) for x in df.columns]

                # --- application table -----------------------------------
                # A column named '' is one whose whole name was removed by the
                # rewrites above (presumably still-nested case-file records --
                # TODO confirm). Rows where it is filled are exploded and
                # flattened separately, then merged with the remaining rows.
                if column_exists(df, ''):
                    application_part_a = (df.copy()
                                          .loc[df[''].isna(), :]
                                          .pipe(select_or_create_columns, application_columns)
                                          .drop_duplicates())

                    application_part_b = (df.copy()
                                          .loc[df[''].notna(), ['action_key', 'serial_number', '']]
                                          .pipe(explode).pipe(normalise)
                                          .pipe(select_or_create_columns, application_columns)
                                          .dropna(subset=subset, how='all')
                                          .drop_duplicates())

                    application = pd.concat([application_part_a, application_part_b]).drop_duplicates()
                    del application_part_a
                    del application_part_b
                else:
                    application = (df.copy()
                                   .pipe(select_or_create_columns, application_columns)
                                   .drop_duplicates())

                for column in application.columns:
                    if 'date' in column:
                        # Blank out values that fail is_like_yyyymmdd so the
                        # strict-format parse below only sees accepted values.
                        is_valid_date = application[column].apply(is_like_yyyymmdd)
                        application.loc[~is_valid_date, column] = np.nan
                        application[column] = pd.to_datetime(application[column], format='%Y%m%d')
                    elif column.endswith('_in'):
                        # Indicator columns: missing -> False, 'F' -> False, 'T' -> True.
                        application[column] = application[column].fillna(False).replace('F', False).replace('T', True)

                application.to_parquet(f'application-{zip_name}.parquet', index=False)
                del application

                # --- classification table --------------------------------
                classification = (df.copy()
                                  .pipe(select_or_create_columns, ['action_key',
                                                                   'serial_number',
                                                                   'classification_international_code'])
                                  .pipe(explode)
                                  .drop_duplicates())

                classification.to_parquet(f'classification-{zip_name}.parquet', index=False)
                del classification

                # --- owner table -----------------------------------------
                # Rows with a filled 'owner' column are exploded and flattened;
                # rows without one are passed through unchanged.
                if column_exists(df, 'owner'):
                    owner_part_a = (df.copy()
                                    .loc[df['owner'].notna(), ['action_key', 'serial_number', 'owner']]
                                    .pipe(explode).pipe(normalise)
                                    .pipe(select_or_create_columns, owner_columns))

                    owner_part_b = (df.copy()
                                    .loc[df['owner'].isna(), :]
                                    .pipe(select_or_create_columns, owner_columns))

                    owner = pd.concat([owner_part_a, owner_part_b]).drop_duplicates()
                    del owner_part_a
                    del owner_part_b
                else:
                    owner = (df.copy()
                             .pipe(select_or_create_columns, owner_columns))

                owner.to_parquet(f'owner-{zip_name}.parquet', index=False)
                del owner

                del df

apc18840407-20191231-42
apc18840407-20191231-43
apc18840407-20191231-44
apc18840407-20191231-45


KeyboardInterrupt: ignored