In [1]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m635.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tzdata-202

In [2]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl (40.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-16.1.0


In [3]:
import requests
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import time
 
# --- Request configuration -------------------------------------------------
# Endpoint queried by the fetch loop below.
url = 'https://developer.uspto.gov/ptab-api/proceedings'

# Ask the API for a JSON response.
headers = {
    'accept': 'application/json'
}

# --- Paging state -----------------------------------------------------------
total_records_to_fetch = 100   # stop once the start index reaches this value
records_per_request = 10       # amount the start index advances after each request
current_start_number = 0       # start index of the next request
data_list = []                 # accumulates every raw record returned by the API

# Query-string parameters. The page size is derived from records_per_request so
# the two values cannot drift apart (it was previously a separate literal '10').
# NOTE(review): 'recordTotalQuantity' is presumably the per-request page size --
# confirm against the PTAB API documentation.
params = {
    'recordTotalQuantity': str(records_per_request),
    'recordStartNumber': str(current_start_number)
}

# --- Output -----------------------------------------------------------------
parquet_file = 'ptab_proceedings_key_columns.parquet'

# Columns kept from each API record, in the order they are written.
key_columns = [
    'proceedingNumber',
    'proceedingFilingDate',
    'proceedingStatusCategory',
    'proceedingTypeCategory',
    'respondentPartyName',
    'appellantPartyName'
]

# Every key column is stored as a string; building the schema from key_columns
# keeps the column list and the schema in sync.
schema = pa.schema([(column_name, pa.string()) for column_name in key_columns])
 
# Fetch the proceedings page by page and stream the key columns into one
# Parquet file.
#
# Uses the names defined in the configuration above: url, params, headers,
# total_records_to_fetch, records_per_request, current_start_number,
# data_list, parquet_file, key_columns and schema.
#
# Behaviour notes:
#   * Each page is written as it arrives through a single ParquetWriter,
#     instead of re-reading and rewriting the whole file for every page.
#   * The output file is (re)created on the first successful page, so
#     re-running this cell no longer appends duplicate rows to an old file.
#     If the very first request fails, an existing file is left untouched.
#   * The file only becomes readable once the writer is closed, which the
#     `finally` clause does even when a request raises.
request_timeout_seconds = 60          # without a timeout, requests.get can block forever
pause_between_requests_seconds = 30   # pause between consecutive requests

writer = None         # created lazily on the first successful page
records_written = 0   # number of rows actually written to parquet_file

try:
    while current_start_number < total_records_to_fetch:
        # Never ask for more than what is still missing, and keep the
        # requested page size tied to records_per_request.
        page_size = min(records_per_request, total_records_to_fetch - current_start_number)
        params['recordTotalQuantity'] = str(page_size)
        params['recordStartNumber'] = str(current_start_number)

        response = requests.get(
            url,
            params=params,
            headers=headers,
            timeout=request_timeout_seconds,
        )

        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}")
            print(response.text)
            break

        data = response.json()
        # Tolerate a payload whose 'results' key is missing or null.
        results = data.get('results') or []
        print(f"Fetched {len(results)} records starting from index {current_start_number}:")
        if not results:
            # Nothing came back for this page, so stop instead of requesting
            # (and sleeping before) further pages.
            print("No records returned; stopping early.")
            break
        data_list.extend(results)

        # Convert data to DataFrame
        df = pd.DataFrame(results)
        # Print the column names to identify the correct key columns
        print("Column names in the DataFrame:", df.columns.tolist())
        # Ensure all key columns are present
        for col in key_columns:
            if col not in df.columns:
                df[col] = None
        # Select the key columns in the correct order
        df_key_columns = df[key_columns]

        # The per-page DataFrame index carries no information, so it is not stored.
        table = pa.Table.from_pandas(df_key_columns, schema=schema, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(parquet_file, schema)
        writer.write_table(table)
        records_written += len(results)

        print(f"Appended records starting from index {current_start_number} to {parquet_file}")

        current_start_number += page_size

        # Wait before fetching the next set of records, but not after the last one.
        if current_start_number < total_records_to_fetch:
            time.sleep(pause_between_requests_seconds)

except requests.exceptions.RequestException as e:
    print(f"Error with API request: {e}")
finally:
    # Closing the writer finalises the Parquet file.
    if writer is not None:
        writer.close()

# Only claim success when something was actually written.
if records_written:
    print(f"Data saved to {parquet_file} ({records_written} records)")
else:
    print(f"No records were written to {parquet_file}")

Fetched 10 records starting from index 0:
Column names in the DataFrame: ['proceedingFilingDate', 'proceedingStatusCategory', 'proceedingNumber', 'proceedingLastModifiedDate', 'proceedingTypeCategory', 'subproceedingTypeCategory', 'respondentTechnologyCenterNumber', 'respondentPartyName', 'respondentGroupArtUnitNumber', 'respondentApplicationNumberText', 'decisionDate', 'appellantTechnologyCenterNumber', 'appellantPatentOwnerName', 'appellantPartyName', 'appellantGroupArtUnitNumber', 'appellantInventorName', 'appellantCounselName', 'appellantApplicationNumberText', 'additionalRespondentPartyDataBag', 'respondentPatentNumber', 'appellantGrantDate', 'appellantPatentNumber', 'respondentPatentOwnerName', 'respondentInventorName', 'respondentGrantDate', 'declarationDate', 'styleNameText', 'petitionerTechnologyCenterNumber', 'petitionerGroupArtUnitNumber', 'secondRespondentPartyName', 'secondRespondentApplNumberText', 'secondRespondentPatentNumber', 'secondRespondentGrantDate', 'secondRespon