<a href="https://colab.research.google.com/github/MinKimIP/IPA-public/blob/master/data_request/2019-12-19%20Geelong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Request

19 December 2019

Hi IP Australia,

Just wondering what data is available at the local government area level regarding patent data?

I have seen some data for Greater Geelong for 2011 but was hoping for something more recent.

Thanks for your help.

Kind regards

---

Subsequent engagement: also include designs and trade marks.

This data request can be answered using [IPGOD 2019](https://data.gov.au/data/dataset/intellectual-property-government-open-data-2019).

In [0]:
import pandas as pd

# data sources

def ip_data(ip_type, table):
    """Download one IPGOD 2019 CSV table and return it as a DataFrame.

    ip_type picks the IP right ('patent', 'trademark' or 'design') and
    table picks the file for that right ('process', 'applicant' or
    'classification'); any other value raises KeyError from the lookup.
    Columns whose name contains "date" are converted by parse_dates
    before the frame is returned.
    """
    dataset_root = 'https://data.gov.au/data/dataset/a4210de2-9cbb-4d43-848d-46138fefd271/resource/'
    # Resource path (appended to dataset_root) of every table, per IP right.
    resource_paths = {
        'patent': {
            'process': '8fa6db74-a461-47f1-acc6-2e0cf7f06bd5/download/ipgod107.csv',
            'applicant': '846990df-db42-4ad7-bbd6-567fd37a2797/download/ipgod102.csv',
            'classification': '5aeec421-dddc-4c22-a66a-bfc5ad22947f/download/ipgod104.csv',
        },
        'trademark': {
            'process': '4dec358e-14ff-45ef-8b3e-b27274347e23/download/ipgod203.csv',
            'applicant': 'aae1c14d-f8c0-4540-b5d3-1ed21500271e/download/ipgod202.csv',
            'classification': 'fb505762-ab2a-4f56-999d-9bedd1da2ad5/download/ipgod204.csv',
        },
        'design': {
            'process': '9003a068-82fd-410d-a193-d54b8bc1f171/download/ipgod303.csv',
            'applicant': '4b802e80-c667-4b84-8f50-72c2624c59c1/download/ipgod302.csv',
            'classification': 'b01f7e00-a718-4e2d-9ffb-14938fd7dba9/download/ipgod304.csv',
        },
    }

    csv_url = dataset_root + resource_paths[ip_type][table]
    # low_memory=False: parse the file in a single pass rather than in
    # chunks, so each column gets one consistently inferred dtype.
    raw_table = pd.read_csv(csv_url, low_memory=False)
    return parse_dates(raw_table)


# Name of the application-identifier column for each IP right.  It is both
# selected from the applicant table and used as the merge key in the pipeline.
main_key = dict(patent='australian_appl_no',
                trademark='tm_number',
                design='application_id')


# pipe components

def parse_dates(df):
    """Convert every column whose label contains "date" to datetime.

    The frame is modified in place and also returned, so the function can
    be called as a statement or used inside a ``.pipe`` chain.

    Labels that do not support the substring test (for example the integer
    labels pandas assigns to a header-less frame) are skipped rather than
    aborting the whole conversion with ``TypeError``.
    """
    for column in df.columns:
        try:
            looks_like_date = "date" in column
        except TypeError:
            # Non-iterable label such as an int: it cannot name a date column.
            looks_like_date = False
        if looks_like_date:
            df[column] = pd.to_datetime(df[column])
    return df


def relevant_applicant_data(ip_type):
    """Return the applicant rows whose LGA name mentions Geelong.

    The applicant table for ip_type is loaded through ip_data, filtered
    with a case-insensitive match of 'geelong' against 'lga_name', and cut
    down to the application key plus 'ipa_id', 'name', 'abn' and
    'lga_name'.
    """
    applicants = ip_data(ip_type, 'applicant')
    # Missing LGA names become '' so they simply fail the match below.
    lga_lowercase = applicants['lga_name'].fillna('').str.lower()
    in_geelong = lga_lowercase.str.contains('geelong')
    kept_columns = [main_key[ip_type], 'ipa_id', 'name', 'abn', 'lga_name']
    return applicants.loc[in_geelong, kept_columns]


def relevant_process_data(ip_type):
    """Return the key, type and date columns of the process table.

    The process table for ip_type ('patent', 'trademark' or 'design') is
    loaded through ip_data and reduced to the columns listed below.  The
    design selection contains no type column, so an empty 'type' column
    is appended for it; rename_columns later maps that column to
    'application_type', giving all three IP rights the same layout.
    """
    relevant_columns = {'patent': ['australian_appl_no',
                                   'patent_type',
                                   'application_date',
                                   'sealing_date'],
                        'trademark': ['tm_number',
                                      'type_of_mark_code',
                                      'lodgement_date',
                                      'registered_date'],
                        'design': ['application_id',
                                   'lodgement_date',
                                   'registration_date']}
    df = ip_data(ip_type, 'process')[relevant_columns[ip_type]]
    if ip_type == 'design':
        # assign() builds a new frame; item assignment on a column subset of
        # another frame is the pattern behind SettingWithCopyWarning.
        df = df.assign(type='')
    return df


def relevant_classification_data(ip_type):
    """Return the application key and classification code per application.

    The classification table for ip_type is loaded through ip_data.
    Patent rows are limited to those whose 'ipc_mark_type_code' equals
    "First (ie Primary)", design rows to those where
    'primary_class_code_ind' is true, and trade mark rows are not filtered.
    """
    classifications = ip_data(ip_type, 'classification')

    # Query expression restricting each table to its primary classification;
    # an IP right without an entry keeps every row.
    primary_filters = {'patent': 'ipc_mark_type_code=="First (ie Primary)"',
                       'design': 'primary_class_code_ind'}
    kept_columns = {'patent': ['australian_appl_no', 'ipc_mark_value'],
                    'trademark': ['tm_number', 'class_code'],
                    'design': ['application_id', 'class_code']}

    row_filter = primary_filters.get(ip_type)
    if row_filter is not None:
        classifications = classifications.query(row_filter)
    return classifications[kept_columns[ip_type]]


def rename_columns(df, ip_type):
    """Return a copy of df with its columns renamed to the shared names.

    Each IP right uses its own source column names; this maps them onto
    'application_number', 'application_type', 'classification',
    'application_date', 'granted_date' and 'applicant_name'.  Columns not
    listed for ip_type keep their name.  An unknown ip_type raises
    KeyError.
    """
    # Source-specific names; the shared 'name' mapping is added below.
    per_type_names = {
        'patent': {
            'australian_appl_no': 'application_number',
            'patent_type': 'application_type',
            'ipc_mark_value': 'classification',
            'sealing_date': 'granted_date',
        },
        'trademark': {
            'tm_number': 'application_number',
            'type_of_mark_code': 'application_type',
            'class_code': 'classification',
            'lodgement_date': 'application_date',
            'registered_date': 'granted_date',
        },
        'design': {
            'application_id': 'application_number',
            'type': 'application_type',
            'class_code': 'classification',
            'lodgement_date': 'application_date',
            'registration_date': 'granted_date',
        },
    }
    mapping = dict(per_type_names[ip_type], name='applicant_name')
    return df.rename(columns=mapping)


def create_type_column(df, ip_type):
    """Tag every row of df with ip_type in an 'ip_type' column.

    The column is written onto the frame that was passed in (an existing
    'ip_type' column is overwritten) and that same frame is returned.
    """
    label_column = 'ip_type'
    df[label_column] = ip_type
    return df


def reorder_columns(df):
    """Return df restricted to the output columns, in their final order.

    Application details come first, followed by the applicant details.
    Columns not listed are dropped; a missing listed column raises
    KeyError.
    """
    application_fields = ['ip_type',
                          'application_type',
                          'application_number',
                          'application_date',
                          'granted_date',
                          'classification']
    applicant_fields = ['lga_name', 'ipa_id', 'abn', 'applicant_name']
    return df[application_fields + applicant_fields]


# pipeline

def relevant_data(ip_type):
    """Build the Geelong output table for one IP right.

    Geelong applicants are left-joined to their process and classification
    records on the application key, exact duplicate rows are removed, and
    the result is renamed, tagged with ip_type and put in the shared
    column order.
    """
    applicants = relevant_applicant_data(ip_type)
    join_key = main_key[ip_type]

    # Left joins keep every Geelong applicant row even when the process or
    # classification table has no matching record.
    with_process = applicants.merge(relevant_process_data(ip_type),
                                    on=join_key, how='left')
    with_classes = with_process.merge(relevant_classification_data(ip_type),
                                      on=join_key, how='left')

    unique_rows = with_classes.drop_duplicates()
    renamed = rename_columns(unique_rows, ip_type)
    labelled = create_type_column(renamed, ip_type)
    return reorder_columns(labelled)

# Keyword arguments passed to DataFrame.to_csv when exporting the result:
# no index column, UTF-8 text, ISO-style dates and floats without decimals.
csv_preference = dict(index=False,
                      encoding='utf-8',
                      date_format='%Y-%m-%d',
                      float_format='%.0f')

In [0]:
# Stack the Geelong records for the three IP rights, in this order, into one table.
df = pd.concat([relevant_data(ip_type)
                for ip_type in ('patent', 'trademark', 'design')])

In [0]:
# Export the combined table using the shared CSV settings.
df.to_csv(path_or_buf='geelong_ip_data.csv', **csv_preference)