In [1]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
from sqlalchemy import MetaData, Table
import sqlalchemy as db

from google.cloud import storage
from google.cloud import bigquery

# 1. Loading data

The data is fetched from the [Fung Institute Git Repository](https://github.com/funginstitute/downloads).  For a full description of the methods and algorithms used for disambiguation, please refer to the following papers:
1. *Balsmeier, Benjamin, et al. **"Automated disambiguation of us patent grants and applications."** Unpublished working paper, Fung Institute for Engineering Leadership (2015).* 
    - [LINK (1) to the Paper](http://people.eecs.berkeley.edu/~gtfierro/papers/AutomatedDisambiguation-of-US-Patent-Grants-and-Applications.pdf) or [LINK (2) to the Paper](https://funginstitute.berkeley.edu/wp-content/uploads/2015/08/AutomatedDisambiguation-of-US-Patent-Grants-and-Applications.pdf), [LINK to the Dataset](https://github.com/funginstitute/downloads)
2. *Balsmeier, Benjamin, et al. **"Machine learning and natural language processing on the patent corpus: Data, tools, and new measures."** Journal of Economics & Management Strategy 27.3 (2018): 535-553.* 
    - [LINK to the Paper](https://funginstitute.berkeley.edu/wp-content/uploads/2016/11/Machine_learning_and_natural_language_processing_on_the_patent_corpus.pdf), [LINK to the Bigquery Dataset](https://console.cloud.google.com/marketplace/details/google_patents_public_datasets/ucb-fung-patent), [Link to the Harvard Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/2VBUNY) 

## 1.1. Loading Lawyer Info

In [37]:
# Open the SQLite database that holds the lawyer records.
engine = create_engine("sqlite:///data-d/lawyer.sqlite3")
# `Engine.table_names()` is deprecated and absent from SQLAlchemy 2.0; the
# inspector returns the same list of table names on old and new releases.
print(db.inspect(engine).get_table_names())

['lawyer']


In [38]:
# Read the whole `lawyer` table into a DataFrame.
# The `with` block closes the connection even when the query fails.
with engine.connect() as connection:
    # `db.text` wraps the statement; newer SQLAlchemy releases no longer
    # accept a bare string in `execute`.
    result_proxy = connection.execute(db.text('SELECT * FROM lawyer'))
    # Column names come from the result itself rather than from the first row,
    # so an empty table no longer raises IndexError.
    column_names = list(result_proxy.keys())
    result_set = result_proxy.fetchall()

df_lawyer = pd.DataFrame(result_set, columns=column_names)

print('Number of records: {:,}'.format(df_lawyer.shape[0]))
df_lawyer.head(4)

Number of records: 3,303,617


Unnamed: 0,Patent,Firstname,Lastname,LawCountry,OrgName,LawSeq
0,5705896,,,unknown,CUSHMAN DARBY & CUSHMAN IP GROUP OF PILLSBURY ...,0
1,5705444,ALLEN;,D.,unknown,,2
2,5705461,AYLOR;,B.,unknown,,0
3,5705427,JORGENSON;,K.,unknown,,1


In [43]:
# Count the distinct (first name, last name, organization) combinations.
unique_lawyers = df_lawyer.drop_duplicates(subset=['Firstname', 'Lastname', 'OrgName'])
print('Number of unique Lawyers: {:,}'.format(len(unique_lawyers)))

Number of unique Lawyers: 117,367


In [73]:
# Remove the country column. `errors='ignore'` keeps the cell safe to re-run
# after the column has already been dropped (a second run used to raise KeyError).
df_lawyer = df_lawyer.drop(columns=['LawCountry'], errors='ignore')
df_lawyer.head(2)

Unnamed: 0,Patent,Firstname,Lastname,OrgName,LawSeq
0,5705896,,,CUSHMAN DARBY & CUSHMAN IP GROUP OF PILLSBURY ...,0
1,5705444,ALLEN;,D.,,2


## 1.2. Loading Patent Info

In [39]:
# Open the SQLite database that holds the patent records.
engine = create_engine("sqlite:///data-d/patent.sqlite3")
# `Engine.table_names()` is deprecated and absent from SQLAlchemy 2.0; the
# inspector returns the same list of table names on old and new releases.
print(db.inspect(engine).get_table_names())

['patent']


In [41]:
# Read the whole `patent` table into a DataFrame.
# The `with` block closes the connection even when the query fails.
with engine.connect() as connection:
    # `db.text` wraps the statement; newer SQLAlchemy releases no longer
    # accept a bare string in `execute`.
    result_proxy = connection.execute(db.text("SELECT * FROM patent"))
    # Column names come from the result itself rather than from the first row,
    # so an empty table no longer raises IndexError.
    column_names = list(result_proxy.keys())
    result_set = result_proxy.fetchall()

df_patent = pd.DataFrame(result_set, columns=column_names)

print('Number of records: {:,}'.format(df_patent.shape[0]))
df_patent.head(4)

Number of records: 4,823,407


Unnamed: 0,Patent,Kind,Claims,AppType,AppNum,GDate,GYear,AppDate,AppYear,pattype
0,3858241,,6.0,,,1975-01-07,1975,1974-03-26,1974.0,
1,3858242,,5.0,,,1975-01-07,1975,1973-04-16,1973.0,
2,3858243,,12.0,,,1975-01-07,1975,1973-07-11,1973.0,
3,3858244,,1.0,,,1975-01-07,1975,1973-12-17,1973.0,


# 2. Assigning unique ID

In [64]:
# Keep one row per distinct (Firstname, Lastname, OrgName) combination.
name_columns = ['Firstname', 'Lastname', 'OrgName']
df_lawyer_uniq = df_lawyer.loc[:, name_columns].drop_duplicates()
df_lawyer_uniq = df_lawyer_uniq.reset_index(drop=True)
print('Number of unique lawyers: {:,}'.format(len(df_lawyer_uniq)))
df_lawyer_uniq.head(3)

Number of unique lawyers: 117,367


Unnamed: 0,Firstname,Lastname,OrgName
0,,,CUSHMAN DARBY & CUSHMAN IP GROUP OF PILLSBURY ...
1,ALLEN;,D.,
2,AYLOR;,B.,


In [65]:
# Adding Unique ID: the positional index becomes `lawyer_id`, shifted by 100000
column_names = {
    'index': 'lawyer_id',
    'Firstname': 'name_first',
    'Lastname': 'name_last',
    'OrgName': 'organization',
}
df_lawyer_uniq = df_lawyer_uniq.reset_index(drop=False)
df_lawyer_uniq = df_lawyer_uniq.rename(columns=column_names)
df_lawyer_uniq['lawyer_id'] = df_lawyer_uniq['lawyer_id'] + 100000
df_lawyer_uniq.head()

Unnamed: 0,lawyer_id,name_first,name_last,organization
0,100000,,,CUSHMAN DARBY & CUSHMAN IP GROUP OF PILLSBURY ...
1,100001,ALLEN;,D.,
2,100002,AYLOR;,B.,
3,100003,JORGENSON;,K.,
4,100004,,,"SUGHRUE, MION, ZINN, MACPEAK, AND SEAS"


# 3. Merging the "lawyer_id" to the original "df_lawyer" table

In [89]:
# Renaming columns
column_renames = {
    'Patent': 'patent_nr',
    'Firstname': 'name_first',
    'Lastname': 'name_last',
    'OrgName': 'organization',
    'LawSeq': 'lawyer_seq',
}
df_lawyer = df_lawyer.rename(columns=column_renames)
print('{:,}'.format(len(df_lawyer)))
df_lawyer.head(2)

3,303,617


Unnamed: 0,patent_nr,name_first,name_last,organization,lawyer_seq
0,5705896,,,CUSHMAN DARBY & CUSHMAN IP GROUP OF PILLSBURY ...,0
1,5705444,ALLEN;,D.,,2


In [94]:
# Look up the generated `lawyer_id` for every patent/lawyer record by matching
# on the three name columns, then discard those name columns.
key_columns = ['name_first', 'name_last', 'organization']
df_lawyer_f = df_lawyer.merge(df_lawyer_uniq, how='left', on=key_columns)
df_lawyer_f = df_lawyer_f.drop(columns=key_columns)

df_lawyer_f = df_lawyer_f.loc[:, ['patent_nr', 'lawyer_id', 'lawyer_seq']]
print('{:,}'.format(len(df_lawyer_f)))
df_lawyer_f.head(2)

3,303,617


Unnamed: 0,patent_nr,lawyer_id,lawyer_seq
0,5705896,100000,0
1,5705444,100001,2


# 3. Saving the results to Google Storage Bucket

In [98]:
# Writing tables to csv files
csv_targets = [
    (df_lawyer_uniq, './data-d/lawyer_id_fung.csv'),
    (df_lawyer_f, './data-d/lawyer_patent_fung.csv'),
]
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, encoding='utf-8', index=False)

In [99]:
storage_client = storage.Client()

bucket_name = 'uspto-data'

# Uploading "lawyer_patent_fung.csv" first, then "lawyer_id_fung.csv", to the Storage bucket
uploads = [
    ('lawyer_patent_fung.csv', './data-d/lawyer_patent_fung.csv'),
    ('lawyer_id_fung.csv', './data-d/lawyer_id_fung.csv'),
]
for dest_file_name, source_file_name in uploads:
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(dest_file_name)
    blob.upload_from_filename(source_file_name)

# 4. Creating a Bigquery table

In [101]:
# Creating "lawyer_id_fung" table
bq_client = bigquery.Client()

# Column layout of the CSV being loaded; every column is loaded as declared here.
schema = [
    bigquery.SchemaField('lawyer_id', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('name_first', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('name_last', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('organization', 'STRING', 'NULLABLE', None, ())
]

dataset_id = 'adding_data'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '15_lawyer_id_fung'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the CSV starts with a header row
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Replace the table contents instead, as the query
# cells in this notebook already do.
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/lawyer_id_fung.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Block until the load job completes (raises if the job failed).
load_job.result()
print('Job has finished!')

Starting job c618b0f1-3551-4976-9b6f-799371298309
Job has finished!


In [102]:
# Creating "lawyer_patent_fung" table
bq_client = bigquery.Client()

# Column layout of the CSV being loaded; every column is loaded as declared here.
schema = [
    bigquery.SchemaField('patent_nr', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('lawyer_id', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('lawyer_seq', 'INTEGER', 'NULLABLE', None, ())
]

dataset_id = 'adding_data'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '15_lawyer_patent_fung'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the CSV starts with a header row
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Replace the table contents instead, as the query
# cells in this notebook already do.
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/lawyer_patent_fung.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Block until the load job completes (raises if the job failed).
load_job.result()
print('Job has finished!')

Starting job b3a7e078-65d0-4aa3-913b-be8e0434c104
Job has finished!


# Custom Disambiguation: Using Standard Disambiguation rules

## Getting the table of raw lawyer names from PAIR

In [7]:
# Build the table of distinct raw correspondence names together with a cleaned,
# upper-cased version of each name, and store it as `adding_data.15_rawlawyer_PAIR`.
client = bigquery.Client()
job_config = bigquery.QueryJobConfig()
job_config.use_query_cache = False
# Overwrite the destination table on every run.
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set Destination
dataset_id = 'adding_data'
table_id = '15_rawlawyer_PAIR'
table_ref = client.dataset(dataset_id).table(table_id)

job_config.destination = table_ref

# Raw string literal: the SQL regex patterns contain `\s`, which is an invalid
# escape sequence in an ordinary Python string (a warning today, an error in
# future Python versions). The text sent to BigQuery is unchanged.
query=r"""
WITH t1 AS(
SELECT *
FROM(
    SELECT UPPER(REGEXP_REPLACE(
                                REGEXP_REPLACE(
                                              REGEXP_REPLACE(correspondence_name_line_1, r'[^a-zA-Z\s]+', ''), 
                                              r'[\s]+', ' '),
                                r'(^\s+)|(\s+$)', ''
                                ) 
                ) AS lawyer,
            correspondence_name_line_1 AS raw_lawyer
    FROM `patents-public-data.uspto_oce_pair.correspondence_address` 
    GROUP BY correspondence_name_line_1 
)
GROUP BY raw_lawyer, lawyer
ORDER BY lawyer DESC
)

SELECT *
FROM t1
"""

query_job = client.query(query, location='US', job_config=job_config)
print('Query job has {} started!'.format(query_job.job_id))
# Block until the query completes (raises if the job failed).
query_job.result()
print('Job has finished!')

Query job has 31c3a50e-adf8-458c-a244-66d50245775b started!
Job has finished!


In [8]:
# Extracting table: export the BigQuery table to a CSV object in the Storage bucket
client = bigquery.Client()

# Set Source table
project_id = 'usptobias'
dataset_id = 'adding_data'
table_id = '15_rawlawyer_PAIR'
source_dataset = client.dataset(dataset_id, project=project_id)
table_ref = source_dataset.table(table_id)

# Set Destination
dest_bucket = 'uspto-data'
dest_file_name = '15_rawlawyer_PAIR.csv'
dest_uri = "gs://{}/{}".format(dest_bucket, dest_file_name)

extract_job = client.extract_table(table_ref, dest_uri, location='US')
started_message = 'Extract job has {} started!'.format(extract_job.job_id)
print(started_message)
# Wait for the export to complete before reporting success.
extract_job.result()
finished_message = 'Job has finished and table {} has been exported to {} bucket!'.format(dest_file_name, dest_bucket)
print(finished_message)

Extract job has a0288634-f589-4770-b348-2dfc1f40e34e started!
Job has finished and table 15_rawlawyer_PAIR.csv has been exported to uspto-data bucket!


# Loading Saved table

In [316]:
# Read the local copy of the exported raw-lawyer table.
# NOTE(review): no visible cell downloads this file from the Storage bucket —
# confirm it is fetched into ./data-d/ before this cell runs.
rawlawyer_csv_path = './data-d/15_rawlawyer_PAIR.csv'
df_rawlawyer = pd.read_csv(rawlawyer_csv_path, low_memory=False)
print('Number of records: {:,}'.format(len(df_rawlawyer)))
df_rawlawyer.head(2)

Number of records: 383,806


Unnamed: 0,lawyer,raw_lawyer
0,ZVI BEKERMAN,ZVI BEKERMAN
1,ZOE D ZIAKA AND,ZOE D. ZIAKA AND


In [328]:
# Replacing every run of non-alphabetic characters with one space, then upper-casing
letters_only = df_rawlawyer['raw_lawyer'].str.replace("[^a-zA-Z]+", " ", regex=True)
df_rawlawyer['lawyer'] = letters_only.str.upper()

In [332]:
# Adding one leading and one trailing space (for using in the rule-based disambiguation)
df_rawlawyer = df_rawlawyer.dropna()
df_rawlawyer['lawyer'] = df_rawlawyer['lawyer'].map(lambda name: " " + name + " ")
df_rawlawyer.shape

(383743, 2)

# Loading Standardization rules

These rules were downloaded from the following link:  
https://sites.google.com/site/patentdataproject/Home/posts/namestandardizationroutinesuploaded

In [34]:
import os
import re
import json
import chardet
import codecs

In [333]:
from zipfile import ZipFile
# Unpack the standardization rule files next to the other data.
with ZipFile('./data-d/stdname.zip', mode='r') as archive:
    archive.extractall(path='./data-d/stdname/')

In [None]:
data_folder = './data-d/stdname/'
# Sorted order matters: the rule files are later paired positionally with the `decoding` list.
files = os.listdir(data_folder)
files.sort()

In [335]:
# Build the {key -> value} standardization mapping from the rule files.
# A rule line is expected to contain two double-quoted strings; the pattern
# captures them as groups 1 and 2.
pattern = r'^.*\"(.*?)\".*?\"(.*?)\"'
rule_regex = re.compile(pattern)  # compiled once instead of twice per line
std_mapper = dict()

# One entry per rule file (in sorted filename order): which capture group is
# used as the dictionary key and which as the value.
decoding = [(2, 1), 
            (1, 2), 
            (1, 2), 
            (2, 1), 
            (1, 2), 
            (1, 2)]

# NOTE(review): zip() stops at the shorter input, so this assumes `files`
# holds exactly the len(decoding) rule files and nothing else — confirm.
for dec, file in zip(decoding, files):
    file_path = data_folder + file
    # Read the raw bytes to detect the encoding; `with` closes the handle
    # (it was previously left open).
    with open(file_path, "rb") as raw_file:
        encoding = chardet.detect(raw_file.read())['encoding']
    with codecs.open(file_path, 'r', encoding=encoding) as text_file:
        lines = text_file.readlines()
        for line in lines:
            match = rule_regex.match(line)
            if match is None:
                # Blank or non-rule line: skip it instead of failing with
                # "'NoneType' object is not subscriptable".
                continue
            key = match[dec[0]].rstrip()
            value = match[dec[1]].rstrip()
            std_mapper[key] = value

In [336]:
# Turn the rule dictionary into a two-column frame: `initial` holds the dictionary keys, `mapped` the values.
df_mapper = pd.DataFrame(std_mapper, index=['mapped']).T.reset_index(drop=False).rename(columns={'index':'initial'})
# Every replacement value is overwritten with a single space, so a matched key is blanked out rather than standardized.
df_mapper.mapped = ' '
# Append one trailing space to every key.
df_mapper.initial = df_mapper.initial.apply(lambda x: x+' ')
# Rebuild the lookup dictionary from the adjusted frame.
# NOTE(review): the keys are later passed to `replace(..., regex=True)` unescaped, so any regex metacharacter in a rule acts as a pattern — confirm this is intended.
std_mapper = df_mapper.dropna().set_index('initial')['mapped'].to_dict()

df_mapper.head(3)

Unnamed: 0,initial,mapped
0,& BRO,
1,& BROTHER,
2,& C,


In [338]:
import time
start_t = time.perf_counter()
# The rule dictionary is applied twice in a row; the second pass runs on the
# output of the first.
first_pass = df_rawlawyer.lawyer.replace(std_mapper, regex=True)
df_rawlawyer.lawyer = first_pass.replace(std_mapper, regex=True)
end_t = time.perf_counter()
diff_t = end_t - start_t
elapsed_hours, leftover_seconds = divmod(diff_t, 3600)
print('Total running time was {:,.0f} hours and {:.0f} minutes!'.format(elapsed_hours, leftover_seconds//60))

Total running time was 0 hours and 28 minutes!


In [342]:
# Remove the leading/trailing whitespace from the standardized names
df_rawlawyer['lawyer'] = df_rawlawyer['lawyer'].str.strip()

In [344]:
# Keep a separate copy of the frame at this stage (presumably a backup of the standardized names — not used in the visible cells).
df_rawlawyer_b = df_rawlawyer.copy()

In [346]:
# Getting unique disambiguated lawyers
unique_names = df_rawlawyer[['lawyer']].drop_duplicates()
df_lawyer_id = unique_names.reset_index(drop=True).copy()
# Adding unique ID to each lawyer: positional index shifted by 100000
df_lawyer_id = df_lawyer_id.reset_index(drop=False)
df_lawyer_id = df_lawyer_id.rename(columns={'index':'lawyer_id'})
df_lawyer_id['lawyer_id'] = df_lawyer_id['lawyer_id'] + 100000

print('Number of unique lawyers: {:,}'.format(len(df_lawyer_id)))
df_lawyer_id.head(2)

Number of unique lawyers: 263,553


Unnamed: 0,lawyer_id,lawyer
0,100000,ZVI BEKERMAN
1,100001,ZOE D ZIAKA AND


In [358]:
# Attach the generated `lawyer_id` to every raw name via its standardized name.
df_lawyer_merger = df_rawlawyer.merge(df_lawyer_id, how='left', on=['lawyer'])
print('Number of records: {:,}'.format(len(df_lawyer_merger)))
df_lawyer_merger.head(3)

Number of records: 383,743


Unnamed: 0,lawyer,raw_lawyer,lawyer_id
0,ZVI BEKERMAN,ZVI BEKERMAN,100000
1,ZOE D ZIAKA AND,ZOE D. ZIAKA AND,100001
2,ZIV KEDEM,Ziv Kedem,100002


In [360]:
# Saving the results locally, then copying them to the Storage bucket.
# The BigQuery load jobs that follow read these files from gs://uspto-data/,
# and no other visible cell uploads them, so the upload is done here.
pair_outputs = [
    (df_lawyer_id, 'lawyer_id_PAIR.csv'),
    (df_lawyer_merger, 'lawyer_merger_PAIR.csv'),
]

# Write both local files first so they exist even if the upload step fails.
for frame, file_name in pair_outputs:
    frame.to_csv('./data-d/' + file_name, encoding='utf-8', index=False)

pair_bucket = storage.Client().get_bucket('uspto-data')
for _, file_name in pair_outputs:
    pair_bucket.blob(file_name).upload_from_filename('./data-d/' + file_name)

## Creating Bigquery Tables

In [365]:
# Creating "15_lawyer_id_PAIR" table
bq_client = bigquery.Client()

# Column layout of the CSV being loaded; every column is loaded as declared here.
schema = [
    bigquery.SchemaField('attorney_id', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('attorney', 'STRING', 'NULLABLE', None, ())
]

dataset_id = 'adding_data'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '15_lawyer_id_PAIR'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the CSV starts with a header row
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Replace the table contents instead, as the query
# cells in this notebook already do.
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/lawyer_id_PAIR.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Block until the load job completes (raises if the job failed).
load_job.result()
print('Job has finished!')

Starting job 812188be-538b-45b0-92b7-a674e4d0bc21
Job has finished!


In [366]:
# Creating "15_lawyer_merger_PAIR" table
bq_client = bigquery.Client()

# Column layout of the CSV being loaded; every column is loaded as declared here.
schema = [
    bigquery.SchemaField('attorney', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('raw_attorney', 'STRING', 'NULLABLE', None, ()),
    bigquery.SchemaField('attorney_id', 'STRING', 'NULLABLE', None, ())
]

dataset_id = 'adding_data'
dataset_ref = bq_client.dataset(dataset_id)
dest_table_name = '15_lawyer_merger_PAIR'

job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.skip_leading_rows = 1  # the CSV starts with a header row
job_config.source_format = bigquery.SourceFormat.CSV
# Load jobs append to an existing table by default, so re-running this cell
# would duplicate every row. Replace the table contents instead, as the query
# cells in this notebook already do.
job_config.write_disposition = 'WRITE_TRUNCATE'
uri = "gs://uspto-data/lawyer_merger_PAIR.csv"

load_job = bq_client.load_table_from_uri(
    uri, dataset_ref.table(dest_table_name), job_config=job_config
)
print("Starting job {}".format(load_job.job_id))

# Block until the load job completes (raises if the job failed).
load_job.result()
print('Job has finished!')

Starting job 8a7adb46-68f7-4cec-980c-97e07f5c9cd0
Job has finished!


## Creating final `appln_lawyer` table

In [381]:
# Build `adding_data.15_appln_lawyer`: one row per application with the
# standardized attorney name, its id, and the correspondence region/country codes.
client = bigquery.Client()
job_config = bigquery.QueryJobConfig()
job_config.use_query_cache = False
# Overwrite the destination table on every run.
job_config.write_disposition = 'WRITE_TRUNCATE'

# Set Destination
dataset_id = 'adding_data'
table_id = '15_appln_lawyer'
table_ref = client.dataset(dataset_id).table(table_id)

job_config.destination = table_ref

# The raw correspondence names are joined to the merger table on `raw_attorney`.
# The LEFT JOIN is followed by `WHERE attorney IS NOT NULL`, so applications
# without a matching standardized name are dropped from the result.
query="""
WITH rlawyerAppln_table AS(
SELECT 
    application_number AS appln_nr,
    correspondence_name_line_1 AS raw_attorney,
    correspondence_region_code AS attorney_region_code,
    correspondence_country_code AS attorney_country_code
FROM `patents-public-data.uspto_oce_pair.correspondence_address`
), lawyerMerger_table AS(
    SELECT attorney, raw_attorney, attorney_id
    FROM `usptobias.adding_data.15_lawyer_merger_PAIR`
)

SELECT appln_nr, attorney, attorney_id, attorney_region_code, attorney_country_code
FROM rlawyerAppln_table
LEFT JOIN lawyerMerger_table USING(raw_attorney)
WHERE attorney IS NOT NULL
"""

query_job = client.query(query, location='US', job_config=job_config)
print('Query job has {} started!'.format(query_job.job_id))
# Block until the query completes.
query_job.result()
print('Job has finished!')

Query job has 6bfcd78b-eb10-47d2-8d72-1783e8c3fb4a started!
Job has finished!


# Using Vectorization to find the most frequent words in the lawyers list

In [348]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenize the standardized lawyer names: lower-cased, English stop words removed.
vectorizer = CountVectorizer(lowercase=True, stop_words="english")

# Sparse document-term count matrix: one row per entry of `df_lawyer_id.lawyer`.
matrix_sp = vectorizer.fit_transform(df_lawyer_id.lawyer)
# Creating Bag of Words Sparse Dataframe
# NOTE(review): `pd.SparseDataFrame` and `get_feature_names()` are no longer
# available in recent pandas / scikit-learn releases — confirm the pinned versions.
df_bow_sp = pd.SparseDataFrame(matrix_sp, default_fill_value=0, columns=vectorizer.get_feature_names())
print('Number of rows: {:,}\t and columns: {:,}'.format(df_bow_sp.shape[0], df_bow_sp.shape[1]))

Number of rows: 263,553	 and columns: 108,837


In [None]:
# Printing the 30-most frequent words
# `vectorizer.vocabulary_` maps each word to its COLUMN INDEX in the count
# matrix, not to a count, so the occurrence totals are taken from the column
# sums of `matrix_sp` and looked up through that index.
vocabulary = vectorizer.vocabulary_
term_counts = np.asarray(matrix_sp.sum(axis=0)).ravel()
df_freq = pd.DataFrame({
    'words': list(vocabulary.keys()),
    'frequency': [term_counts[column] for column in vocabulary.values()],
})
df_freq = df_freq.sort_values(by=['frequency'], ascending=False).reset_index(drop=True)
df_freq.head(30)

In [349]:
# Sum every column of the bag-of-words frame (one total per word) and convert the result to dense form.
df_freq = df_bow_sp.sum(axis=0).to_dense()
df_freq.shape

(108837,)

In [350]:
# Order the word totals from most to least frequent and expose them as a
# two-column frame (`words`, `frequency`).
df_freq_2 = (
    df_freq
    .sort_values(ascending=False)
    .reset_index(drop=False)
    .rename(columns={'index':'words', 0:'frequency'})
)
df_freq_2.head(3)

Unnamed: 0,words,frequency
0,esq,12658
1,john,7872
2,robert,6378


In [243]:
import spacy
# Load a spaCy pipeline through the "en" shortcut name; `nlp` is used by `apply_ent`.
# NOTE(review): shortcut names may not resolve on newer spaCy releases — confirm the installed version.
nlp = spacy.load("en")

def apply_ent(x):
    """Flag whether the first entity spaCy finds in a word has a label of interest.

    Parameters
    ----------
    x : str
        Word to classify; it is capitalized before being passed to `nlp`.

    Returns
    -------
    pd.Series
        One-element Series indexed by 'is_person': 1 when the first entity has
        label PERSON, ORG, GPE or PRODUCT; 0 otherwise, including when no
        entity is found.
    """
    doc = nlp(u"{}".format(x.capitalize()))
    entities = doc.ents
    # Test for "no entity found" explicitly instead of catching every
    # exception, so genuine errors (and Ctrl-C) still surface.
    if len(entities) > 0 and entities[0].label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT']:
        flag = 1
    else:
        flag = 0
    return pd.Series([flag], index=['is_person'])

In [244]:
# Run the entity check on every word and store the resulting flag.
df_freq_2['is_person'] = df_freq_2['words'].apply(apply_ent)

## Creating Bigquery tables

AttributeError: 'Client' object has no attribute 'job_config'

# Using Google Knowledge Graph API for disambiguation

In [163]:
from tqdm import tqdm_notebook

In [206]:
import json
import os
import time
import urllib
import urllib.parse
import urllib.request
import warnings

# The API key is read from the environment so it is never stored in the notebook.
# NOTE(review): any key that was previously committed in this cell should be
# revoked and re-issued.
api_key = os.environ.get("GKG_API_KEY", "")
if not api_key:
    warnings.warn("GKG_API_KEY is not set; Knowledge Graph requests will be sent without a key.")
service_url = 'https://kgsearch.googleapis.com/v1/entities:search'


def _gkg_fetch(url):
    """Return the decoded JSON body of `url`, closing the HTTP response afterwards."""
    with urllib.request.urlopen(url) as http_response:
        return json.loads(http_response.read())


def gkg_req(query, pbar):
    """Look up `query` in the Knowledge Graph Search API and summarize the top hit.

    Parameters
    ----------
    query : str
        Text to search for.
    pbar : object with an ``update(n)`` method
        Progress bar; advanced by one on every call.

    Returns
    -------
    pd.Series
        Keys ``ent_name``, ``ent_id``, ``ent_type``, ``ent_score`` and
        ``err_flag``. When there is no usable hit, ``ent_name`` falls back to
        `query`, ``ent_id``/``ent_score`` are NaN and ``ent_type`` is an empty
        list. ``err_flag`` is 1 when both request attempts failed, else 0.
    """
    params = {
        'query': query,
        'limit': 3,
        'indent': True,
        'key': api_key,
    }
    err_flag = 0
    url = service_url + '?' + urllib.parse.urlencode(params)

    # `except Exception` (not a bare `except`) keeps the best-effort retry but
    # lets KeyboardInterrupt stop a long-running batch.
    response = None
    try:
        response = _gkg_fetch(url)
    except Exception:
        # One retry after a short pause.
        time.sleep(3)
        try:
            response = _gkg_fetch(url)
        except Exception:
            err_flag = 1

    try:
        top_hit = response['itemListElement'][0]
        id_r = top_hit['result']['@id']
        name_r = top_hit['result']['name']
        type_r = top_hit['result']['@type']
        score_r = top_hit['resultScore']
    except (KeyError, IndexError, TypeError):
        # No response at all (None), an empty hit list, or a hit missing one of
        # the expected fields: fall back to the placeholder values.
        id_r = np.nan
        name_r = query
        type_r = []
        score_r = np.nan
    df = pd.Series(dict(ent_name=name_r, ent_id=id_r, ent_type=type_r, ent_score=score_r, err_flag=err_flag))

    pbar.update(1)
    return df
#for element in response['itemListElement']:
#      print(element['result']['name'] + ' (' + str(element['resultScore']) + ')')

In [212]:
# Start from an empty frame whose columns match the keys of the Series returned by `gkg_req`.
df_gkg = pd.DataFrame(columns=['ent_name', 'ent_id', 'ent_type', 'ent_score', 'err_flag'])

In [None]:
batch = 1000
N = 100000
# Resume after the rows that are already present in `df_gkg`.
idx_start = df_gkg.shape[0]

start_T = time.perf_counter()
pbar = tqdm_notebook(total=(N-idx_start))

for i in range(idx_start,N,batch):
    chunk = df_rawlawyer.iloc[i:i+batch].apply(lambda x: gkg_req(x.loc['raw_lawyer'], pbar), axis=1)
    # `pd.concat` is used because `DataFrame.append` is no longer available in recent pandas.
    df_gkg = pd.concat([df_gkg, chunk])
    # Throttle to at most 17,000 requests per 100 seconds on average.
    # Only the requests made in THIS run are counted: `start_T` is reset on
    # every run, so counting from row 0 would over-sleep after a resume.
    sent_this_run = i + batch - idx_start
    time.sleep(max(0, int(sent_this_run*100.0/17000)-(time.perf_counter()-start_T)))
_ = pbar.close()
df_gkg

In [305]:
# Write the collected Knowledge Graph results to a local CSV file (row index omitted).
df_gkg.to_csv('./data-d/data_gkg_lawyer.csv', encoding='utf-8', index=False)

In [251]:
# Collect every distinct entity type seen in the first 10,000 results.
set_opt = set()
for entity_types in df_gkg['ent_type'].iloc[:10000]:
    set_opt.update(set(entity_types))