# Configuration

In [None]:
!pip3 install --upgrade httplib2
!pip3 install --upgrade requests
!pip3 install nltk
!pip3 install pandas
!pip3 install regex
!pip3 install tensorflow

In [None]:
!pip3 install --user --upgrade httplib2
!pip3 install --user --upgrade requests
!pip3 install --user nltk
!pip3 install --user pandas
!pip3 install --user regex
!pip3 install --user tensorflow

In [None]:
!pip3 install --upgrade google-cloud-storage

In [None]:
!pip3 install --upgrade google-cloud-bigquery

In [None]:
!pip3 install pandas

In [1]:
import pandas as pd
import json
import numpy as np
import os
import re
import regex
import time
import math
from google.cloud import bigquery
from google.cloud import storage
from io import BytesIO, StringIO
import nltk
from pandas.io import gbq
from tensorflow.python.lib.io import file_io

In [2]:
#folder_json_all = 'gs://pdf-processing-219114/patents_train_fprost/json'
folder_json_all = 'gs://pdf-processing-219114/patents_all_english/json'

In [3]:
SERVICE_ACCOUNT_PATH = os.path.join(
        os.getcwd(),
        '../',
        '0_key/pdf-processing-apikey2.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_PATH

# Pull OCR data

In [4]:
import ast
import logging
from tensorflow.python.lib.io import file_io

logger = logging.getLogger(__name__)

def vis_b_v1p2_json_path_to_text(json_path):
  """get text from a beta v1p2 vision api document text detection json

  get document text from a google cloud vision API v1p2 document text detection
    call output json, where pages are joined by newlines. return a string.

  Args:
    json_path: gs:// uri or local file path to a .json file created by
      GCP vision API beta v1p2 document text extraction

  Returns:
    string of text from the .json, where pages are joined with /n
  """

  logger.debug('reading json file %s', json_path)

  raw_json = file_io.read_file_to_string(json_path)
  dict_json = ast.literal_eval(raw_json)  # json to dict
  full_text = []

  for response in dict_json['responses']:
    if 'fullTextAnnotation' in response.keys():
      if 'text' in response['fullTextAnnotation'].keys():
        full_text.append(response['fullTextAnnotation']['text'])
      else:
        logger.debug('%s has a fullTextAnnotation entry without a text field',
                     json_path)
        full_text.append('')
    else:
      logger.debug('%s has a response with no fullTextAnnotation field',
                   json_path)
      full_text.append('')

  full_text = '\n'.join(full_text)
  return full_text

In [5]:
def get_content(gcs_path):
    match = re.match(r'gs://([^/]+)/(.+)', gcs_path) # split bucket/path
    bucket_name = match.group(1)
    prefix = match.group(2)
    client = storage.Client()
    content_dict = {}  
    bucket = client.get_bucket(bucket_name)

    blob_list = list(bucket.list_blobs(prefix=prefix))
    for blob in blob_list:
        gcs_uris_list = []
        JSON_PATH = os.path.join('gs://', blob.bucket.name, blob.name)
        content_dict[JSON_PATH] = vis_b_v1p2_json_path_to_text(JSON_PATH)
    return content_dict     

In [6]:
content_dict = get_content(folder_json_all)

pdf_df = pd.DataFrame.from_dict(content_dict, orient='index').reset_index()
pdf_df.columns = ['pdf_uri', 'text']

In [7]:
pdf_df.shape

(196, 2)

In [8]:
pdf_df['filename'] = pdf_df['pdf_uri'].map(lambda x: os.path.basename(x))

In [9]:
pdf_df.head()

Unnamed: 0,pdf_uri,text,filename
0,gs://pdf-processing-219114/patents_all_english...,US010136408B1\n(12) United States Patent\nGree...,us_001.output-1-to-1.json
1,gs://pdf-processing-219114/patents_all_english...,US010143012B2\n(12) United States Patent\nStat...,us_047.output-1-to-1.json
2,gs://pdf-processing-219114/patents_all_english...,Europäisches\nPatentamt\nEuropean\nPatent Offi...,Espacenet_En51.output-1-to-1.json
3,gs://pdf-processing-219114/patents_all_english...,Europäisches\nPatentamt\nEuropean\nPatent Offi...,Espacenet_En38.output-1-to-1.json
4,gs://pdf-processing-219114/patents_all_english...,US010142921B2\n(12) United States Patent\nMcCa...,us_078.output-1-to-1.json


# Extract answer key

In [10]:
truth_path = "gs://input_data_pdf/truth_data_feb182019.csv"

In [11]:
def get_bucket_blob(full_path):
    match = re.match(r'gs://([^/]+)/(.+)', full_path)
    bucket_name = match.group(1)
    blob_name = match.group(2)
    return bucket_name, blob_name

def download_string(full_path, service_account):
  storage_client = storage.Client.from_service_account_json(service_account)
  bucket_name, path = get_bucket_blob(full_path)
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(path)
  byte_stream = BytesIO()
  blob.download_to_file(byte_stream)
  byte_stream.seek(0)
  return byte_stream

In [12]:
truth_byte_stream = download_string(truth_path, SERVICE_ACCOUNT_PATH)
df_truth = pd.read_csv(truth_byte_stream, encoding='utf-8')

In [13]:
found = 0
not_found =0
for _file in list(pdf_df['pdf_uri']):
    _file = os.path.basename(_file).replace('.output-1-to-1.json', '.pdf')
    if _file not in list(df_truth['file_name']):
        print (_file)
        not_found += 1
    else:
        found += 1

In [14]:
print((float(not_found)/(not_found + found)))
print(float(found))

0.0
196.0


# Retrieve relevant fields

In [15]:
df_truth.head()

Unnamed: 0,file_name,file_type,publication_date_dirty,publication_date,classification_1_dirty,classification_1,classification_2_dirty,classification_2,application_number_dirty,application_number,filing_date,priority,representative,applicant,inventor,titleFL,titleSL,abstractFL,publication_number
0,us_001.pdf,US,"Nov. 20, 2018","Nov. 20, 2018",H04W 64/00,H04W 64/00,H04W 64/003,H04W 64/003,679694,679694,"Aug. 17, 2017",,,Colby Green,Colby Green,DETERMINING HIGH VALUE,GEOGRAPHIC LOCATIONS,The present invention is a method and system o...,10136408
1,us_002.pdf,US,"Nov. 20, 2018","Nov. 20, 2018",H04W 36/18,H04W 36/18,H04W 36/0022,H04W 36/0022,599409,599409,"May. 18, 2017",,,"Essential Products, Inc.",Mara Clair Segal,MEDIA AND COMMUNICATIONS IN A,CONNECTED ENVIRONMENT,Switching the providing of a conversation amon...,10136364
2,us_003.pdf,US,"*Nov. 20, 2018","Nov. 20, 2018",H04W 12/12,H04W 12/12,H04W 12/12,H04W 12/12,920213,920213,"Mar. 13, 2018",,,United Services Automobile,Amanda S. Fernandez,LOCATION VERIFICATION BASED ON,ENVIRONMENTAL SENSOR DATA,Techniques are described for determining and/o...,10136327
3,us_004.pdf,US,"Nov. 20, 2018","Nov. 20, 2018",H04W 40/02,H04W 40/02,H04W 12/06,H04W 12/06,628883,628883,"Jun. 21, 2017",,,AT&T INTELLECTUAL,Paul R. Hancock,AUTHENTICATION DEVICE SELECTION,TO FACILITATE AUTHENTICATION VIA AN,Steering an authentication request to a determ...,10136318
4,us_005.pdf,US,"Nov. 20, 2018","Nov. 20, 2018",H04W 4/80,H04W 4/80,H04W 4/80,H04W 4/80,625786,625786,"Jun. 16, 2017",,,MICROSOFT TECHNOLOGY,Dikla Dotan-Cohen,SIGNAL SHARING BETWEEN TRUSTED,GROUPS OF DEVICES,Aspects of the technology described herein ide...,10136290


In [16]:
pdf_df.head()

Unnamed: 0,pdf_uri,text,filename
0,gs://pdf-processing-219114/patents_all_english...,US010136408B1\n(12) United States Patent\nGree...,us_001.output-1-to-1.json
1,gs://pdf-processing-219114/patents_all_english...,US010143012B2\n(12) United States Patent\nStat...,us_047.output-1-to-1.json
2,gs://pdf-processing-219114/patents_all_english...,Europäisches\nPatentamt\nEuropean\nPatent Offi...,Espacenet_En51.output-1-to-1.json
3,gs://pdf-processing-219114/patents_all_english...,Europäisches\nPatentamt\nEuropean\nPatent Offi...,Espacenet_En38.output-1-to-1.json
4,gs://pdf-processing-219114/patents_all_english...,US010142921B2\n(12) United States Patent\nMcCa...,us_078.output-1-to-1.json


In [17]:
import abc

class MatchFunction(object):
    
    @abc.abstractmethod
    def __init__(self, kwargs):
        return
    
    @abc.abstractmethod
    def find_match(self, pdf_text, search_value):
        # A match function should return a start position and the matched string.
        return


class GeneralMatch(MatchFunction):    
    def __init__(self):
        pass

    def find_match(self, pdf_text, search_value):
        pdf_text = pdf_text.lower()
        search_value = search_value.lower()
        
        match = re.search(re.compile(search_value), pdf_text)
        if match:
            start_index = pdf_text.find(match.group(0))
            return start_index, match.group(0)

        
class MatchClassification(MatchFunction):
    def __init__(self):
        pass
    
    def find_match(self, pdf_text, search_value):
        pdf_text = pdf_text.lower()
        search_value = search_value.lower()

        search_value = search_value.replace('0', '[0|o|q]')    # To handle OCR issues (0 -> O or Q)
        search_value = search_value.replace('7', '[7|z]')      # To handle OCR issues (7 -> Z)
        search_value = search_value.replace('6', '[6|o]')      # To handle OCR issues (6 -> O)
        search_value = search_value.replace('q ', '[q|0]\s')   # To handle OCR issues (Q_ -> 0_)      
        search_value = search_value.replace('/', '[/|1|7|\.]') # To handle OCR issues (/ -> 1 or 7 or .)

        match = re.search(re.compile(search_value), pdf_text)
        if match:
            start_index = pdf_text.find(match.group(0))
            return start_index, match.group(0)

assert MatchClassification().find_match('H04W 48/18', 'H04W 48/18')[1] == 'H04W 48/18'.lower()
assert MatchClassification().find_match('H04W 48118', 'H04W 48/18')[1] == 'H04W 48118'.lower()


class MatchClassification_v2(MatchFunction):
    """Matching function specific to classification_1.
    
    This field is more complicated as it is sometimes equal to classification 2
      or a substring of classification_2, therefore a simple match might give the wrong
      location.
    
    When we get several match, we return the first match after the position of IntCl
      for classification 1 and the the position after US CL for classification 2.
      
    Note: we use either 'Int\.? C[I|L|1]' or 'U.S. C[I|L|1]' as keyword pattern.
    """
    
    def __init__(self, pattern_keyword_before):
        self._pattern_keyword_before = pattern_keyword_before.lower()
    
    def _find_position_pattern(self, pdf_text):
        match = re.search(re.compile(self._pattern_keyword_before), pdf_text)
        if match:
            start_index = pdf_text.find(match.group(0))
            return start_index
    
    def find_match(self, pdf_text, search_value):
        
        pdf_text = pdf_text.lower()
        search_value = search_value.lower()
        search_value = search_value.replace('0', '[0|o|q]')    # To handle OCR issues (0 -> O or Q)
        search_value = search_value.replace('7', '[7|z]')      # To handle OCR issues (7 -> Z)
        search_value = search_value.replace('6', '[6|o]')      # To handle OCR issues (6 -> O)
        search_value = search_value.replace('q ', '[q|0]\s')   # To handle OCR issues (Q_ -> 0_)      
        search_value = search_value.replace('/', '[/|1|7|\.]') # To handle OCR issues (/ -> 1 or 7 or .)

        
        position_pattern = self._find_position_pattern(pdf_text)
        if position_pattern:
            for match in re.finditer(re.compile(search_value), pdf_text):
                if match.start() > position_pattern:
                    return match.start(), match.group()

class MatchTypo(MatchFunction):
    
    def __init__(self, tolerance=2):
        self._tolerance = tolerance
    
    def find_match(self, pdf_text, search_value):       
        pdf_text = pdf_text.lower()
        search_value = search_value.lower()

        r = regex.compile('(%s){e<=%i}'%(search_value, self._tolerance), flags=regex.BESTMATCH)
        match = r.search(pdf_text)
        if match:
            match_value = match.group(0)
            start_index = pdf_text.find(match_value)
            return start_index, match_value

assert MatchTypo().find_match('I am Flavien Prost in Google', 'Flavien Prost')[1] == 'flavien prost'
assert MatchTypo().find_match('I am Flavien Prost in Google', 'Flavian Prost')[1] == 'flavien prost'
assert MatchTypo().find_match('I am Flavien Prost in Google', 'Flavian Qrt') is None


class MatchApplicant(MatchFunction):
    def __init__(self):
        pass
    
    def find_match(self, pdf_text, search_value):
        pdf_text = pdf_text.lower()
        search_value = search_value.lower()

        search_value = search_value.replace(';', '[;|:]')
        search_value = search_value.replace('(', '\(').replace(')', '\)')

        match = re.search(re.compile(search_value), pdf_text)
        if match:
            start_index = pdf_text.find(match.group(0))
            return start_index, match.group(0)

# allow OCR to give : instead of ;
assert MatchApplicant().find_match('Dikla Dotan-Cohen, Herzliya (IL);', 
                                   'Dikla Dotan-Cohen, Herzliya (IL);')[1] == 'Dikla Dotan-Cohen, Herzliya (IL);'.lower()
assert MatchApplicant().find_match('Dikla Dotan-Cohen, Herzliya (IL):', 
                                   'Dikla Dotan-Cohen, Herzliya (IL);')[1] == 'Dikla Dotan-Cohen, Herzliya (IL):'.lower()



    

In [33]:
LIST_FIELDS = {
    'publication_date': GeneralMatch(),
    'classification_1': MatchClassification_v2(pattern_keyword_before='Int\.? C[I|L|1]'),
    'classification_2': MatchClassification_v2(pattern_keyword_before='U.S. C[I|L|1]'),
    'application_number': GeneralMatch(),
    'filing_date': MatchTypo(tolerance=1),
    'applicant': MatchTypo(),
    'inventor': MatchTypo(),
    'publication_number': GeneralMatch(),
    'priority': GeneralMatch(),
    'representative': GeneralMatch(),
    'titleFL': MatchTypo(),
    'titleSL': MatchTypo(),
    #'abstractFL': MatchTypo(),
}


In [34]:
for field  in LIST_FIELDS:
    match_fn = LIST_FIELDS[field]

    count_value_found = 0
    count_value_not_found = 0
    for _index in range(0, len(pdf_df)):
    
        _pdf_file = list(pdf_df['filename'])[_index].replace('.output-1-to-1.json', '.pdf')
            
        _text = list(pdf_df['text'])[_index]#.decode('utf-8')   #python3 issue?
        
        # Grab the value from ground truth
        selected_df = df_truth[df_truth['file_name']==_pdf_file]
        if len(selected_df) != 1:
            continue
        else:
            _field_value_in_truth = selected_df[field].iloc[0]
        
        if isinstance(_field_value_in_truth, float) and math.isnan(_field_value_in_truth):
            continue
        
        match = match_fn.find_match(_text, _field_value_in_truth)
        if match:
            found_match = True
        else:
            found_match = False
            #print(_pdf_file)
            #print("....................")
            #print(_text)
            #print("....................")
            #print(_field_value_in_truth)
            #print("....................")
            #print("....................")

        count_value_found += int(found_match)
        count_value_not_found += int(not found_match)
    
    recall = float(count_value_found)/ (count_value_found + count_value_not_found)
    print ('{}: {}'.format(field, recall))
    

publication_number: 0.9895833333333334
publication_date: 1.0
classification_2: 0.9791666666666666
priority: 1.0
titleSL: 1.0
application_number: 1.0
titleFL: 0.9795918367346939
applicant: 1.0
classification_1: 0.9846938775510204
filing_date: 1.0
inventor: 0.9897959183673469
representative: 0.9270833333333334


#  Construct JSONL

In [35]:
def create_jsonsl(pdf_text, value_dict):
    """Constructs the jsonl for a given pdf.
    
    Args:
      pdf_text: Text of the pdf.
      value_dict: a dictionary of fieldname: fieldvalue.
    """
    
    pdf_text = pdf_text.replace('"', '')
    jsonl = ['''{"annotations": [''']
    for field in value_dict:
        value_to_find = value_dict[field]
        
        if isinstance(value_to_find, float) and math.isnan(value_to_find):
            continue
            
        match_fn = LIST_FIELDS[field]
        match = match_fn.find_match(pdf_text, value_to_find)
        if match:
          start_index, match_value = match
          if (start_index != -1):
            end_index = start_index + len(match_value)
            jsonl.append('''{{"text_extraction": {{"text_segment": {{"end_offset": {}, "start_offset": {}}}}},"display_name": "{}"}},'''.format(
                end_index, start_index, field))

    
    jsonl[-1] = jsonl[-1].replace('"},', '"}') # Remove last comma
    jsonl.append(u'''],"text_snippet":{{"content": "{}"}}}}'''.format(pdf_text.replace('\n', '\\n')))
    
    jsonl_final = ''.join(jsonl)
    return jsonl_final

In [37]:
LIST_JSONL = []

for _index in range(0, len(pdf_df)):

    _pdf_file = list(pdf_df['filename'])[_index].replace('.output-1-to-1.json', '.pdf')
    _text = list(pdf_df['text'])[_index]#.decode('utf-8')
    
    selected_df = df_truth[df_truth['file_name']==_pdf_file]
    if len(selected_df) != 1:
        continue
        
    jsonl = create_jsonsl(
        _text,
        {_field: dict(selected_df)[_field].iloc[0] for _field in dict(selected_df) if _field in LIST_FIELDS}
    )
    LIST_JSONL.append(jsonl)
    
print (len(LIST_JSONL))

196


In [38]:
LIST_JSONL[:5]

['{"annotations": [{"text_extraction": {"text_segment": {"end_offset": 730, "start_offset": 710}},"display_name": "titleSL"},{"text_extraction": {"text_segment": {"end_offset": 121, "start_offset": 108}},"display_name": "publication_date"},{"text_extraction": {"text_segment": {"end_offset": 758, "start_offset": 747}},"display_name": "applicant"},{"text_extraction": {"text_segment": {"end_offset": 1235, "start_offset": 1224}},"display_name": "classification_2"},{"text_extraction": {"text_segment": {"end_offset": 1114, "start_offset": 1107}},"display_name": "application_number"},{"text_extraction": {"text_segment": {"end_offset": 709, "start_offset": 687}},"display_name": "titleFL"},{"text_extraction": {"text_segment": {"end_offset": 83, "start_offset": 73}},"display_name": "publication_number"},{"text_extraction": {"text_segment": {"end_offset": 1165, "start_offset": 1155}},"display_name": "classification_1"},{"text_extraction": {"text_segment": {"end_offset": 1140, "start_offset": 1127

# Upload JSONL

In [39]:
def save_jsonl_content(jsonl, full_gcs_path): 
    
    bucket_name, blob_name = get_bucket_blob(full_gcs_path)
    
    client = storage.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)
    bucket = client.get_bucket(bucket_name)
    blob_csv = bucket.blob(blob_name)

    blob_csv.upload_from_string(jsonl)

In [40]:
save_jsonl_content(
    "\n".join(LIST_JSONL), 'gs://modeling-work-v1/ner_model/all_english_022.jsonl')

In [None]:
# IF YOU WANT TO SPLIT YOURSELF
# from sklearn.model_selection import train_test_split

# #Split train test and validation jsonl files
# X_train, X_test = train_test_split(LIST_JSONL,test_size=0.2, random_state=42)
# X_validation, X_test = train_test_split(X_test, test_size=0.5, random_state=42)

# save_jsonl_content("\n".join(X_train), 'gs://modeling-work-v1/ner_model/train_0207.jsonl')
# save_jsonl_content("\n".join(X_validation), 'gs://modeling-work-v1/ner_model/eval_0207.jsonl')
# save_jsonl_content("\n".join(X_test), 'gs://modeling-work-v1/ner_model/test_0207.jsonl')

In [None]:
LIST_JSONL[30:50]

In [41]:
def get_content(gcs_path):
    match = re.match(r'gs://([^/]+)/(.+)', gcs_path) # split bucket/path
    bucket_name = match.group(1)
    prefix = match.group(2)
    client = storage.Client()
    content_dict = {}  
    bucket = client.get_bucket(bucket_name)

    blob_list = list(bucket.list_blobs(prefix=prefix))
    for blob in blob_list:
        gcs_uris_list = []
        JSON_PATH = os.path.join('gs://', blob.bucket.name, blob.name)
        content_dict[JSON_PATH] = vis_b_v1p2_json_path_to_text(JSON_PATH)
    return content_dict     

In [59]:
file = 'gs://pdf-processing-219114/patents_test_fprost/json/us_047.output-1-to-1.json'
x = get_content(file)

In [60]:
print (x[file])

US010143012B2
(12) United States Patent
Stattin et al.
(10) Patent No.: US 10,143,012 B2
(45) Date of Patent: Nov. 27, 2018
(54) RANDOM ACCESS PROCEDURE IN
WIRELESS DEVICE, RADIO BASE STATION
AND METHODS THEREIN
(52) U.S. CI.
CPC ..... H04W 74/0833 (2013.01); H04L 41/0654
(2013.01); H04W 74/08 (2013.01)
(58) Field of Classification Search
None
See application file for complete search history.
(71)
Applicant: Telefonaktiebolaget L M Ericsson
(publ), Stockholm (SE)
(56)
References Cited
U.S. PATENT DOCUMENTS
(72) Inventors: Magnus Stattin, Upplands Väsby (SE);
Gunnar Bergquist, Kista (SE); Tao
Cui, Upplands Väsby (SE); Mats Folke,
Vällingby (SE); Gunnar Mildh,
Sollentuna (SE); Elena Myhre, Järfälla
(SE); Mikael Wittberg, Uppsala (SE)
2009/0186624 Al*
2010/0202288 A1*
7/2009 Cave .................. H04L 1/1887
455/450
8/2010 Park
H04W 48/08
370/230
(Continued)
Tak
.......
(73) Assignee: Telefonaktiebolaget LM Ericsson
(publ), Stockholm (SE)
FOREIGN PATENT DOCUMENTS
(*) Notice:
Subject to 