In [34]:
# import textract
import re
from time import sleep
import os
import pandas as pd
import json
from google_patent_scraper import scraper_class
import requests
from bs4 import BeautifulSoup

## Fields to parse
1. **Patent No.:/Patent Number: begin with US and end with //n** like USOORE49167E\n\n
2. **Title (54)**
3. **Abstract:**
4. Applicant: (organization or individual that files the patent application is called the “applicant.”)
5. Assignee:  (the entity that has the property right to the patent.)
6. Inventors: (OPT)

## What we have now
1. backward_cite_no_family = citations
2. inventor_name 
3. assignee_name_orig
4. assignee_name_current
5. date (pub_date, priority_date, grant_date)
6. Patent Number
7. TODO: Title
8. Abstract


In [35]:
scraper=scraper_class(return_abstract=True) 
global scraper

In [36]:
def extract_patent_number_pdf(txt):
    # Extract patent numbers
    return re.findall(r'US\w+\n', txt)[0][:-1].replace("OO", "")

def dump_json(patents_info):
    # the json file where the output must be stored
    out_file = open("patent_information.json", "w")
    json.dump(patents_info, out_file, indent = 6)
    out_file.close()


def web_scraping_fields(text, file_format='excel'):
    # ~~ Scrape patents individually ~~ #
    if file_format =='pdf':
        patent_number = extract_patent_number_pdf(text)
    elif file_format =='excel':
        patent_number = text
    err_1, soup_1, url_1 = scraper.request_single_patent(patent_number)
    # ~ Parse results of scrape ~ #
    patent_parsed = scraper.get_scraped_data(soup_1,patent_number,url_1)
    # ~ Clean abstract ~ #
    patent_parsed["abstract_text"] = re.sub('\n', '', patent_parsed["abstract_text"])
    patent_parsed["abstract_text"] = re.sub(' +', ' ', patent_parsed["abstract_text"]).strip()
    # ~ Change the type to be list from string~ #
    patent_parsed["inventor_name"]      = json.loads(patent_parsed["inventor_name"])
    patent_parsed["assignee_name_orig"] = json.loads(patent_parsed["assignee_name_orig"])
    patent_parsed["patent_citations"] = json.loads(patent_parsed["backward_cite_no_family"])
    patent_parsed["assignee_name_current"] = json.loads(patent_parsed["assignee_name_current"])
    # Clean abstract_text and assignee_name_current
    patent_parsed["abstract_text"] = re.sub('\n', '', patent_parsed["abstract_text"])
    patent_parsed["abstract_text"] = re.sub(' +', ' ', patent_parsed["abstract_text"]).strip()
    patent_parsed["assignee_name_current"] = {"assignee_name": [re.sub('\n', '', asg['assignee_name'].strip()) 
                                              for asg in patent_parsed["assignee_name_current"]]}
    del patent_parsed["forward_cite_no_family"]
    del patent_parsed["forward_cite_yes_family"]
    del patent_parsed["backward_cite_no_family"]
    del patent_parsed["backward_cite_yes_family"]

    # Extract title
    url='https://patents.google.com/patent/{0}'.format(patent_number)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find_all('title')[0].get_text()
    title = re.sub(' +', ' ', title)
    patent_parsed['title'] = re.findall(r"-\s+(.*)\s+-", title)[0].strip()
    
    # Extract body message
    summary = soup.find_all("div", {"class": "description-paragraph"})
    body_message = [txt.get_text() for txt in summary]
    patent_parsed['body_message'] = ' '.join(body_message)
    
    return patent_parsed

def patent_information_extraction(files, file_format='excel'):
    failed_patents  = []
    success_list = []
    for file_name in files:
        #file_name = in_folder + file
        if file_format == 'pdf':
            text_id = textract.process(file_name).decode("utf8")
            text_id = text.replace('\\\\', '\\')
            text_id = re.sub('\\n\\n', '\\n', text_id)
        elif file_format == 'excel':
            text_id = file_name

        try:
            patent_parsed = web_scraping_fields(text_id)
            success_list.append(patent_parsed)
            #print(patent_parsed)
        except Exception as e:
            print(e)
            failed_patents.append(file_name)
        
        if len(success_list) == 1000 or len(success_list) == 5000 or len(success_list) == 10000 or len(success_list) == 15000 or len(success_list) == 20000 or len(success_list) == 25000 or len(success_list) == 30000 or len(success_list) == 35000:
#             print('dump')
            dump_json(success_list)
            
        sleep(1)
    return success_list, failed_patents



In [37]:
## Change this directory 
patent_df = pd.read_excel('../data/ev-batch1.xlsx', header=1)
file_id = [link.replace("-", "") for link in patent_df["id"]][:800]
print(len(file_id))
# Show top 10
file_id[:10]

800


['US9533588B2',
 'US9987944B2',
 'US10071639B2',
 'US9929440B2',
 'US10486690B2',
 'US10112603B2',
 'US10589736B2',
 'US10308240B2',
 'US10640103B2',
 'US10479180B2']

In [38]:
# Read the patent_information_5000.json file to find all patents
f = open("patent_information_5000.json")
# returns JSON object as a dictionary
patent_dict = json.load(f)
f.close()


In [39]:
# FRom 5000 files, choose only 600 files as query and find related citations 
citation_list = list(set([ref['patent_number'] for patent in patent_dict[:800] for ref in patent['patent_citations']]))
len(citation_list)

40832

In [40]:
# Print top 10 of citation list
citation_list[:10]

['US20130284531A1',
 'US7657828B2',
 'US20150091698A1',
 'US5983208A',
 'US8165891B2',
 'US20100125387A1',
 'US20180115196A1',
 'US7176654B2',
 'JP2003517158A',
 'US20120083917A1']

In [41]:
## Merge query(600 documents) with citation_list(33580)
file_id.extend(citation_list)
len(file_id)

41632

In [42]:
file_id[:800]

['US9533588B2',
 'US9987944B2',
 'US10071639B2',
 'US9929440B2',
 'US10486690B2',
 'US10112603B2',
 'US10589736B2',
 'US10308240B2',
 'US10640103B2',
 'US10479180B2',
 'US10507730B2',
 'US10763690B2',
 'US9834199B2',
 'US9073439B2',
 'US10440880B2',
 'US11159043B2',
 'US10756549B1',
 'US10879733B2',
 'US2022036020A1',
 'US2021162881A1',
 'US10699305B2',
 'US10131248B2',
 'US10173544B2',
 'US2019039479A1',
 'US2019061535A1',
 'US2019152338A1',
 'US10988013B2',
 'US9827972B2',
 'US11222251B2',
 'US9783076B2',
 'US10814734B2',
 'US10532663B2',
 'US9567034B2',
 'US10857887B2',
 'US9592742B1',
 'US11358484B2',
 'US2020143670A1',
 'US9126494B2',
 'US11054495B2',
 'US9545968B2',
 'US9296312B2',
 'US11177700B2',
 'US9575533B2',
 'US10017174B2',
 'US9845123B2',
 'US2020331536A1',
 'US11349313B2',
 'US11312444B2',
 'US11247568B2',
 'US10805659B2',
 'US10875411B2',
 'USRE48837E',
 'US10913371B2',
 'US10872361B2',
 'US11407320B2',
 'US10245964B2',
 'US10333318B2',
 'US11433772B2',
 'US10300805B2',

In [33]:
patents_info, failed_patents = patent_information_extraction(file_id, file_format='excel')

https://patents.google.com/patent/US9533588B2
https://patents.google.com/patent/US9987944B2
https://patents.google.com/patent/US10071639B2
https://patents.google.com/patent/US9929440B2
https://patents.google.com/patent/US10486690B2
dump


In [None]:
# the json file where the output must be stored
out_file = open("patent_information.json", "w")
json.dump(patents_info, out_file, indent = 6)
out_file.close()
print('Donwloaded {} files'.format(len(patents_info)))