In [1]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

import patent_scraper as ps

In [26]:
# Read the patent lists (one CSV per download, months 1, 4, 7 and 10 of 2004)
# and combine them into one dataframe.

# location and common filename prefix of the list files
base_dir = "patent_lists/patent_list_2004"

# The lists were downloaded every 3 months. The month is zero-padded in the
# file name (..._200401.csv, ..._200410.csv), so it is formatted with %02d
# instead of trimming the prefix for two-digit months.
quarterly_lists = [
    # skiprows=1: the first line of each file is skipped before the header row
    pd.read_csv("%s%02d.csv" % (base_dir, month), skiprows = 1)
    for month in range(1, 11, 3)
]

# Concatenate once rather than growing the frame inside a loop. The row labels
# of each file are kept (no ignore_index), so index labels repeat across files.
patent_list = pd.concat(quarterly_lists, axis = 0)

In [27]:
# check dimension of the list: (number of rows, number of columns) of the
# combined dataframe
patent_list.shape

(2996, 9)

In [28]:
# Empty accumulators for the scraped results, one list per field.
# The generator yields a fresh list for every name, so no two names share
# the same list object.
(abstract_list,
 patent_class_list,
 num_applications_list,
 patent_citations_list,
 non_patent_citations_list,
 description_list,
 num_claims_list,
 claim_content_list,
 similar_doc_num_list,
 payment_times_list) = ([] for _ in range(10))

In [29]:
# Scrape information for each patent.
#
# The result lists are reset at the top of this cell so it can be re-run
# safely. Without the reset, a second run (or a run after a failure part-way
# through) keeps appending to the previous contents, and the lists no longer
# line up with the rows of patent_list when they are added as columns.
abstract_list = []
patent_class_list = []
num_applications_list = []
patent_citations_list = []
non_patent_citations_list = []
description_list = []
num_claims_list = []
claim_content_list = []
similar_doc_num_list = []
payment_times_list = []

# result-page URL of every patent, in row order
result_urls = patent_list['result link'].values

for i, url in enumerate(result_urls):
    # track the progress
    if i % 500 == 0:
        print (i)

    # read the page
    soup = ps.generate_soup(url)

    # add number of maintenance payments
    payment_times_list.append(ps.find_maintainance_years(soup))
    # add abstract
    abstract_list.append(ps.find_patent_abstract(soup))
    # add top classification of the patent
    patent_class_list.append(ps.find_patent_class(soup))
    # add number of applications (as returned by ps.find_patent_applications)
    num_applications_list.append(ps.find_patent_applications(soup))
    # add number of patent and non-patent citations
    patent_citations, non_patent_citations = ps.find_citation_nums(soup)
    patent_citations_list.append(patent_citations)
    non_patent_citations_list.append(non_patent_citations)
    # add background and summary description
    description = ps.read_patent_content(soup)
    description_list.append(description)
    # add number of claims and the claim text
    num_claims, claim_content = ps.read_patent_claims(soup)
    num_claims_list.append(num_claims)
    claim_content_list.append(claim_content)
    # add number of similar documents
    similar_doc_num_list.append(ps.count_similar_documents(soup))

    # short pause between requests
    time.sleep(0.1)

0
500
1000
1500
2000
2500


In [30]:
# Attach every scraped field to the patent list as a new column.
# Pairs are (column name, values); the order here is the column order.
scraped_columns = [
    ("abstract", abstract_list),
    ("classification", patent_class_list),
    ("num_applications", num_applications_list),
    ("num_patent_citations", patent_citations_list),
    ("num_nonpatent_citations", non_patent_citations_list),
    ("description", description_list),
    ("num_claims", num_claims_list),
    ("claims", claim_content_list),
    ("num_similar_doc", similar_doc_num_list),
    ("payment_times", payment_times_list),
]

for column_name, column_values in scraped_columns:
    patent_list[column_name] = column_values

In [31]:
## store the patent data
## (a CSV export via to_csv was tried first, but the file is too big and
## cannot be read into pandas again, so the dataframe is pickled instead)

# The context manager closes the file handle even if the dump raises, so the
# handle is not left open for the rest of the kernel session.
with open("patent_data/patents_2004.p", "wb") as pickle_file:
    pickle.dump(patent_list, pickle_file)

In [39]:
# Reload the pickled dataframe and check its dimensions.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were written by this notebook.
with open("patent_data/patents_2004.p", "rb") as pickle_file:
    patent_list = pickle.load(pickle_file)
patent_list.shape

(2996, 19)

In [40]:
# Count the inventors of each patent: the 'inventor/author' text is split on
# commas and the number of pieces is the count.
num_authors = [
    len(names.split(","))
    for names in patent_list['inventor/author'].values
]

# store the counts as a new column
patent_list['num_authors'] = num_authors

patent_list.shape

(2996, 20)

In [42]:
# preview the first rows of the patent list, including the added columns
patent_list.head()

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,abstract,classification,num_applications,num_patent_citations,num_nonpatent_citations,description,num_claims,claims,num_similar_doc,payment_times,num_authors
0,US6699658B1,Yeast cell surface display of proteins and use...,Board Of Trustees Of The University Of Illinois,"K. Dane Wittrup, David M. Kranz, Michele Kieke...",1996-05-31,1998-01-20,2004-03-02,2004-03-02,https://patents.google.com/patent/US6699658B1/en,\n The present invention provides a geneti...,C,5,28,34,This application is a continuation-in-part of ...,42,1. A method for selecting proteins for display...,1,3,4
1,US6699724B1,Metal nanoshells for biosensing applications,Wm. Marsh Rice University,"Jennifer L. West, Nancy J. Halas, Steven J. Ol...",1998-03-11,2000-07-14,2004-03-02,2004-03-02,https://patents.google.com/patent/US6699724B1/en,\n The present invention provides nanoshel...,G,32,47,44,This application is a continuation-in-part of ...,25,1. A chemical sensing device comprising a plur...,0,3,4
2,US6690816B2,Systems and methods for tubular object process...,The University Of North Carolina At Chapel Hill,"Stephen R. Aylward, Elizabeth Bullitt, Stephen...",2000-04-07,2001-04-09,2004-02-10,2004-02-10,https://patents.google.com/patent/US6690816B2/en,\n Systems and methods are disclosed for p...,G,8,9,0,The present invention is directed generally to...,32,1. A method for processing at least one tubula...,1,1,4
3,US6711436B1,"Compositions, apparatus and methods for facili...",Duke University,Francis G. Duhaylongsod,1997-08-08,1999-09-27,2004-03-23,2004-03-23,https://patents.google.com/patent/US6711436B1/en,\n Methods are provided for conducting dia...,A,4,105,109,This application is a continuation-in-part of ...,45,1. A method of performing an aortic aneurysm r...,7,3,1
4,US6711432B1,Computer-aided orthopedic surgery,Carnegie Mellon University,"Norman M. Krause, Lee E. Weiss, Kenji Shimada,...",2000-10-23,2000-10-23,2004-03-23,2004-03-23,https://patents.google.com/patent/US6711432B1/en,\n Devices and methods for implementing co...,A,7,15,114,The present invention generally relates to dev...,44,1. A method of generating an updated surgical ...,3,3,4


In [4]:
# access to sql database
# Connection settings are read from environment variables so credentials can
# be supplied without editing the notebook. The fallbacks are the local
# development values that were previously hard-coded here; set
# PATENT_DB_PASSWORD (and the others) for any non-local database.
dbname = os.environ.get('PATENT_DB_NAME', 'patent_db')
username = os.environ.get('PATENT_DB_USER', 'jy')
pswd = os.environ.get('PATENT_DB_PASSWORD', 'jy')

# SQLAlchemy engine for the PostgreSQL database on localhost
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

In [36]:
# store patent data into the database
# if_exists='replace' makes this cell re-runnable: with the default ('fail'),
# to_sql raises ValueError once the 'patents_2004' table already exists.
patent_list.to_sql('patents_2004', engine, if_exists='replace')

In [None]:
# 2004 data has one corrupted row at position 518 (error when imported to SQL).
# Remove it by stitching together the rows before and after that position;
# the slices are positional, so repeated index labels do not matter.
bad_row = 518
rows_before = patent_list[0:bad_row]
rows_after = patent_list[bad_row + 1:]
patent_list_drop = pd.concat([rows_before, rows_after], axis = 0)
patent_list_drop.shape

# overwrite the table with the cleaned data
patent_list_drop.to_sql('patents_2004', engine, if_exists='replace')

In [20]:
# try reading from sql database
# query: count the patents whose payment_times is below 2
sql_query = """
SELECT COUNT(payment_times) FROM patents_2004 WHERE payment_times < 2;
"""

# connect, run the query, and always release the connection afterwards
# (the connection was previously left open for the rest of the session)
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)
try:
    patent_data_from_sql = pd.read_sql_query(sql_query,con)
finally:
    con.close()

patent_data_from_sql

Unnamed: 0,index,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,...,classification,num_applications,num_patent_citations,num_nonpatent_citations,description,num_claims,claims,num_similar_doc,payment_times,num_authors
