In [8]:
import getpass
import os
import time

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

import patent_scraper as ps

In [9]:
# Read the numbered patent list files and combine them into one dataframe.

# location and common name prefix of the files; the file number (1-5) and
# the ".csv" extension are appended below
base_dir = "patent_lists/patent_list_20120"

# Read every file first and concatenate once. Concatenating inside the loop
# copies all previously read rows again on each iteration and needs an empty
# placeholder frame as the starting point.
# skiprows=1 skips the first line of each file before the header is parsed.
partial_lists = [
    pd.read_csv(base_dir + str(file_number) + ".csv", skiprows = 1)
    for file_number in range(1, 6)
]

# Row labels are kept exactly as read from each file (they are not
# renumbered), which matches the previous behaviour.
patent_list = pd.concat(partial_lists, axis = 0)

In [10]:
# sanity check: (number of rows, number of columns) of the combined list
patent_list.shape

(2604, 9)

In [11]:
# Create one independent, empty list per scraped field; the scraping loop
# appends one entry per patent to each of them.
(
    abstract_list,
    patent_class_list,
    num_applications_list,
    patent_citations_list,
    non_patent_citations_list,
    description_list,
    num_claims_list,
    claim_content_list,
    similar_doc_num_list,
    payment_times_list,
) = ([] for _ in range(10))

In [12]:
# Walk through the patent list row by row, load each patent's result page
# and append the scraped fields to the result lists (one entry per patent).
num_patents = patent_list.shape[0]
for i in range(num_patents):
    # progress marker every 500 patents
    if not i % 500:
        print(i)

    # result page address of the i-th patent, parsed by the scraper module
    result_url = patent_list['result link'].values[i]
    page_soup = ps.generate_soup(result_url)

    # maintenance payment information
    payment_times_list.append(ps.find_maintainance_years(page_soup))
    # abstract
    abstract_list.append(ps.find_patent_abstract(page_soup))
    # top classification of the patent
    patent_class_list.append(ps.find_patent_class(page_soup))
    # number of applications, as reported by the scraper
    num_applications_list.append(ps.find_patent_applications(page_soup))

    # citation counts come back as a pair: patent / non-patent
    citation_counts = ps.find_citation_nums(page_soup)
    n_patent_cites, n_non_patent_cites = citation_counts
    patent_citations_list.append(n_patent_cites)
    non_patent_citations_list.append(n_non_patent_cites)

    # background and summary description
    description_list.append(ps.read_patent_content(page_soup))

    # claims come back as a pair: count / content
    n_claims, claims_text = ps.read_patent_claims(page_soup)
    num_claims_list.append(n_claims)
    claim_content_list.append(claims_text)

    # similar documents
    similar_doc_num_list.append(ps.count_similar_documents(page_soup))

    # wait half a second before requesting the next page
    time.sleep(0.5)

0
500
1000
1500
2000
2500


In [13]:
# Attach the scraped values to the patent list, one new column per result
# list. Columns are added in the order given here.
scraped_columns = [
    ("abstract", abstract_list),
    ("classification", patent_class_list),
    ("num_applications", num_applications_list),
    ("num_patent_citations", patent_citations_list),
    ("num_nonpatent_citations", non_patent_citations_list),
    ("description", description_list),
    ("num_claims", num_claims_list),
    ("claims", claim_content_list),
    ("num_similar_doc", similar_doc_num_list),
    ("payment_times", payment_times_list),
]
for column_name, column_values in scraped_columns:
    patent_list[column_name] = column_values

In [19]:
## Store the patent data as a CSV file (disabled):
## the resulting file is too big to be read back into pandas.
#patent_list.to_csv("patent_data/patents_2012.csv", encoding='utf-8')

In [14]:
# Connection settings for the local PostgreSQL database.
# The values are read from environment variables so that credentials are not
# stored in the notebook. Database name and user fall back to the previous
# defaults; the password is asked for interactively when
# PATENT_DB_PASSWORD is not set.
dbname = os.environ.get('PATENT_DB_NAME', 'patent_db')
username = os.environ.get('PATENT_DB_USER', 'jy')
pswd = os.environ.get('PATENT_DB_PASSWORD')
if pswd is None:
    pswd = getpass.getpass('Password for database user %s: ' % username)

# SQLAlchemy engine used for writing to the database.
# NOTE: user name and password are inserted into the URL as-is; characters
# such as '@' or '/' in them would have to be URL-escaped first.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

In [15]:
# Write the patent table to the database as table 'patents_2012';
# if_exists='replace' overwrites a table of that name if one already exists.
patent_list.to_sql(name='patents_2012', con=engine, if_exists='replace')

In [23]:
# Read a few rows back from the database to check that the table was stored.

# query: at most 20 rows of the stored table (no particular order requested)
sql_query = """
SELECT * FROM patents_2012 LIMIT 20;
"""

# Open a dedicated psycopg2 connection for the check and always close it
# again, so that re-running this cell does not leave connections open.
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)
try:
    patent_data_from_sql = pd.read_sql_query(sql_query,con)
finally:
    con.close()

# display the rows that were read back
patent_data_from_sql

Unnamed: 0,index,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,abstract,classification,num_applications,num_patent_citations,num_nonpatent_citations,description,num_claims,claims,num_similar_doc,payment_times
0,0,US8103611B2,"Architectures, systems, apparatus, methods, an...",New York University,"Alexander Tuzhilin, Gediminas Adomavicius",1997-11-14,2009-09-03,2012-01-24,2012-01-24,https://patents.google.com/patent/US8103611B2/en,\n Exemplary non-transitory computer-reada...,G,15,26,43,This application is a divisional application o...,52,a) accessing the multidimensional data which d...,11,1
1,1,US8098201B2,Radio frequency identification tag and radio f...,Electronics & Telecommunications Research Inst...,"Won Kyu CHOI, Jeong Seok Kim, Gil Young CHOI, ...",2007-11-29,2008-06-09,2012-01-17,2012-01-17,https://patents.google.com/patent/US8098201B2/en,\n An RFID tag includes an antenna and a c...,H,3,16,1,This application claims priority to and the be...,13,a first polygonal dielectric material having a...,16,1
2,2,US8096654B2,Active contact lens,University Of Washington Through Its Center Fo...,"Babak Amirparviz, Harvey Ho, Ehsan Saeedi",2007-03-07,2009-09-04,2012-01-17,2012-01-17,https://patents.google.com/patent/US8096654B2/en,\n An active contact lens system ( 100 ) a...,G,3,9,1,This application is a continuation of Internat...,20,a transparent substrate shaped to be worn dire...,16,1
3,3,US8093064B2,Method for using magnetic particles in droplet...,The Regents Of The University Of California,"Gaurav Jitendra Shah, Chang-Jin Kim",2008-05-15,2009-05-14,2012-01-10,2012-01-10,https://patents.google.com/patent/US8093064B2/en,\n Methods of utilizing magnetic particles...,B,19,8,14,This Application claims priority to U.S. Provi...,14,moving a meniscus of the droplet back and fort...,3,1
4,134,US8176786B2,"Methods, apparatuses, and systems for damage d...",Carnegie Mellon University,"Hoon Sohn, Seungbum Kim",2006-06-30,2007-06-29,2012-05-15,2012-05-15,https://patents.google.com/patent/US8176786B2/en,"\n Methods, apparatuses, and systems for d...",G,14,11,35,This application claims priority from Internat...,32,generating a first acoustic signal from a firs...,0,1
5,4,US8102406B2,Method and system for producing a video synopsis,Yissum Research Development Company Of The Heb...,"Shmuel Peleg, Alexander Rav-Acha",2005-11-15,2006-11-15,2012-01-24,2012-01-24,https://patents.google.com/patent/US8102406B2/en,\n A computer-implemented method and syste...,G,6,14,13,This application claims the benefit of U.S. pr...,18,(a) obtaining a subset of video frames in said...,10,1
6,5,US8095508B2,Intelligent data storage and processing using ...,Washington University,"Roger D. Chamberlain, Mark Allen Franklin, Ron...",2000-04-07,2004-05-21,2012-01-10,2012-01-10,https://patents.google.com/patent/US8095508B2/en,\n A data storage and retrieval device and...,G,3,290,180,This application claims the benefit of provisi...,102,a processing device; and a computer system hav...,13,1
7,6,US8106375B2,Resistance-switching memory based on semicondu...,The Trustees Of The University Of Pennsylvania,"I-Wei Chen, Yudi Wang, Soo Gil Kim",2005-11-30,2005-11-30,2012-01-31,2012-01-31,https://patents.google.com/patent/US8106375B2/en,"\n Resistance-switching oxide films, and d...",H,5,7,8,The invention generally relates to semiconduct...,117,at least about 75 atomic percent of an insulat...,4,1
8,7,US8090164B2,"Systems, methods, and computer program product...",The University Of North Carolina At Chapel Hill,"Elizabeth Bullitt, Stephen Aylward",2003-08-25,2004-07-16,2012-01-03,2012-01-03,https://patents.google.com/patent/US8090164B2/en,"\n Systems, methods, and computer program ...",G,4,22,112,This nonprovisional application claims the ben...,56,(a) developing an atlas including at least one...,1,1
9,8,US8090160B2,Automated method for human face modeling and r...,The University Of Houston System,"Ioannis A. Kakadiaris, George Toderici, Theoha...",2007-10-12,2008-10-13,2012-01-03,2012-01-03,https://patents.google.com/patent/US8090160B2/en,\n A novel method and system for 3d-aided-...,G,5,7,0,This application claim priority to and the ben...,14,enrolling a face of a subject into a gallery d...,5,1
