In [57]:
import os
import pickle

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

In [2]:
# access to sql database
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook; the fallbacks
# are the original local-development values, so behaviour is unchanged when
# the variables are not set.
dbname = os.environ.get('PATENT_DB_NAME', 'patent_db')
username = os.environ.get('PATENT_DB_USER', 'jy')
pswd = os.environ.get('PATENT_DB_PASSWORD', 'jy')

# SQLAlchemy engine pointing at the local PostgreSQL server
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

In [15]:
# reading from sql database
# open the psycopg2 connection that the read queries below run on
con = None  # keeps the name bound even if connect() raises
con = psycopg2.connect(
    host='localhost',
    database=dbname,
    user=username,
    password=pswd,
)

In [28]:
# read data from 2004-2007 (np.arange excludes the stop value, 2008)
years = np.arange(2004, 2008)

# Import the classification label and the numerical features from each yearly
# table. The per-year frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration.
yearly_frames = []
for year in years:
    # query: a table name cannot be passed as a bound parameter, so the year
    # is formatted into the statement; it only ever comes from `years` above.
    sql_query = """
    SELECT classification, num_applications, 
        num_patent_citations, num_nonpatent_citations, 
        num_claims, num_similar_doc, payment_times
        FROM patents_%s;
    """ %str(year)

    results = pd.read_sql_query(sql_query,con)

    yearly_frames.append(results)

# dataframe storing the results of all years stacked row-wise; the row labels
# of each yearly frame are kept as returned (so they repeat across years),
# which is the same index the loop-wise concat produced.
patents = pd.concat(yearly_frames, axis = 0)

In [29]:
# sanity check: (rows, columns) of the frame combined from the yearly tables
patents.shape

(12033, 7)

In [30]:
# one-hot-encoding classifications
# drop_first=True removes the first level, leaving k-1 indicator columns for
# k distinct classification values
one_hot_class = pd.get_dummies(
    data=patents['classification'],
    drop_first=True,
)
one_hot_class.shape

(12033, 8)

In [46]:
# quantitative variables: every column except the categorical label
quant_cols = patents.drop(['classification'], axis=1)
quant_cols.shape

(12033, 6)

In [54]:
# check missing values in the quant columns
# there is no missing data, which is consistent with the data collection method
#
# isnull() is used rather than np.isnan so the check also works on columns
# whose dtype np.isnan rejects (e.g. object), and print is called in a form
# that is valid under both Python 2 and Python 3.
for i, col_name in enumerate(quant_cols.columns):
    # mean of the boolean mask = fraction (0.0-1.0) of missing entries
    percent_na = quant_cols.iloc[:, i].isnull().mean()

    print("%s %s" % (col_name, percent_na))

num_applications 0.0
num_patent_citations 0.0
num_nonpatent_citations 0.0
num_claims 0.0
num_similar_doc 0.0
payment_times 0.0


In [56]:
# build the model-training frame: the dummy-encoded classification columns
# first, then the quantitative columns, placed side by side (axis=1)
# NOTE: this rebinds `patents`; the frame that still contained the
# 'classification' column is no longer reachable under that name afterwards.
patents = pd.concat(
    [one_hot_class, quant_cols],
    axis=1,
)
patents.shape

(12033, 14)

In [61]:
# pickle dump the data for later use
# the with-block guarantees the file handle is closed (and the write flushed)
# even if pickling raises, instead of leaving an unclosed file object behind
with open("patent_data/nontext_features.p", "wb") as pickle_file:
    pickle.dump(patents, pickle_file)