In [1]:
import numpy as np
import pandas as pd
import pickle
# SQL related packages
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
# sklearn packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as Log
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV
# text analysis packages
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer

In [2]:
# access to sql database
dbname = 'patent_db'
username = 'jy'
pswd = 'jy'

engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

# reading from sql database
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

#### Abstract text first

In [3]:
# read data from 2004-2007
years = np.arange(2004, 2008)

# dataframe to store the results
abstracts = pd.DataFrame()

# import the abstract from each table
for year in years:
    # query:
    sql_query = """
    SELECT abstract, id, payment_times
        FROM patents_%s;
    """ %str(year)

    results = pd.read_sql_query(sql_query,con)
    
    abstracts = pd.concat([abstracts, results], axis = 0)
    
# check size of the data
abstracts.shape

(12033, 3)

In [4]:
# extract the response variable
# reformat the response variable into binary
y_data = np.zeros(abstracts.shape[0])
y_data[abstracts['payment_times'].values >= 2] = 1

print "Percentage of patents with > 1 maintenance fee payments: ", np.mean(y_data)

Percentage of patents with > 1 maintenance fee payments:  0.628521565694


In [5]:
# bag-of-words on the abstract data
# eliminate common stop words
vectorizer = CountVectorizer(stop_words='english')

# apply to abstracts
corpus = abstracts['abstract'].values

In [6]:
### count words in abstracts
word_vec = vectorizer.fit(corpus)
# convert to array
word_counts = word_vec.transform(corpus)
word_counts = word_counts.toarray()

In [7]:
### check the top words
word_counts_sum = np.sum(word_counts, axis = 0)
# order the words (ascending)
word_order = np.asarray(word_vec.get_feature_names())[word_counts_sum.argsort()]
# check the top 20 words
word_order[-20:]

array([u'data', u'material', u'comprising', u'device', u'disclosed',
       u'signal', u'layer', u'cell', u'used', u'cells', u'using',
       u'having', u'second', u'includes', u'provided', u'provides',
       u'present', u'methods', u'method', u'invention'], 
      dtype='<U48')

In [8]:
### remove words that appear in >50% of the patents
vectorizer = CountVectorizer(stop_words='english', max_df = 0.5, min_df=4)

# apply to abstracts
corpus = abstracts['abstract'].values

# count words in abstracts
word_vec = vectorizer.fit(corpus)
# convert to array
word_counts = word_vec.transform(corpus)
word_counts = word_counts.toarray()

#### Combine words with the same stemmer

In [9]:
stemmer = PorterStemmer()

#### Subsampling and spliting train-test data

In [9]:
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_data == 0)

#random shuffle the rows
n = word_counts.shape[0]
perm = range(n)
np.random.shuffle(perm)

word_counts = word_counts[perm]
y_data = y_data[perm]

# separate the two classes
x_useful = word_counts[y_data == 1, :]
x_not_useful = word_counts[y_data == 0, :]
y_useful = y_data[y_data == 1]
y_not_useful = y_data[y_data == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size]
y_useful = y_useful[:num_size]

# combine the two classes
x_data_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_data_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_data_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_data_sub = x_data_sub[perm2]
y_data_sub = y_data_sub[perm2]

# check the size
print x_data_sub.shape
print y_data_sub.shape

(8940, 10223)
(8940,)


In [10]:
# split the training and testing data
# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data_sub, y_data_sub, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (7152, 10223)
x_test:  (1788, 10223)
y_train:  (7152,)
y_test:  (1788,)


In [11]:
# try Naive Bayes with Gaussian Distribution
gnb = GaussianNB()

# fit on the training data
gnb.fit(x_train, y_train)

GaussianNB(priors=None)

In [12]:
# predict on the test data
y_pred = gnb.predict(x_test)

In [13]:
# accuracy
np.mean(y_pred == y_test)

0.58333333333333337

In [14]:
metrics.confusion_matrix(y_test, y_pred)

array([[582, 301],
       [444, 461]])

### PCA

In [15]:
# try dimensionality reduction using PCA
pca = PCA()

x_train_pca = pca.fit_transform(x_train)
y_train_pca = pca.fit(x_test)

In [16]:
# find the cum-variance explained at each level
total_var = np.cumsum(pca.explained_variance_ratio_)
n_pc = np.where((total_var > 0.9) == True)[0][0]

In [17]:
n_pc

787

In [18]:
### use logistic regression

# call the model function
model = Log()
# parameter tuning
c =  np.logspace(-5, 5, 11)

# use grid search with 5-fold CV
grid_model = GridSearchCV(model, param_grid = {'C': c}, cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train) 

In [20]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.556487695749
Best parameter:  {'C': 1.0000000000000001e-05}


In [21]:
# check confusion matrix
best_rf = grid_model.best_estimator_
y_pred = cross_val_predict(best_rf, x_train_pca[:, :n_pc], y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[2300, 1287],
       [1885, 1680]])

In [22]:
### tune random forest

model = RandomForestClassifier(n_estimators = 100)

# tune max_features
param_space = np.arange(2, 15, 2)

grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train)

In [23]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.556487695749
Best parameter:  {'max_features': 10}


In [24]:
# check confusion matrix
best_rf = grid_model.best_estimator_
y_pred = cross_val_predict(best_rf, x_train[:, :n_pc], y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[2252, 1335],
       [1933, 1632]])