In [1]:
import numpy as np
import pandas as pd
import pickle
# SQL related packages
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
# sklearn packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as Log
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV
# text analysis packages
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec

In [2]:
# access to sql database
dbname = 'patent_db'
username = 'jy'
pswd = 'jy'

engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

# reading from sql database
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

In [3]:
# read data from 2004-2007
years = np.arange(2004, 2008)

# dataframe to store the results
abstracts = pd.DataFrame()

# import the abstract from each table
for year in years:
    # query:
    sql_query = """
    SELECT abstract, id, payment_times
        FROM patents_%s;
    """ %str(year)

    results = pd.read_sql_query(sql_query,con)
    
    abstracts = pd.concat([abstracts, results], axis = 0)
    
# check size of the data
abstracts.shape

(12033, 3)

In [4]:
# extract the response variable
# reformat the response variable into binary
y_data = np.zeros(abstracts.shape[0])
y_data[abstracts['payment_times'].values >= 2] = 1

print "Percentage of patents with > 1 maintenance fee payments: ", np.mean(y_data)

Percentage of patents with > 1 maintenance fee payments:  0.628521565694


In [4]:
# load the GoogleNews pre-trained model
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', 
                                                 binary=True) 

In [9]:
# function to tokenize and preprocess the text data
# it turns text into lowercase, removes stopwords and punctuation
# it does not stem the words (because the pre-trained model does not do that)
def tokenize_cleaning(text):
    # tokenize the text first
    tokens = word_tokenize(text.decode('utf-8'))
    
    # lowercase all the words
    tokens = [w.lower() for w in tokens]
    
    # clean up stop words and punctuations 
    #(but not including'-', because terminologies may include '-')
    stop_list = stopwords.words('english') + list(string.punctuation)

    tokens_no_stop = [token for token in tokens
                        if token not in stop_list]
    
    ## extract stem of the words
    #tokens_stem = [stemmer.stem(token) for token in tokens_no_stop]
    
    return tokens_no_stop

In [13]:
# process on the abstract data
# tokenize_clean the abstracts and count the occurence of the words

result = []
for i in range(abstracts.shape[0]):
    tokens = tokenize_cleaning(abstracts['abstract'].iloc[i])
    result.append(tokens)

In [24]:
# compute average word-vector for a text
def dec_vec(model, text):
    # store the vector for each word
    vectors = []
    
    # compute on each word
    for j in range(len(result[0])):
        try:
            vectors.append(word_vectors[result[0][j]])
        except:
            continue
    
    # reshape the result for easy computation   
    test = np.asarray(vectors)
    # return vector mean
    return np.mean(test, axis = 0)

In [30]:
abstract_vec = []

for i in range(abstracts.shape[0]):
    vec = dec_vec(word2vec, result[i])
    abstract_vec.append(vec)

In [33]:
abstract_vec = np.asarray(abstract_vec)
abstract_vec.shape

(12033, 300)

In [43]:
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_data == 0)

#random shuffle the rows
n = abstract_vec.shape[0]
perm = range(n)
np.random.shuffle(perm)

abstract_vec = abstract_vec[perm]
y_data = y_data[perm]

# separate the two classes
x_useful = abstract_vec[y_data == 1, :]
x_not_useful = abstract_vec[y_data == 0, :]
y_useful = y_data[y_data == 1]
y_not_useful = y_data[y_data == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size]
y_useful = y_useful[:num_size]

# combine the two classes
x_data_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_data_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_data_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_data_sub = x_data_sub[perm2]
y_data_sub = y_data_sub[perm2]

# check the size
print x_data_sub.shape
print y_data_sub.shape

(8940, 300)
(8940,)


In [44]:
# split the training and testing data
# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data_sub, y_data_sub, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (7152, 300)
x_test:  (1788, 300)
y_train:  (7152,)
y_test:  (1788,)


In [45]:
### use logistic regression

# call the model function
model = Log()
# parameter tuning
c =  np.logspace(-5, 5, 11)

# use grid search with 5-fold CV
grid_model = GridSearchCV(model, param_grid = {'C': c}, cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train, y_train) 

In [46]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.502237136465
Best parameter:  {'C': 1.0000000000000001e-05}


In [47]:
# check confusion matrix
best_rf = grid_model.best_estimator_
y_pred = cross_val_predict(best_rf, x_train, y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[3592,    0],
       [3560,    0]])

In [48]:
### tune random forest

model = RandomForestClassifier(n_estimators = 100)

# tune max_features
param_space = np.arange(2, 15, 2)

grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train, y_train)

In [49]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.502237136465
Best parameter:  {'max_features': 2}


In [51]:
# check confusion matrix
best_rf = grid_model.best_estimator_
y_pred = cross_val_predict(best_rf, x_train, y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[3592,    0],
       [3560,    0]])