In [2]:
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as Log
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict, GridSearchCV

#### Load the data

In [3]:
# read the data with the non-text features
patents = pd.read_pickle("patent_data/nontext_features.p")

In [4]:
print patents.shape
patents.head()

(12030, 16)


Unnamed: 0,id,publication_year,B,C,D,E,F,G,H,num_applications,num_patent_citations,num_nonpatent_citations,num_claims,num_similar_doc,num_authors,payment_times
0,US6699658B1,2004,0,1,0,0,0,0,0,5,28,34,42,1,4,3
1,US6699724B1,2004,0,0,0,0,0,1,0,32,47,44,25,0,4,3
2,US6690816B2,2004,0,0,0,0,0,1,0,8,9,0,32,1,4,1
3,US6711436B1,2004,0,0,0,0,0,0,0,4,105,109,45,7,1,3
4,US6711432B1,2004,0,0,0,0,0,0,0,7,15,114,44,3,4,3


#### Format the data

In [5]:
# reformat the response variable into binary
y_data = np.zeros(patents.shape[0])
y_data[patents['payment_times'].values >= 2] = 1

print "Percentage of patents with > 1 maintenance fee payments: ", np.mean(y_data)

Percentage of patents with > 1 maintenance fee payments:  0.62859517872


In [6]:
# predictors
x_data = patents.drop(['id', 'payment_times'], axis = 1).values
x_data.shape

(12030, 14)

In [8]:
### subsampling the data
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_data == 0)

#random shuffle the rows
n = x_data.shape[0]
perm = range(n)
np.random.shuffle(perm)

x_data = x_data[perm]
y_data = y_data[perm]

# separate the two classes
x_useful = x_data[y_data == 1, :]
x_not_useful = x_data[y_data == 0, :]
y_useful = y_data[y_data == 1]
y_not_useful = y_data[y_data == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size]
y_useful = y_useful[:num_size]

# combine the two classes
x_data_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_data_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_data_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_data_sub = x_data_sub[perm2]
y_data_sub = y_data_sub[perm2]

# check the size
print x_data_sub.shape
print y_data_sub.shape

(8936, 14)
(8936,)


In [9]:
# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data_sub, y_data_sub, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (7148, 14)
x_test:  (1788, 14)
y_train:  (7148,)
y_test:  (1788,)


### Model training
#### Basic Logistic Regression

In [10]:
### use logistic regression

# call the model function
model = Log()
# parameter tuning
c =  np.logspace(-5, 5, 11)

# use grid search with 5-fold CV
grid_model = GridSearchCV(model, param_grid = {'C': c}, cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train, y_train) 

In [11]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.565332960269
Best parameter:  {'C': 0.10000000000000001}


In [12]:
# check confusion matrix
best_log = grid_model.best_estimator_
y_pred = cross_val_predict(best_log, x_train, y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[2122, 1452],
       [1655, 1919]])

#### Logistic Regression with Balanced Weight

In [13]:
# call the model function with balanced weight
model = Log(class_weight='balanced')
# parameter tuning
c =  np.logspace(-5, 5, 11)

# use grid search with 5-fold CV
grid_model = GridSearchCV(model, param_grid = {'C': c}, cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train, y_train) 

In [14]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

# check confusion matrix
best_log = grid_model.best_estimator_
y_pred = cross_val_predict(best_log, x_train, y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

Best accuracy: 0.565332960269
Best parameter:  {'C': 0.10000000000000001}


array([[2122, 1452],
       [1655, 1919]])

#### Random Forest

In [15]:
### tune random forest

model = RandomForestClassifier(n_estimators = 100)

# tune max_features
param_space = np.arange(2, 15, 2)

grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train, y_train)

In [16]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.539171796307
Best parameter:  {'max_features': 2}


In [17]:
# check confusion matrix
best_rf = grid_model.best_estimator_
y_pred = cross_val_predict(best_rf, x_train, y_train, cv = 5)

metrics.confusion_matrix(y_train, y_pred)

array([[1991, 1583],
       [1669, 1905]])