In [4]:
import numpy as np
import pandas as pd
import pickle
# SQL related packages
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
# sklearn packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression as Log
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV
# text analysis packages
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# access to sql database
dbname = 'patent_db'
username = 'jy'
pswd = 'jy'

engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

# reading from sql database
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

#### Extract abstracts first

In [3]:
# read data from 2004-2007
years = np.arange(2004, 2008)

# dataframe to store the results
abstracts = pd.DataFrame()

# import the abstract from each table
for year in years:
    # query:
    sql_query = """
    SELECT abstract, id, payment_times
        FROM patents_%s;
    """ %str(year)

    results = pd.read_sql_query(sql_query,con)
    
    abstracts = pd.concat([abstracts, results], axis = 0)
    
# check size of the data
abstracts.shape

(12033, 3)

In [90]:
# extract the response variable
# reformat the response variable into binary
y_data = np.zeros(abstracts.shape[0])
y_data[abstracts['payment_times'].values >= 2] = 1

print "Percentage of patents with > 1 maintenance fee payments: ", np.mean(y_data)

Percentage of patents with > 1 maintenance fee payments:  0.628521565694


#### Tokenization and cleaning

In [35]:
def tokenize_cleaning(text):
    # tokenize the text first
    tokens = word_tokenize(text.decode('utf-8'))
    
    # lowercase all the words
    tokens = [w.lower() for w in tokens]
    
    # clean up stop words and punctuations 
    stop_list = stopwords.words('english') + list(string.punctuation)

    tokens_no_stop = [token for token in tokens
                        if token not in stop_list]            
    
#     # extract stem of the words
#     stemmer = PorterStemmer()
#     tokens_stem = [stemmer.stem(token) for token in tokens_no_stop]

    # use lemma instead
    # reason: remove the influence of plural or tense
    # but retain the subtle difference in legal writting
    lemmatizer = WordNetLemmatizer()
    tokens_lemma = [lemmatizer.lemmatize(token) for token in tokens_no_stop]
    
    # remove numbers (the actual values are not useful)
    tokens_no_num = []
    for token in tokens_lemma:
        try:
            float(token)
        except:
            tokens_no_num.append(token)
    
    return tokens_no_num

In [26]:
# tokenize_clean the abstracts and count the occurence of the words

cleaned_text = []
for i in range(abstracts.shape[0]):
    tokens = tokenize_cleaning(abstracts['abstract'].iloc[i])
    cleaned_text.append(' '.join(word for word in tokens))

In [82]:
# convert to bag-of-words
# min value set by inspecting the words
vectorizer = CountVectorizer(max_df = 0.5, min_df=2)
#vectorizer = CountVectorizer()

# performe a count-based vectorization of the document
word_vec = vectorizer.fit(cleaned_text)
word_counts = word_vec.transform(cleaned_text)
# convert to array
word_counts = word_counts.toarray()

word_counts.shape

(12033, 15167)

In [83]:
# still need to remove some number words 
# due to how CountVectorizer treats '-' and '/'

# remove any word with numbers in it
words = word_vec.get_feature_names()
num_word_index = np.zeros(len(words))

for i in range(len(words)):
    word = words[i]
    for j in range(len(word)):
        try:
            float(word[j])
            num_word_index[i] = 1
            break
        except:
            continue
        
print "Number of numerical words detected: ", int(np.sum(num_word_index))

Number of numerical words detected:  447


In [84]:
# remove the number words
words_no_num = np.asarray(words)[num_word_index == 0]
word_counts = word_counts[:, num_word_index == 0]

word_counts.shape

(12033, 14720)

In [88]:
# # check the distribution of word occurance
# total_counts = np.sum(word_counts, axis = 0)
# plt.hist(np.transpose(total_counts))

In [86]:
# # inspect the words
# df = pd.DataFrame({
#     'word': words_no_num,
#     'count': total_counts
# })

# df.sort('count')

In [115]:
# tf_vectorizer = TfidfVectorizer()
# tfidf_article = tf_vectorizer.fit_transform(cleaned_text)
# tfidf_article = tfidf_article.toarray()

# print tfidf_article.shape

(12033, 29829)


#### Spliting train-test data and subsampling

In [91]:
# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(word_counts, y_data, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (9626, 14720)
x_test:  (2407, 14720)
y_train:  (9626,)
y_test:  (2407,)


In [92]:
### subsampling the training data
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_train == 0)

#random shuffle the rows
n = x_train.shape[0]
perm = range(n)
np.random.shuffle(perm)

x_train = x_train[perm]
y_train = y_train[perm]

# separate the two classes
x_useful = x_train[y_train == 1, :]
x_not_useful = x_train[y_train == 0, :]
y_useful = y_train[y_train == 1]
y_not_useful = y_train[y_train == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size]
y_useful = y_useful[:num_size]

# combine the two classes
x_train_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_train_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_train_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_train_sub = x_train_sub[perm2]
y_train_sub = y_train_sub[perm2]

# check the size
print x_train_sub.shape
print y_train_sub.shape

(7136, 14720)
(7136,)


In [93]:
# try Naive Bayes with Gaussian Distribution
gnb = GaussianNB()

# fit on the training data
gnb.fit(x_train_sub, y_train_sub)

GaussianNB(priors=None)

In [94]:
# predict on the test data
y_pred = gnb.predict(x_test)

In [95]:
# accuracy
np.mean(y_pred == y_test)

0.54881595346904866

In [96]:
metrics.confusion_matrix(y_test, y_pred)

array([[580, 322],
       [764, 741]])

### PCA

In [97]:
# try dimensionality reduction using PCA
pca = PCA()

x_train_pca = pca.fit_transform(x_train_sub)
x_test_pca = pca.transform(x_test)

In [98]:
# find the cum-variance explained at each level
total_var = np.cumsum(pca.explained_variance_ratio_)
n_pc = np.where((total_var > 0.9) == True)[0][0]

In [99]:
n_pc

1636

In [100]:
### use logistic regression

# call the model function
model = Log()
# parameter tuning
c =  np.logspace(-5, 5, 11)

# use grid search with 5-fold CV
grid_model = GridSearchCV(model, param_grid = {'C': c}, cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train_sub) 

In [101]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.55591367713
Best parameter:  {'C': 0.01}


In [104]:
# check confusion matrix
best_log = grid_model.best_estimator_
best_log.fit(x_train_pca[:, :n_pc], y_train_sub)
y_pred = best_log.predict(x_test_pca[:, :n_pc])

# accuracy
print "Test accuracy: ", np.mean(y_pred == y_test)
print "F1 score: ", metrics.f1_score(y_test, y_pred)
print "Precision: ", metrics.precision_score(y_test, y_pred)
print "Recall: ", metrics.recall_score(y_test, y_pred)
metrics.confusion_matrix(y_test, y_pred)

Test accuracy:  0.546738678853
F1 score:  0.586272279105
Precision:  0.682862190813
Recall:  0.513621262458


array([[543, 359],
       [732, 773]])

In [107]:
### tune random forest

model = RandomForestClassifier(n_estimators = 100)

# tune max_features
param_space = np.arange(10, 80, 10)

grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train_sub)
#grid_model = grid_model.fit(x_train_sub, y_train_sub)

In [108]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.563340807175
Best parameter:  {'max_features': 50}


In [109]:
# check confusion matrix
best_rf = grid_model.best_estimator_
best_rf.fit(x_train_pca[:, :n_pc], y_train_sub)
y_pred = best_rf.predict(x_test_pca[:, :n_pc])

# accuracy
print "Test accuracy: ", np.mean(y_pred == y_test)
print "F1 score: ", metrics.f1_score(y_test, y_pred)
print "Precision: ", metrics.precision_score(y_test, y_pred)
print "Recall: ", metrics.recall_score(y_test, y_pred)
metrics.confusion_matrix(y_test, y_pred)

Test accuracy:  0.553801412547
F1 score:  0.606304985337
Precision:  0.676206050695
Recall:  0.54950166113


array([[506, 396],
       [678, 827]])

Try word2vec, meaning on the document level;
can combine text features with non-text features


## NLP: 
text to number;

### BAG-OF-WORDS approach:
TF-IDF: scale the numbers;
you can hard code some key words as features, otherwise IF-IDF may decrease performance.

**Topic modeling**: only at the document level
Each document is a distribution over topics; each topic is a distribution over terms/words.
Matrix: rows are documents, columns are topics, row sum up to 1.
Topics: matirx as well. But you need to assign topic label to train the model. Need to set the number of topics. 
Basic model: LDA (latent Dirichlet)

### WORD2VEC (word embedding, there is also GLOVE)
Gensin is the library to use!
spxy (harder to learn)
NLTK (not great but easy to use)

continuous bag-of-words, or the other method to predict the word

there are pretrained models on Google news or wikipedia

Can do manual inspection to check the models

features: numbers representing the word/doc meaning 

to documents: take sum or mean as the features

**Doc2vec**: document level, or summarize the key words and do word2vec

shuffle the order of sentenses and train the model again, if corpus is very small. 

can pick up spelling problems.

Gensin takes list of lists

PROCESSES:
smaller corpus requires more preprocessing;
1. split sentenses/words
2. stemming (large documents do not care much)
3. stop words
4. punctuation: ', ", 
5. numbers (no need to remove does not need to touch it)
6. lowercase everything

other features to consider: how long the patent is. it is easier to interpret. 

tSNE is preferred over PCA on text analysis



#### try balanced testing set

In [43]:
### subsampling the entire data
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_data == 0)

#random shuffle the rows
n = word_counts.shape[0]
perm = range(n)
np.random.shuffle(perm)

x_data = word_counts[perm]
y_data = y_data[perm]

# separate the two classes
x_useful = x_data[y_data == 1, :]
x_not_useful = x_data[y_data == 0, :]
y_useful = y_data[y_data == 1]
y_not_useful = y_data[y_data == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size]
y_useful = y_useful[:num_size]

# combine the two classes
x_data_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_data_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_data_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_data_sub = x_data_sub[perm2]
y_data_sub = y_data_sub[perm2]

# check the size
print x_data_sub.shape
print y_data_sub.shape

(8940, 1023)
(8940,)


In [44]:
# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data_sub, y_data_sub, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (7152, 1023)
x_test:  (1788, 1023)
y_train:  (7152,)
y_test:  (1788,)


In [None]:
### tune random forest

model = RandomForestClassifier(n_estimators = 100)

# tune max_features
param_space = np.arange(2, 15, 2)

grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the data
# grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train_sub)
grid_model = grid_model.fit(x_train, y_train)

In [45]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.509927293065
Best parameter:  {'max_features': 6}


In [42]:
# check confusion matrix
best_rf = grid_model.best_estimator_
best_rf.fit(x_train, y_train)
y_pred = best_rf.predict(x_test)

# accuracy
print "Test accuracy: ", np.mean(y_pred == y_test)
print "F1 score: ", metrics.f1_score(y_test, y_pred)
print "Precision: ", metrics.precision_score(y_test, y_pred)
print "Recall: ", metrics.recall_score(y_test, y_pred)
metrics.confusion_matrix(y_test, y_pred)

Test accuracy:  0.507829977629
F1 score:  0.499431171786
Precision:  0.513450292398
Recall:  0.486157253599


array([[469, 416],
       [464, 439]])