In [1]:
# standard library
import os
import pickle
import string
from collections import Counter

import numpy as np
import pandas as pd
# SQL related packages
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
# sklearn packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics, preprocessing
from sklearn.linear_model import LogisticRegression as Log
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, GridSearchCV
# text analysis packages
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# access to sql database
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook; the fallbacks
# are the original local development values.
dbname = os.environ.get('PATENT_DB_NAME', 'patent_db')
username = os.environ.get('PATENT_DB_USER', 'jy')
pswd = os.environ.get('PATENT_DB_PASSWORD', 'jy')

# SQLAlchemy engine pointing at the same local PostgreSQL database
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))

# reading from sql database
# connect: raw psycopg2 connection, used by the pd.read_sql_query calls below
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

#### Extract abstracts first

In [3]:
# read data from 2004-2007 (one `patents_<year>` table per year)
years = np.arange(2004, 2008)

# Collect one result frame per year and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every row
# loaded so far on each iteration.
yearly_frames = []

# import the abstract from each table
for year in years:
    # query:
    # a table name cannot be passed as a bound parameter, so it is
    # interpolated; `year` only ever comes from the fixed range above
    sql_query = """
    SELECT abstract, id, payment_times
        FROM patents_%s;
    """ %str(year)

    results = pd.read_sql_query(sql_query,con)
    yearly_frames.append(results)

# dataframe to store the results
# (the index is not reset, as before; later cells only read .values and .shape)
abstracts = pd.concat(yearly_frames, axis = 0)

# check size of the data
abstracts.shape

(12033, 3)

In [4]:
# extract the response variable
# reformat the response variable into binary
y_data = np.zeros(abstracts.shape[0])
y_data[abstracts['payment_times'].values >= 2] = 1

print "Percentage of patents with > 1 maintenance fee payments: ", np.mean(y_data)

Percentage of patents with > 1 maintenance fee payments:  0.628521565694


#### Splitting train-test data first
Perform tokenization and other preprocessing on the training data alone

In [6]:
x_data = abstracts['abstract'].values

# split train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, 
                                                    test_size = 0.2, 
                                                    random_state = 123)

print "Dataset dimensions:"
print "x_train: ", x_train.shape
print "x_test: ", x_test.shape
print "y_train: ", y_train.shape
print "y_test: ", y_test.shape

Dataset dimensions:
x_train:  (9626,)
x_test:  (2407,)
y_train:  (9626,)
y_test:  (2407,)


#### Tokenization and cleaning

In [7]:
def _is_numeric_token(token):
    """Return True when `token` is a numeric literal such as '12' or '3.5e4'."""
    # float() also accepts the words 'nan', 'inf' and 'infinity' (optionally
    # signed); those are ordinary vocabulary, not values to throw away
    if token.lower().lstrip('+-') in ('nan', 'inf', 'infinity'):
        return False
    try:
        float(token)
    except ValueError:
        return False
    return True


def tokenize_cleaning(text):
    """Tokenize one abstract and return its cleaned tokens.

    Steps, in order: tokenize, lowercase, drop English stop words and
    punctuation tokens, lemmatize, drop numeric literals.

    Parameters
    ----------
    text : byte string, unicode string or None
        The abstract. Byte strings are decoded as UTF-8; already-decoded
        text is used as is; None is treated as an empty abstract.

    Returns
    -------
    list
        The remaining tokens, in their original order.
    """
    # a missing abstract contributes no tokens
    if text is None:
        return []

    # only raw bytes need decoding; calling .decode on text that is already
    # unicode fails on non-ASCII characters under Python 2
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # tokenize the text first, then lowercase all the words
    tokens = [w.lower() for w in word_tokenize(text)]

    # clean up stop words and punctuations
    # (a set gives constant-time membership tests instead of scanning a list per token)
    stop_set = set(stopwords.words('english')) | set(string.punctuation)
    tokens_no_stop = [token for token in tokens if token not in stop_set]

    # use lemma rather than stem
    # reason: remove the influence of plural or tense
    # but retain the subtle difference in legal writing
    lemmatizer = WordNetLemmatizer()
    tokens_lemma = [lemmatizer.lemmatize(token) for token in tokens_no_stop]

    # remove numbers (the actual values are not useful)
    return [token for token in tokens_lemma if not _is_numeric_token(token)]

In [9]:
# clean the training and testing abstracts separately; each abstract is
# turned back into one space-separated string of its kept tokens
cleaned_train = [' '.join(tokenize_cleaning(doc)) for doc in x_train]
cleaned_test = [' '.join(tokenize_cleaning(doc)) for doc in x_test]

In [10]:
# convert to bag-of-words
# min number selected by examining the low-frequency words
vectorizer = CountVectorizer(max_df = 0.5, min_df=2)

# perform a count-based vectorization of the document
word_vec = vectorizer.fit(cleaned_train)
word_counts_train = word_vec.transform(cleaned_train)
word_counts_test = word_vec.transform(cleaned_test)

# convert to array
word_counts_train = word_counts_train.toarray()
word_counts_test = word_counts_test.toarray()

print "Training counts dimension: ", word_counts_train.shape
print "Testing counts dimension: ", word_counts_test.shape

Training counts dimension:  (9626, 13432)
Testing counts dimension:  (2407, 13432)


In [11]:
# still need to remove some number words 
# due to how CountVectorizer treats '-' and '/'

# remove any word with numbers in it
words = word_vec.get_feature_names()
num_word_index = np.zeros(len(words))

for i in range(len(words)):
    word = words[i]
    for j in range(len(word)):
        try:
            float(word[j])
            num_word_index[i] = 1
            break
        except:
            continue
        
print "Number of numerical words detected: ", int(np.sum(num_word_index))

# remove the number words
words_no_num = np.asarray(words)[num_word_index == 0]
word_counts_train = word_counts_train[:, num_word_index == 0]
word_counts_test = word_counts_test[:, num_word_index == 0]

print "Training counts dimension: ", word_counts_train.shape
print "Testing counts dimension: ", word_counts_test.shape

Number of numerical words detected:  358
Training counts dimension:  (9626, 13074)
Testing counts dimension:  (2407, 13074)


In [10]:
# # check the distribution of word occurance
# total_counts = np.sum(word_counts, axis = 0)
# #plt.hist(np.transpose(total_counts))

In [12]:
# # inspect the words
# df = pd.DataFrame({
#     'word': words_no_num,
#     'count': total_counts
# })

# df.sort('count')

#### Try TF-IDF

In [27]:
# apply IF-IDF
tf_vectorizer = TfidfVectorizer()
tf_vec = tf_vectorizer.fit(cleaned_train)
tfidf_train = tf_vec.transform(cleaned_train)
tfidf_test = tf_vec.transform(cleaned_test)
# convert to array
tfidf_train = tfidf_train.toarray()
tfidf_test = tfidf_test.toarray()

print tfidf_train.shape

(9626, 24551)


In [28]:
### subsampling the training data
# sample the same number of'useful' patents as the 'not useful' patents
# size of each class
num_size = np.sum(y_train == 0)

#random shuffle the rows
n = tfidf_train.shape[0]
perm = range(n)
np.random.shuffle(perm)

tfidf_train = tfidf_train[perm]
y_train = y_train[perm]

# separate the two classes
x_useful = tfidf_train[y_train == 1, :]
x_not_useful = tfidf_train[y_train == 0, :]
y_useful = y_train[y_train == 1]
y_not_useful = y_train[y_train == 0]

# sample num_size from the 'useful' class
x_useful = x_useful[:num_size, :]
y_useful = y_useful[:num_size]

# combine the two classes
x_train_sub = np.concatenate((x_useful, x_not_useful), axis = 0)
y_train_sub = np.concatenate((y_useful, y_not_useful), axis = 0)

# shuffle again
# shuffle the combined data
n2 = x_train_sub.shape[0]
perm2 = range(n2)
np.random.shuffle(perm2)

x_train_sub = x_train_sub[perm2]
y_train_sub = y_train_sub[perm2]

# check the size
print x_train_sub.shape
print y_train_sub.shape

(7136, 24551)
(7136,)


In [13]:
# # standardize the predictors
# scaler = preprocessing.StandardScaler()

# x_train_std = scaler.fit_transform(x_train_sub)
# x_test_std = scaler.transform(word_counts_test)



In [29]:
# try Naive Bayes with Gaussian Distribution
# no need to normalize for it
gnb = GaussianNB()

# fit on the training data
gnb.fit(x_train_sub, y_train_sub)

# predict on the test data
y_pred = gnb.predict(tfidf_test)

# accuracy
print "Testing accuracy: ", np.mean(y_pred == y_test)
print "Confustion matrix:"
metrics.confusion_matrix(y_test, y_pred)

Testing accuracy:  0.504777731616
Confustion matrix:


array([[475, 427],
       [765, 740]])

### PCA

In [30]:
# try dimensionality reduction using PCA
# no n_components is passed here; how many components to keep is decided
# afterwards from the cumulative explained variance
pca = PCA()

# the projection is fitted on the subsampled training features only and
# then applied, unchanged, to the test features
x_train_pca = pca.fit_transform(x_train_sub)
x_test_pca = pca.transform(tfidf_test)

In [31]:
# find the cum-variance explained at each PC
total_var = np.cumsum(pca.explained_variance_ratio_)
n_pc = np.where((total_var > 0.9) == True)[0][0]

print "The number of PCs that can explain > 90% variability: ", n_pc

The number of PCs that can explain > 90% variability:  3439


In [32]:
### use logistic regression

# base estimator; its regularisation parameter C is tuned below
model = Log()
# candidate values for C: nine log-spaced values from 1e-4 to 1e4
c =  np.logspace(-4, 4, 9)

# 5-fold cross-validated grid search scored on accuracy
search_space = {'C': c}
grid_model = GridSearchCV(model, param_grid = search_space, scoring = 'accuracy', cv = 5)

# fit on the first n_pc principal components of the subsampled training data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train_sub)

In [33]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.497197309417
Best parameter:  {'C': 1000.0}


In [34]:
# check confusion matrix
best_log = grid_model.best_estimator_
best_log.fit(x_train_pca[:, :n_pc], y_train_sub)
y_pred = best_log.predict(x_test_pca[:, :n_pc])

# accuracy
print "Test accuracy: ", np.mean(y_pred == y_test)
print "F1 score: ", metrics.f1_score(y_test, y_pred)
print "Precision: ", metrics.precision_score(y_test, y_pred)
print "Recall: ", metrics.recall_score(y_test, y_pred)
metrics.confusion_matrix(y_test, y_pred)

Test accuracy:  0.479434981305
F1 score:  0.534373838722
Precision:  0.606239460371
Recall:  0.477740863787


array([[435, 467],
       [786, 719]])

In [35]:
### tune random forest

# fixed seed: without it every run grows a different forest, so the
# cross-validation scores and the selected max_features change between runs
model = RandomForestClassifier(n_estimators = 100, random_state = 123)

# tune max_features: 10, 20, ..., 70 candidate features per split
param_space = np.arange(10, 80, 10)

# 5-fold cross-validated grid search scored on accuracy, run on 4 workers
grid_model = GridSearchCV(model, n_jobs = 4, 
                          param_grid = {'max_features': param_space}, 
                          cv  = 5, scoring = 'accuracy')
# fit on the first n_pc principal components of the subsampled training data
grid_model = grid_model.fit(x_train_pca[:, :n_pc], y_train_sub)
# alternative kept for reference: fit on the TF-IDF features without PCA
#grid_model = grid_model.fit(x_train_sub, y_train_sub)

In [36]:
# check results
print "Best accuracy:", grid_model.best_score_
print "Best parameter: ", grid_model.best_params_

Best accuracy: 0.502802690583
Best parameter:  {'max_features': 60}


In [37]:
# check confusion matrix
best_rf = grid_model.best_estimator_
best_rf.fit(x_train_pca[:, :n_pc], y_train_sub)
y_pred = best_rf.predict(x_test_pca[:, :n_pc])

# accuracy
print "Test accuracy: ", np.mean(y_pred == y_test)
print "F1 score: ", metrics.f1_score(y_test, y_pred)
print "Precision: ", metrics.precision_score(y_test, y_pred)
print "Recall: ", metrics.recall_score(y_test, y_pred)
metrics.confusion_matrix(y_test, y_pred)

Test accuracy:  0.510178645617
F1 score:  0.559252336449
Precision:  0.639316239316
Recall:  0.497009966777


array([[480, 422],
       [757, 748]])

Try word2vec, meaning on the document level;
can combine text features with non-text features


## NLP: 
text to number;

### BAG-OF-WORDS approach:
TF-IDF: scale the numbers;
you can hard code some key words as features, otherwise TF-IDF may decrease performance.

**Topic modeling**: only at the document level
Each document is a distribution over topics; each topic is a distribution over terms/words.
Matrix: rows are documents, columns are topics, each row sums to 1.
Topics: a matrix as well. But you need to assign topic labels to train the model. Need to set the number of topics. 
Basic model: LDA (latent Dirichlet allocation)

### WORD2VEC (word embedding, there is also GLOVE)
Gensim is the library to use!
spaCy (harder to learn)
NLTK (not great but easy to use)

continuous bag-of-words, or the other method to predict the word

there are pretrained models on Google news or wikipedia

Can do manual inspection to check the models

features: numbers representing the word/doc meaning 

to documents: take sum or mean as the features

**Doc2vec**: document level, or summarize the key words and do word2vec

shuffle the order of sentences and train the model again, if the corpus is very small. 

can pick up spelling problems.

Gensim takes a list of lists

PROCESSES:
smaller corpus requires more preprocessing;
1. split sentences/words
2. stemming (large documents do not care much)
3. stop words
4. punctuation: ', ", 
5. numbers (no need to remove them; they can be left untouched)
6. lowercase everything

other features to consider: how long the patent is. it is easier to interpret. 

t-SNE is preferred over PCA for text analysis

