### Patent Corpus Text Cleaning

* Patents are a special kind of textual data that contains plenty of technical terms, specific words serving as transition phrases, and numerous academic words that describe invention outcomes. Before modeling topics with LDA, we also apply three modules that remove general words from the patent corpus, as follows:
    * Stop words such as the, that, and these;
    * High-frequency words in patent claims such as claimed, comprising, and invention;
    * General academic words such as research, approach, and data.

**Among all the terms, technical terms provide the most meaningful information, because they reflect technological topics and innovations.** 



In [None]:
import os
import pandas as pd 
# Make the project data folder the working directory; the relative file names
# used by the following cells (e.g. 'patdocs_clean.pkl') are resolved against it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
os.chdir('/Users/sheeroh/Box Sync/2_projects/insightDSNYC/data/')
# NOTE(review): presumably a project-local module located via the directory set
# above; `loadData` is not called in the visible cells - confirm it is still needed.
from loadData_workflow import loadData

In [None]:
# Load the previously saved patent documents from a pickle file
# (path is relative to the current working directory).
# NOTE(review): unpickling can execute arbitrary code - only load files from a
# trusted source.
patdocs = pd.read_pickle( 'patdocs_clean.pkl')

# Uncomment to check that the data has been loaded successfully
#patdocs.head(2)

###  Text Cleaning 

In [None]:
from nltk.corpus import stopwords # Import the stop word list
import nltk
import re

# Patent-specific boilerplate and general academic words that are removed in
# addition to NLTK's English stop words.  Entries must be lower-case because
# tokens are lower-cased before they are compared against this list (the
# original upper-case 'A' entry could therefore never match).
_EXTRA_STOP_WORDS = (
    "first", "second", "one", "two", "also", "may", "least", "present", "determine",
    "included", "includes", "include", "provided", "provides", "wherein", "method", "methods",
    "comprises", "comprised", "comprising", "used", "uses", "using", "use",
    "say", "says", "said", "disclose", "discloses", "disclosed",
    "containing", "contain", "contains", "contained", "make", "made", "makes",
    "end", "couple", "relates", "invention", "including",
    "b", "c", "d", "new", "described", "gift", "a", "research", "group",
    "according", "approach", "data", "system", "x", "claimed", "claim", "therein",
)

# Anything that is not an ASCII letter is treated as a token separator.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")

# Combined stop-word set; built on first use so the NLTK corpus is read once
# instead of once per token.
_stop_word_cache = None


def _get_stop_words():
    """Return the combined NLTK + patent-specific stop-word set (built once)."""
    global _stop_word_cache
    if _stop_word_cache is None:
        from nltk.corpus import stopwords  # Import the stop word list
        _stop_word_cache = frozenset(stopwords.words("english")).union(_EXTRA_STOP_WORDS)
    return _stop_word_cache


def patent_to_words(raw_review):
    """Clean one patent text and return its meaningful words as a single string.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, and every token that is an English stop word or one
    of the extra patent/academic words is dropped.

    Parameters
    ----------
    raw_review : str
        Raw text of a patent (e.g. its abstract).

    Returns
    -------
    str
        The remaining words, in their original order, joined by single spaces.
        Empty string if nothing remains.

    Raises
    ------
    TypeError
        If ``raw_review`` is not a string (raised by the regex substitution).
    """
    letters_only = _NON_LETTERS.sub(" ", raw_review)
    words = letters_only.lower().split()
    # A single pass is enough: the combined set already contains every NLTK
    # stop word, so filtering against the NLTK list first is redundant.
    stops = _get_stop_words()
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

In [None]:
# Sanity check: clean the abstract stored under index label 0 and display the result.
patent_to_words(patdocs["abstract"][0])

### Clean and Tokenize patents into lists (each patent is a words array)
The cleaned abstracts (`clean_abstract`) are saved to a file so that the workflow can be resumed from this point later on.

In [None]:
import numpy as np
import json

# Number of patents, i.e. the number of entries in the abstract column
num_patents = patdocs["abstract"].size
print(num_patents)

# One list of cleaned tokens per patent, in DataFrame row order
clean_abstract = []

# Iterate over the column values directly instead of looking them up with
# patdocs["abstract"][i]: that form is a *label* lookup, which raises KeyError
# or picks the wrong row whenever the DataFrame index is not exactly 0..n-1
# (for example after rows were filtered out).
for i, abstract in enumerate(patdocs["abstract"]):
    tokens = patent_to_words(abstract).split()
    if i % 10000 == 0:
        print(i)  # coarse progress indicator
    clean_abstract.append(tokens)

# Persist the token lists as JSON so later steps can resume from here
with open('clean_abstract.txt', 'w') as outfile:
    json.dump(clean_abstract, outfile)
    

Next, using gensim's Phrases function, we create additional bigrams to include in the topic modeling. We used [12] as a reference to create the bigrams.

In [None]:
from gensim.models.phrases import Phrases
import gensim
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary
# Train gensim's Phrases model on the cleaned token lists to detect bigrams.
# NOTE(review): a bytes delimiter matches the gensim 3.x API; gensim 4.x
# appears to expect a str ('_') - confirm against the installed version.
bigram = models.Phrases(clean_abstract, delimiter=b'_')
print(bigram)

# Apply the trained phrase model to every cleaned abstract, keeping row order
final_abstract = [bigram[tokens] for tokens in clean_abstract]
final_column = pd.Series(final_abstract)
   
#list(bigram[final_abstract])
#print(bigram[sent])
#len(bigram.vocab)

In [None]:
# Wrap the per-patent token lists in a Series so they can be attached to the DataFrame
final_column = pd.Series(final_abstract)

#print(final_column)
# Assign through `.values` so rows are matched by position, not by index label
patdocs['final_column']= final_column.values
# NOTE(review): not the last expression of the cell, so a notebook will
# presumably not display this preview - confirm whether it is still wanted.
patdocs.head(2)
# Save the augmented DataFrame (path is relative to the current working directory)
patdocs.to_csv('patdocs_final_column.csv')

### Convert tokenized document to dictionary and document-term matrix

In [None]:
# From here on, files are written into the "model" sub-directory of the
# current working directory (the relative chdir fails if the cell is re-run
# from inside that directory).
os.chdir("model")

# Build the id <-> term mapping over all tokenized documents and store it
dictionary = Dictionary(final_abstract)
dictionary.save('abstract_new.dict')

# Encode every document as a bag-of-words vector and store the resulting
# document-term matrix in Matrix Market format
corpus = list(map(dictionary.doc2bow, final_abstract))
corpora.MmCorpus.serialize('abstract_new.mm', corpus)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

#print(corpus)
#mm_corpus = gensim.corpora.MmCorpus('abstract.mm')
#print(mm_corpus)

In [None]:
# Number of documents left after setting 1000 aside; the same expression is
# used as the training-sample size in the split below.
len(final_abstract)-1000

### Split the data into training and test sets

In [None]:
import random
import pickle
random.seed(7)  # fixed seed so the split is reproducible (42 is not always the answer)

# Hold out 1000 documents for testing; all remaining ones are used for training.
# random.sample raises ValueError if there are fewer than 1000 documents.
n_docs = len(final_abstract)
train_set = random.sample(range(n_docs), n_docs - 1000)

# Membership tests against a set keep the complement linear in the number of
# documents; testing against the list itself is quadratic.
train_lookup = set(train_set)
test_set = [idx for idx in range(n_docs) if idx not in train_lookup]

train_texts = [final_abstract[idx] for idx in train_set]
test_texts = [final_abstract[idx] for idx in test_set]

# Store indices and texts together; the context manager guarantees the file
# is flushed and closed once the dump has finished.
with open('./abstract_train_test_sets_new.pkl', 'wb') as split_file:
    pickle.dump([train_set, test_set, train_texts, test_texts], split_file)
