<h2> Build Corpus </h2>

Initial data preperation stages. 

Inputs:

* A set of selected patents

Outputs:

* A corpus of patent documents saved as lists of lists.

In [4]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
#the stemmer requires a language parameter
snow_stemmer = SnowballStemmer(language='english')
import string
#from nltk.corpus import stopwords
#stop_words = stopwords.words('english')
#stop_words.extend(["from", "subject", "re", "edu", "use", "one", "first", "least", "configured",
#                   "may", "can","system", "includes", "methods", "device", "second",
#                   "plurality", "data", "method", "assembly", "describe", "process"
#                   "surface", "fig", "diagram", "drawing", "drawings" ,"disclosure ",
#                   "embodiments", "domain", "bsum", "par", "invention", "object",
#                  "claim", "application", "present", "regarding", "comprise", "comprising", "provide", "providing"])
import re
import gzip 
import json
from tqdm import tqdm
from gensim import corpora

In [2]:
inputs_path = "C:/Users/Hannes/Documents/Joe/TeamsAndText/Inputs"
intermediate_outputs_path = "C:/Users/Hannes/Documents/Joe/TeamsAndText/IntermediateOutputs"

In [3]:
selected_pats = pickle.load(open(f"{intermediate_outputs_path}/selected_patents.p", "rb"))
len(selected_pats)

876882

In [4]:
inventor_data = pd.read_csv(f"{inputs_path}/patent_inventor.tsv", sep="\t", 
                           dtype={"patent_id":str, "inventor_id":str},
                            usecols=["patent_id","inventor_id"])

inventor_data = inventor_data.drop_duplicates()
inventor_data.rename(columns={"patent_id":"patent_number"}, inplace=True)
inventor_data = inventor_data[inventor_data["patent_number"].isin(selected_pats)]
inventor_data.head()

Unnamed: 0,patent_number,inventor_id
3858083,3982318,fl:al_ln:holmstrand-1
3858084,3982318,fl:do_ln:hennenfent-1
3952818,4038107,fl:ge_ln:marr-1
3952819,4038107,fl:ge_ln:smith-83
3961040,4042864,fl:me_ln:norris-3


In [5]:
type(inventor_data.iloc[0]["patent_number"])

str

In [6]:
descriptions = pd.read_csv(f"{intermediate_outputs_path}/descriptions.csv",
                           usecols=["patent_id", "summary_text"], dtype={"patent_id":str, "summary_text":str})

descriptions.rename(columns={"patent_id":"patent_number"}, inplace=True)
descriptions.head()

Unnamed: 0,patent_number,summary_text
0,5132174,TECHNICAL FIELD \nThis invention relates to fu...
1,5169843,BACKGROUND OF THE INVENTION \n1. Field of Inve...
2,5129409,BACKGROUND OF THE INVENTION \nThe present inve...
3,5128346,TECHNICAL FIELD \nThis is a continuation-in-pa...
4,5101439,FIELD OF THE INVENTION \nThis invention relate...


In [7]:
print(len(descriptions))
descriptions.dropna(inplace=True)
print(len(descriptions))

876581
876580


In [8]:
descriptions = descriptions[descriptions["patent_number"].isin(set(inventor_data["patent_number"]))]
inventor_data = inventor_data[inventor_data["patent_number"].isin(set(descriptions["patent_number"]))]

In [9]:
print("Number of inventors ", len(np.unique(inventor_data["inventor_id"])))
print("Number of patents ", len(np.unique(descriptions["patent_number"])))

Number of inventors  549961
Number of patents  876535


In [10]:
inventor_data["unique_pat"] = inventor_data.groupby("patent_number").ngroup()
inventor_data.sort_values("patent_number", inplace=True)
inventor_data.head()

Unnamed: 0,patent_number,inventor_id,unique_pat
5894705,5077839,fl:st_ln:keller-13,0
5894747,5077870,fl:da_ln:bychinski-1,1
5894748,5077870,fl:le_ln:wood-3,1
5894749,5077870,fl:ma_ln:lindseth-1,1
5894750,5077870,fl:su_ln:nestegard-1,1


In [11]:
unique_pat_dict = inventor_data[["patent_number", "unique_pat"]].drop_duplicates()
unique_pat_dict = dict(zip(inventor_data["unique_pat"], inventor_data["patent_number"]))
unique_pat_dict

{0: '5077839',
 1: '5077870',
 2: '5077875',
 3: '5077878',
 4: '5077893',
 5: '5077898',
 6: '5077913',
 7: '5077918',
 8: '5077954',
 9: '5077955',
 10: '5077967',
 11: '5077968',
 12: '5077969',
 13: '5077970',
 14: '5077972',
 15: '5077978',
 16: '5077979',
 17: '5077981',
 18: '5077982',
 19: '5077985',
 20: '5078001',
 21: '5078007',
 22: '5078010',
 23: '5078016',
 24: '5078031',
 25: '5078033',
 26: '5078034',
 27: '5078051',
 28: '5078054',
 29: '5078057',
 30: '5078069',
 31: '5078070',
 32: '5078073',
 33: '5078084',
 34: '5078086',
 35: '5078087',
 36: '5078092',
 37: '5078101',
 38: '5078104',
 39: '5078111',
 40: '5078115',
 41: '5078130',
 42: '5078132',
 43: '5078136',
 44: '5078138',
 45: '5078139',
 46: '5078142',
 47: '5078158',
 48: '5078161',
 49: '5078162',
 50: '5078168',
 51: '5078173',
 52: '5078185',
 53: '5078209',
 54: '5078210',
 55: '5078242',
 56: '5078263',
 57: '5078272',
 58: '5078273',
 59: '5078274',
 60: '5078285',
 61: '5078288',
 62: '5078289',
 6

In [12]:
pickle.dump(unique_pat_dict, open(f"{intermediate_outputs_path}/unique_pat_dict.p", "wb"))

In [13]:
inv2doc = inventor_data.groupby('inventor_id')['unique_pat'].agg(list)
inv2doc = dict(inv2doc)
pickle.dump(inv2doc, open(f"{intermediate_outputs_path}/inv2doc.p", "wb"))
len(inv2doc) 

549961

In [14]:
doc2inv = inventor_data.groupby('unique_pat')['inventor_id'].agg(list)
doc2inv = dict(doc2inv)
pickle.dump(doc2inv, open(f"{intermediate_outputs_path}/doc2inv.p", "wb"))
len(doc2inv)

876535

In [15]:
def reflection_tokenizer(text):
    '''expects a string an returns a list of lemmatized tokens
        and removes the stop words. Tokens are lower cased and
        non- alphanumeric characters as well as numbers removed.
    '''
    text = text.lower()
    # tokenise words
    tokens = [word for word in word_tokenize(text)]
    #tokens = [s for s in tokens if s not in stop_words]
    # remove punctuation
    translator = str.maketrans(string.punctuation, " " * len(string.punctuation))
    tokens = [word.translate(translator) for word in tokens]
    # remove numbers
    tokens = [word for word in tokens if not re.search(r'\d', word)]
    # stem tokens
    tokens = [snow_stemmer.stem(t) for t in tokens]
    tokens = [s for s in tokens if len(s) >= 3]
    #tokens = [s for s in tokens if s not in patent_stop_words]
    tokens = [s.strip() for s in tokens]
    return tokens


descriptions = [reflection_tokenizer(doc) for doc in tqdm(descriptions["summary_text"])]

100%|████████████████████████████████████████████████████████████████████████| 876535/876535 [5:27:59<00:00, 44.54it/s]


In [16]:
dictionary = corpora.Dictionary()
dictionary.add_documents(descriptions)
dictionary.filter_extremes(no_below = 5, no_above=0.80, keep_n=None)
print(len(dictionary))
dictionary

332780


<gensim.corpora.dictionary.Dictionary at 0x1e5c2735a60>

In [17]:
corpus = [dictionary.doc2bow(doc) for doc in descriptions] 

In [19]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 4),
 (5, 1),
 (6, 4),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 1),
 (12, 2),
 (13, 1),
 (14, 2),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 2),
 (19, 1),
 (20, 2),
 (21, 2),
 (22, 3),
 (23, 2),
 (24, 1),
 (25, 2),
 (26, 1),
 (27, 27),
 (28, 3),
 (29, 10),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 2),
 (40, 10),
 (41, 1),
 (42, 1),
 (43, 3),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 2),
 (48, 2),
 (49, 1),
 (50, 1),
 (51, 5),
 (52, 5),
 (53, 2),
 (54, 1),
 (55, 5),
 (56, 1),
 (57, 1),
 (58, 20),
 (59, 2),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 7),
 (69, 1),
 (70, 1),
 (71, 6),
 (72, 7),
 (73, 6),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 3),
 (82, 13),
 (83, 1),
 (84, 2),
 (85, 2),
 (86, 1),
 (87, 2),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 2),
 (93, 1),
 (94, 2),
 (95, 1),
 (96, 1),
 (97, 2),
 (98, 1),
 (99, 3),
 (100

In [20]:
def save_json_gz(obj, filepath):

    json_str = json.dumps(obj)
    json_bytes = json_str.encode()
    with gzip.GzipFile(filepath, mode="w") as f:
        f.write(json_bytes)
        
save_json_gz(corpus, f"{intermediate_outputs_path}/corpus.json.gz")

In [21]:
pickle.dump(dictionary, open(f"{intermediate_outputs_path}/id2word.p", "wb"))