# Data preprocessing

In [1]:
import ast
import json

import numpy as np
import pandas as pd

# Relative paths of the zipped CSV files read by the loading cells below
# (each one is passed to pd.read_csv with compression="zip").
claim_path = "../data/out_claim.zip"        # source of the "claims" column
title_path = "../data/out_title.zip"        # source of the "title" column
description_path = "../data/out_descr.zip"  # source of the "description" column

Load all the data into the dataframe `df` by joining the separate dataframes (claims, titles, descriptions) into a single dataframe indexed by patent number.

In [2]:
# Load the claims file and keep only what the rest of the notebook uses:
# the patent number (as index), the date and the claim text.
# "Date" is deliberately NOT dropped here, so `df` carries the date column;
# the title and description frames drop it before being joined.
# ("Language" was listed twice in the original drop list; once is enough.)
df = (
    pd.read_csv(claim_path, compression="zip")
    .drop(columns=["Unnamed: 0", "Language", "PatenType", "PublicationType", "Part", "Number"])
    .rename(columns={"Contents": "claims"})
    .rename(str.lower, axis="columns")   # lower-case every remaining column name
    .set_index("patentnumber")
)

In [3]:
# Load the titles file, keeping only the patent number (as index) and the
# title text. ("Language" was listed twice in the original drop list.)
title = (
    pd.read_csv(title_path, compression="zip")
    .drop(columns=["Unnamed: 0", "Language", "PatenType", "PublicationType", "Part", "Number", "Date"])
    .rename(columns={"Contents": "title"})
    .rename(str.lower, axis="columns")   # lower-case every remaining column name
    .set_index("patentnumber")
)

# Index-aligned left join onto the claims frame. A "title" column left over
# from a previous execution of this cell is dropped first; otherwise the join
# raises on overlapping column names when the cell is re-run.
df = df.drop(columns=["title"], errors="ignore").join(title)
del title  # release the intermediate frame

In [4]:
# Load the descriptions file, keeping only the patent number (as index) and
# the description text. ("Language" was listed twice in the original drop list.)
desc = (
    pd.read_csv(description_path, compression="zip")
    .drop(columns=["Unnamed: 0", "Language", "PatenType", "PublicationType", "Part", "Number", "Date"])
    .rename(columns={"Contents": "description"})
    .rename(str.lower, axis="columns")   # lower-case every remaining column name
    .set_index("patentnumber")
)

# Index-aligned left join onto `df`. A "description" column left over from a
# previous execution of this cell is dropped first; otherwise the join raises
# on overlapping column names when the cell is re-run.
df = df.drop(columns=["description"], errors="ignore").join(desc)
del desc  # release the intermediate frame

In [5]:
# Preview the first rows of the joined frame (date, claims, title, description).
df.head()

Unnamed: 0_level_0,date,claims,title,description
patentnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000006,2018-02-28,['A complementary metal oxide semiconductor vo...,"ALL-CMOS, LOW-VOLTAGE, WIDE-TEMPERATURE RANGE,...","{'BACKGROUND OF THE INVENTION': [], 'FIELD OF ..."
3000007,2020-07-08,['A method for configuring a user interface of...,SYSTEM AND METHOD FOR OPTIMIZED APPLIANCE CONTROL,"{'BACKGROUND': ['Controlling devices, for exam..."
3000011,2017-05-03,['A method (400) of positioning one or more vi...,BODY-LOCKED PLACEMENT OF AUGMENTED REALITY OBJ...,{'BACKGROUND': ['An augmented reality computin...
3000012,2019-05-01,['A method of displaying a schedule in a weara...,METHOD AND APPARATUS FOR DISPLAYING SCHEDULE O...,{'Technical Field': ['The present disclosure r...
3000013,2020-05-06,['A remote controller adapted to interact with...,INTERACTIVE MULTI-TOUCH REMOTE CONTROL,{'BRIEF DESCRIPTION OF THE SEVERAL VIEWS OF TH...


In [6]:
# (rows, columns) of the joined frame, before removing incomplete rows.
df.shape

(43182, 4)

In [7]:
# Discard every row that has a missing value in any column, then show the
# resulting shape so it can be compared with the one displayed just before.
df = df.dropna()
df.shape

(43182, 4)

We have quite a lot of documents to process, and each document contains a lot of text. We will therefore process the descriptions in batches, so that any machine can handle the data regardless of how much memory it has installed.

## Summary extraction

Let's try to extract summaries from all documents first.

In [None]:
def _extract_summary_sections(raw_description):
    """
    Parses the string form of a description dict and returns the values of
    every section whose heading contains 'summary' (case-insensitive).

    The string is parsed once per row with ast.literal_eval, which accepts only
    Python literals; the previous code called eval() on the data, and did so
    once for the key scan plus once more for every matching heading.
    """
    sections = ast.literal_eval(raw_description)
    return [body for heading, body in sections.items() if 'summary' in heading.lower()]

# astype(object).apply(...) is the replacement pandas documents for the
# deprecated `convert_dtype=False` argument of Series.apply.
df["summaries"] = df["description"].astype(object).apply(_extract_summary_sections)

In [None]:
# Keep only the documents for which exactly one summary section was found.
# .copy() makes `df` an independent frame, so the column assignments in the
# following cells do not write to a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['summaries'].apply(len) == 1].copy()

In [None]:
# Each row holds a one-element list whose single item is itself a sequence of
# text pieces; merge those pieces into one space-separated string.
df['summaries'] = df['summaries'].map(lambda sections: ' '.join(sections[0]))

In [None]:
def _first_summary_heading(raw_description):
    """
    Parses the string form of a description dict and returns, lower-cased, the
    first heading that contains 'summary' (case-insensitive).

    Raises IndexError when no heading matches, as the original expression did.
    Uses ast.literal_eval instead of eval() so the data is never executed as code.
    """
    sections = ast.literal_eval(raw_description)
    matching = [heading.lower() for heading in sections if 'summary' in heading.lower()]
    return matching[0]

# astype(object).apply(...) is the replacement pandas documents for the
# deprecated `convert_dtype=False` argument of Series.apply.
df['summary_title'] = df["description"].astype(object).apply(_first_summary_heading)

In [None]:
import re
from functools import reduce
from nltk.corpus import stopwords


# Characters that are turned into a space: / ( ) { } [ ] | @ , ;
# Raw string literals keep the regex escapes (\[ \] \|) from being read as
# invalid Python string escapes, which newer interpreters warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lower-case ASCII letter or a space.
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z ]')

def replace_special_characters(text: str) -> str:
    """
    Substitutes a single space for every character matched by
    REPLACE_BY_SPACE_RE (parentheses, brackets, braces, slash,
    pipe, at-sign, comma and semicolon as compiled in this cell).
    """
    spaced_text = REPLACE_BY_SPACE_RE.sub(' ', text)
    return spaced_text

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Deletes every character matched by GOOD_SYMBOLS_RE, i.e. every
    character outside the allowed set defined by that expression.
    """
    cleaned_text = GOOD_SYMBOLS_RE.sub('', text)
    return cleaned_text

def filter_out_stopwords(text: str, stop_words=None) -> str:
    """
    Removes stop words from a whitespace-tokenised text.

    Parameters:
        text: the text to filter; it is split on whitespace.
        stop_words: optional iterable of words to remove. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        The remaining words joined by single spaces. Matching is exact
        (case-sensitive), as in the original implementation.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the original reloaded the stop-word list
    # and scanned it linearly for every single token.
    excluded = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in excluded)

def strip_text(text: str) -> str:
    """
    Trims whitespace (spaces, tabs, line breaks) from both ends of the text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """
    trimmed = text.strip()
    return trimmed

# Default steps applied, in order, by text_prepare.
# Lower-casing must come first: filter_out_uncommon_symbols keeps only
# digits, lower-case ASCII letters and spaces, so without this step every
# upper-case letter would be deleted from the text ("Hello" -> "ello").
PREPROCESSING_PIPELINE = [
  str.lower,
  replace_special_characters,
  filter_out_uncommon_symbols,
  strip_text
]

# Anchor method

def text_prepare(text: str,
                 filter_methods = None) -> str:
    """
    Runs the text through a chain of pre-processing callables.

    Each callable receives the output of the previous one, so the
    order of `filter_methods` matters. When `filter_methods` is None,
    the module-level PREPROCESSING_PIPELINE is used.
    """
    if filter_methods is None:
        filter_methods = PREPROCESSING_PIPELINE
    prepared = text
    for step in filter_methods:
        prepared = step(prepared)
    return prepared

In [None]:
# Load the heading annotations (a JSON object; the next cell reads its keys
# and compares its values against the string '1').
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('annotated.json', encoding='utf-8') as f:
    annotated = json.load(f)


In [None]:
# Headings whose annotation value is the string '1', in the order in which
# they appear in `annotated`.
good_headings = [heading for heading, label in annotated.items() if label == '1']

good_headings

In [None]:
# Retain only the rows whose summary heading is one of the accepted headings.
has_good_heading = df['summary_title'].isin(good_headings)
df = df[has_good_heading]

## Dataset creation

In [None]:
# Work on an independent two-column copy: the following cells assign to
# columns of `dataset`, and doing that on a selection of `df` triggers
# pandas' SettingWithCopyWarning and may not behave as intended.
dataset = df[['summaries', 'claims']].copy()

In [None]:
# Each cell holds the string form of a list; parse it and keep only its first
# element. ast.literal_eval accepts Python literals only, unlike eval(), which
# would execute whatever code the data happens to contain.
# NOTE: run this cell once per fresh `dataset` — after it has run, the column
# holds plain text rather than list literals.
dataset['claims'] = dataset['claims'].apply(lambda raw: ast.literal_eval(raw)[0])

In [None]:
import re
from functools import reduce

# Matches a parenthesised run of digits, e.g. "(400)".
# This constant and the pipeline below have their own names: the original cell
# reused REPLACE_BY_SPACE_RE and PREPROCESSING_PIPELINE, which silently changed
# what replace_special_characters and text_prepare (defined in an earlier cell)
# do once this cell has run. The raw string avoids invalid string escapes.
REFERENCE_NUMBER_RE = re.compile(r'\(\d+\)')

def filter_out_numbers(text: str) -> str:
    """Removes every parenthesised number such as "(400)" from the text."""
    return REFERENCE_NUMBER_RE.sub('', text)

def lower_text(text: str) -> str:
    """Returns the text converted to lower case."""
    return text.lower()

def strip_text(text: str) -> str:
    """Returns the text without leading and trailing whitespace."""
    return text.strip()

# Default steps applied, in order, by dataset_text_prepare.
DATASET_PREPROCESSING_PIPELINE = [
  strip_text,
  lower_text,
  filter_out_numbers,
]

# Anchor method

def dataset_text_prepare(text: str,
                 filter_methods = None) -> str:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!

    When `filter_methods` is None, DATASET_PREPROCESSING_PIPELINE is used:
    strip, lower-case, then remove parenthesised numbers.
    """
    filter_methods = filter_methods if filter_methods is not None else DATASET_PREPROCESSING_PIPELINE
    return reduce(lambda txt, f: f(txt), filter_methods, text)

In [None]:
# Normalise both text columns element by element with dataset_text_prepare.
dataset['summaries'] = dataset['summaries'].map(dataset_text_prepare)
dataset['claims'] = dataset['claims'].map(dataset_text_prepare)

In [None]:
# Display the prepared summaries/claims frame.
dataset

In [None]:

# Write the contents of `titles`, converted to a list, to title.json.
# NOTE(review): `titles` is not defined in any cell visible in this notebook
# (the `title` frame loaded earlier is deleted right after it is joined), so
# this cell raises NameError on a fresh Restart & Run All — TODO confirm
# where `titles` is supposed to come from.
with open('title.json', 'w') as f:
    json.dump(list(titles), f)