In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
import os
import sys

In [5]:
# Make the project library importable when its location is supplied through
# the NLP_PATENT_PATH environment variable.
library_path = os.environ.get("NLP_PATENT_PATH")
# os.environ.get returns None when the variable is unset; only strings belong
# on sys.path, so skip the append in that case instead of inserting None.
if library_path is not None and library_path not in sys.path:
    sys.path.append(library_path)

In [11]:
%aimport settings

In [57]:
import json

from docx import Document
import re
import xml.dom.minidom
import zipfile
from pathlib import Path

In [16]:
from settings import DATA_DIR

In [93]:
def get_text(filename_or_doc):
    """Return the text of every paragraph of a document, joined by newlines.

    ``filename_or_doc`` may be a ``str`` or ``Path``, in which case it is
    handed to ``Document(...)`` to open it. Any other object is used as-is and
    only needs a ``paragraphs`` iterable whose items have a ``text`` attribute.
    """
    if isinstance(filename_or_doc, (str, Path)):
        source = Document(filename_or_doc)
    else:
        source = filename_or_doc
    return '\n'.join(paragraph.text for paragraph in source.paragraphs)

def print_json(data):
    """Print ``data`` serialized as JSON with a four-space indent."""
    rendered = json.dumps(data, indent=4)
    print(rendered)

### Read the data

In [91]:
# `path` is assigned here so that this cell runs on a fresh kernel without
# depending on a cell that appears further down the notebook.
path = DATA_DIR / "pipeline_sample"

# Load every file matched by the glob into `data`, keyed by the second
# dot-separated component of the file name (stripped and lower-cased).
json_files = list(path.glob("*json"))
data = {}
for file in json_files:
    key = file.name.split(".")[1].strip().lower()
    # Open with an explicit encoding so decoding does not depend on the
    # platform's locale default.
    with file.open(encoding="utf-8") as f:
        data[key] = json.load(f)

print(data.keys())

dict_keys(['context', 'before_analyze', 'context_request', 'after_analyze'])


In [59]:
# Point at the sample directory and extract the text of the first file the
# "*docx" glob yields.
path = DATA_DIR / "pipeline_sample"
docx_files = list(path.glob("*docx"))
fname = docx_files[0]
text = get_text(fname)

In [120]:
# Print the first part of the document text that follows the claims marker.
PATT = "WHAT IS CLAIMED IS:"
start = text.find(PATT)
# str.find returns -1 when the marker is absent; without this check the
# arithmetic below would silently produce a meaningless slice.
if start == -1:
    raise ValueError(f"Marker {PATT!r} not found in the document text")
total_length = len(text) - start
proportion = 0.2  # fraction of the text from the marker onward to print
end = start + int(proportion * total_length)
print(text[start:end])

WHAT IS CLAIMED IS:
A method of wireless communication performed by a user equipment (UE), comprising:
receiving, when operating in a network that provides synchronization signals at a plurality of locations in a channel bandwidth, a common search space (CSS) indicator for a synchronization signal, wherein the CSS indicator indicates that the synchronization signal is not associated with a CSS for a downlink control channel; and
determining a location of the CSS for the downlink control channel based at least in part on the CSS indicator.

The method of claim 1, wherein the CSS indicator is receiving in a physical broadcast channel (PBCH).

The method of claim 2, wherein the location of the CSS is determined based at least in part on the physical broadcast channel.

The method of claim 1, wherein the CSS indicator includes information identifying a location of another synchronization signal that is associated with the CSS; and
wherein the location of the CSS is determined based at leas

In [119]:
# Pretty-print the first five entries of the "before_analyze" claims list.
claims_preview = data["before_analyze"]["Claims"][:5]
print_json(claims_preview)

[
    {
        "Preamble": "A method of wireless communication performed by a user equipment (UE), comprising:",
        "ClaimNumber": 1,
        "Limitations": [
            {
                "LimitationText": "receiving, when operating in a network that provides synchronization signals at a plurality of locations in a channel bandwidth, a common search space (CSS) indicator for a synchronization signal, wherein the CSS indicator indicates that the synchronization signal is not associated with a CSS for a downlink control channel; and",
                "Limitations": []
            },
            {
                "LimitationText": "determining a location of the CSS for the downlink control channel based at least in part on the CSS indicator.",
                "Limitations": []
            }
        ],
        "ClaimType": 0,
        "ParentClaimNumbers": [],
        "DeviceName": ""
    },
    {
        "Preamble": "The method of claim 1, wherein the CSS indicator is receiving in a

### Analyze what has changed

In [154]:
# Compare each claim under "before_analyze" with the claim at the same
# position under "after_analyze", print every field whose value differs, and
# collect the names of those fields in `changed_keys`.
# NOTE: zip stops at the shorter list, so surplus claims on either side are
# not compared.
changed_keys = set()
for i, (x, y) in enumerate(zip(data["before_analyze"]["Claims"],
                               data["after_analyze"]["Claims"])):
    # Both versions of a claim must expose exactly the same set of keys.
    diff = x.keys() ^ y.keys()
    assert len(diff) == 0
    # Report every value that differs between the two versions. No counter is
    # kept here: nothing in the notebook initialises or reads one.
    for key in x.keys():
        before = x[key]
        after = y[key]
        if before != after:
            changed_keys.add(key)
            # Two positional arguments are passed on purpose to keep the
            # printed format unchanged (print() inserts a space between them).
            print(f"Claim: {repr(i)}, Key: {repr(key)}, "
                  f"Before: {repr(before)}, ",
                  f"After: {repr(after)}")

Claim: 0, Key: 'ClaimType', Before: 0,  After: 1
Claim: 0, Key: 'DeviceName', Before: '',  After: 'user equipment (UE)'
Claim: 1, Key: 'ClaimType', Before: 0,  After: 1
Claim: 1, Key: 'ParentClaimNumbers', Before: [],  After: [1]
Claim: 1, Key: 'DeviceName', Before: '',  After: 'method of claim 1'
Claim: 2, Key: 'ClaimType', Before: 0,  After: 1
Claim: 2, Key: 'ParentClaimNumbers', Before: [],  After: [2]
Claim: 2, Key: 'DeviceName', Before: '',  After: 'method of claim 2'
Claim: 3, Key: 'ClaimType', Before: 0,  After: 1
Claim: 3, Key: 'ParentClaimNumbers', Before: [],  After: [1]
Claim: 3, Key: 'DeviceName', Before: '',  After: 'method of claim 1'
Claim: 4, Key: 'ClaimType', Before: 0,  After: 1
Claim: 4, Key: 'ParentClaimNumbers', Before: [],  After: [4]
Claim: 4, Key: 'DeviceName', Before: '',  After: 'method of claim 4'
Claim: 5, Key: 'ClaimType', Before: 0,  After: 1
Claim: 5, Key: 'ParentClaimNumbers', Before: [],  After: [1]
Claim: 5, Key: 'DeviceName', Before: '',  After: 'meth

In [149]:
# Sort the names so the printed order is stable; the iteration order of a set
# of strings can differ between interpreter runs.
print(f"Changed keys: {sorted(changed_keys)}")

Changed keys: ['ParentClaimNumbers', 'ClaimType', 'DeviceName']
