# Miniscule sample of production code (training pipeline)
---
This notebook is mainly used as a sandbox and to visualize results of code fragments before adding them to the production pipeline

<br>
<hr>
<br>

<a id='toc'></a>
### Table of Contents
[1. Create Dataframe from USPTO bulk data](#section-1)<br>
[2. Text Segmentation for Description Section](#section-2)<br>
[3. Append information about Chemical Structures in the patents](#section-3)<br>
[4. Text Preprocessing](#section-4)<br>
<br>
<hr>

<a id='section-1'></a>
### - Create Dataframe from USPTO bulk data

In [8]:
# Load dependencies
import xml.etree.ElementTree as ET
import xmltodict
import pandas as pd
import glob

In [9]:
# Define the pandas dataframe
cols = ['id', 'invention_title', 'abstract', 'claims', 'description', 'drawings_description', 'drawings_file_paths']
patents_df = pd.DataFrame(columns=cols)

# File counter
m = 0

# Loop through all folders and grab xml files
for folder in glob.glob('../Dataset/*'):
    
    # Select only main xml file (folder[11:35]) and ignore supplementary ones
    # that have different name pattern
    for _file in glob.glob(folder + '/' + folder[11:35] + '.XML'):
        
        # Taking a subgroup of only 20 files for experimentation purposes
        if m <= 20:
            # Parse xml tree
            tree = ET.parse(_file)
            root = tree.getroot()

            # Placeholder for text content
            abstract_text = ''
            claims_text = ''
            description_text = ''
            drawings_description_text = ''
            drawings_file_paths = []

            # Traverse XML tree and extract data we need
            if (root[0].tag == 'us-bibliographic-data-application'):

                # Extract document number as id
                _id = root[0].find('publication-reference').find('document-id').find('doc-number').text
                
                # Extract invention title
                invention_title = root[0].find('invention-title').text
                
                # Extract abstract
                abstract = root.find('abstract')
                
                # Extract claims
                claims = root.find('claims')
                
                # Extract all description
                description = root.find('description')
                
                # Extract drawings description (if present)
                if root.find('drawings') != None:
                    drawings_description = root.find('description').find('description-of-drawings')
                    
                # Extract drawings paths (if present)
                if root.find('drawings') != None:
                    drawings = root.find('drawings')

                # Store all paragraphs in the abstract section
                for child in abstract:
                    if (child.text != None):
                        abstract_text += child.text + '\n'

                # Store all paragraphs in the claims section
                for child in claims:
                    claims_text += ''.join(child.itertext()).replace('\n', ' ')
                    
                # Store all paragraphs in the description section
                for child in description:
                    description_text += ''.join(child.itertext()) + ' '
                    
                # Store all paragraphs in the drawings description section
                if drawings_description:
                    for child in drawings_description:
                        drawings_description_text += ''.join(child.itertext()) + ' '
                        
                # Store all drawings file paths
                if drawings:
                    for child in drawings:
                        drawings_file_paths.append(child[0].get('file'))

                # Write extracted content to dataframe
                patents_df = patents_df.append(pd.Series([_id, invention_title, abstract_text, claims_text, \
                                                          description_text, drawings_description_text, \
                                                         drawings_file_paths], index=cols), ignore_index=True)
                
        # Process only 20 files and break out of the loop
        else:
            break
    
        # File counter increment
        m += 1
    
# Show dataframe    
patents_df

Unnamed: 0,id,invention_title,abstract,claims,description,drawings_description,drawings_file_paths
0,20190151362,COMPOSITIONS AND METHODS OF CELLULAR IMMUNOTHE...,Disclosed herein are methods of treating a sub...,1. A method of treating a subject exhibiting ...,CROSS-REFERENCE This application is a continua...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901..."
1,20190134132,NUTRITION BLEND FOR HEALTH BENEFITS IN ANIMALS,A method of minimizing fat accumulation in a g...,1. A method of minimizing fat accumulation in...,CROSS REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901..."
2,20190224135,USE OF PROCALCITONIN (PCT) IN RISK STRATIFICAT...,Subject of the present invention are assays an...,1. An in vitro method for prognosis for a pat...,Subject of the present invention is the in vit...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190224135A1-20190725-D00001.TIF, US201902..."
3,20190153419,THROMBIN-THROMBOMODULIN FUSION PROTEINS AS A P...,Compositions and methods for regulating the bl...,1. A thrombin-thrombomodulin fusion protein c...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The patent o...,"[US20190153419A1-20190523-D00000.TIF, US201901..."
4,20190169293,HANP-FC-CONTAINING MOLECULAR CONJUGATE,The present invention provides a conjugate com...,1. A conjugate comprising a hANP peptide bond...,TECHNICAL FIELD The present invention relates ...,BRIEF DESCRIPTION OF DRAWINGS FIG. 1 schematic...,"[US20190169293A1-20190606-D00000.TIF, US201901..."
5,20190211060,CYCLIC PEPTIDE ANALOGS AND CONJUGATES THEREOF,"Provided are cyclic peptide analogs, conjugate...",1. A compound of Formula (I): or a salt t...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902..."
6,20190201310,ORAL CARE COMPOSITION,An aqueous composition with a higher-than-neut...,1. An oral care composition useful for treati...,CROSS-REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902..."
7,20190194292,Enveloped Virus Resistant to Complement Inacti...,A recombinant fusion protein is disclosed. The...,1. A fusion protein comprising: (a) a CD55 pe...,REFERENCE TO SEQUENCE LISTING The Sequence Lis...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1. Mamma...,"[US20190194292A1-20190627-D00000.TIF, US201901..."
8,20190194323,ANTI-SIGLEC-7 ANTIBODIES FOR THE TREATMENT OF ...,The invention provides methods and composition...,1. A method of inhibiting proliferation of tu...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 provi...,"[US20190194323A1-20190627-D00000.TIF, US201901..."
9,20190153042,ANTIMICROBIAL PEPTIDES DERIVED FROM HEPATITIS ...,A pharmaceutical composition comprising: (a) a...,1. A pharmaceutical composition comprising: (...,CROSS REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190153042A1-20190523-D00001.TIF, US201901..."


<hr>
<br>

<a id='section-2'></a>

### - Text Segmentation for Description Section

In [10]:
# Load dependencies
import pandas as pd
import re

In [89]:
def find_inbetween_text(text, sub1, sub2):
    """
    This function accepts a piece of text and extracts what's
    between any two given strings after finding their positions
    """
    
    # Get positions for the two strings
    pos1 = sub1
    pos2 = sub2
    
    if pos1 > pos2 and pos2 > 0:
        return text[pos2:pos1]
    elif pos2 > pos1 and pos1 > 0:
        return text[pos1:pos2]

In [90]:
# Try an example on first document description
# extract the background section text
boundary1 = re.search('([A-Z])*BACKGROUND(([A-Z])*(\s))*', patents_df['description'][0], re.MULTILINE)
boundary2 = re.search('([A-Z])*SUMMARY(([A-Z])*(\s))*', patents_df['description'][0], re.MULTILINE)

print(find_inbetween_text(patents_df['description'][0], boundary1.end(), boundary2.start()))

Cancer has a major impact on society in across the globe. In 2016, an estimated 1,685,210 new cases of cancer will be diagnosed in the United States alone, and 595,690 people will die from the disease. By 2020, 18.2 million Americans, roughly 1 in 19 people, will be cancer patients or cancer survivors, up from 11.7 million (1 in 26) in 2005, according to the Journal of Oncology Practice (Erikson 2007). Chimeric antigen receptors (CARs) are recombinant receptors for antigen, which, in a single molecule, redirect the specificity and function of T cells and other immune cells. Their use in cancer immunotherapy can be to rapidly generate tumor-targeted T cells, bypassing the obstacles of active immunization. Once expressed in cells, the CAR-modified cell may exert both immediate and long-term effects in a subject. Chimeric antigen receptor (CAR) T cell therapy, which edits a cancer patient's T cells to recognize their tumors, has shown to be effective for treating blood cancers. In recent 

In [13]:
background_boundary_start = 0
background_boundary_end = 0
invention_col = []

for index, row in patents_df.iterrows():
    background_boundary_start = re.search('([A-Z])*BACKGROUND(([A-Z])*(\s))*', \
                                          str(row['description']), re.MULTILINE)
    background_boundary_end = re.search('([A-Z])*SUMMARY(([A-Z])*(\s))*', \
                                        str(row['description']), re.MULTILINE)
    
    if background_boundary_start and background_boundary_end:
        invention_col.append(find_inbetween_text(str(row['description']), background_boundary_start.end(), \
                                             background_boundary_end.start()))
    else:
        invention_col.append(' ')
        
print(invention_col[2])

Procalcitonin (PCT) has become a well-established biomarker for sepsis diagnosis: PCT reflects the severity of bacterial infection and is in particular used to monitor progression of infection into sepsis, severe sepsis, or septic shock. PCT concentrations in sepsis, severe sepsis, or septic shock are typically above 1 ng/mL. It is possible to use PCT to measure the activity of the infection-associated systemic inflammatory response, to control success of therapy, and to estimate prognosis (Assicot M et al.: High serum procalcitonin concentrations in patients with sepsis and infection. Lancet 1993, 341:515-8; Clec'h C et al.: Diagnostic and prognostic value of procalcitonin in patients with septic shock. Crit Care Med 2004; 32:1166-9; Lee Y J et al.: Predictive comparisons of procalcitonin (PCT) level, arterial ketone body ratio (AKBR), APACHE III score and multiple organ dysfunction score (MODS) in systemic inflammatory response syndrome (SIRS), Yonsei Med J 2004, 45, 29-37; Meisner M

In [14]:
inventions_df = pd.DataFrame({'invention_background': invention_col})
inventions_df

Unnamed: 0,invention_background
0,Cancer has a major impact on society in across...
1,The rates of obesity and overweight have incre...
2,Procalcitonin (PCT) has become a well-establis...
3,
4,Human atrial natriuretic peptides (hANPs) are ...
5,Cancer is a serious and debilitating disease b...
6,"Microbes are found virtually everywhere, often..."
7,Oncolytic viruses have been tested as agents f...
8,"Siglec-7, also known as p75 or AIRM, is a memb..."
9,The increase of drug-resistant pathogens cause...


In [15]:
patents_df2 = pd.concat([patents_df, inventions_df], axis=1)
patents_df2

Unnamed: 0,id,invention_title,abstract,claims,description,drawings_description,drawings_file_paths,invention_background
0,20190151362,COMPOSITIONS AND METHODS OF CELLULAR IMMUNOTHE...,Disclosed herein are methods of treating a sub...,1. A method of treating a subject exhibiting ...,CROSS-REFERENCE This application is a continua...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",Cancer has a major impact on society in across...
1,20190134132,NUTRITION BLEND FOR HEALTH BENEFITS IN ANIMALS,A method of minimizing fat accumulation in a g...,1. A method of minimizing fat accumulation in...,CROSS REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",The rates of obesity and overweight have incre...
2,20190224135,USE OF PROCALCITONIN (PCT) IN RISK STRATIFICAT...,Subject of the present invention are assays an...,1. An in vitro method for prognosis for a pat...,Subject of the present invention is the in vit...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190224135A1-20190725-D00001.TIF, US201902...",Procalcitonin (PCT) has become a well-establis...
3,20190153419,THROMBIN-THROMBOMODULIN FUSION PROTEINS AS A P...,Compositions and methods for regulating the bl...,1. A thrombin-thrombomodulin fusion protein c...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The patent o...,"[US20190153419A1-20190523-D00000.TIF, US201901...",
4,20190169293,HANP-FC-CONTAINING MOLECULAR CONJUGATE,The present invention provides a conjugate com...,1. A conjugate comprising a hANP peptide bond...,TECHNICAL FIELD The present invention relates ...,BRIEF DESCRIPTION OF DRAWINGS FIG. 1 schematic...,"[US20190169293A1-20190606-D00000.TIF, US201901...",Human atrial natriuretic peptides (hANPs) are ...
5,20190211060,CYCLIC PEPTIDE ANALOGS AND CONJUGATES THEREOF,"Provided are cyclic peptide analogs, conjugate...",1. A compound of Formula (I): or a salt t...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...",Cancer is a serious and debilitating disease b...
6,20190201310,ORAL CARE COMPOSITION,An aqueous composition with a higher-than-neut...,1. An oral care composition useful for treati...,CROSS-REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...","Microbes are found virtually everywhere, often..."
7,20190194292,Enveloped Virus Resistant to Complement Inacti...,A recombinant fusion protein is disclosed. The...,1. A fusion protein comprising: (a) a CD55 pe...,REFERENCE TO SEQUENCE LISTING The Sequence Lis...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1. Mamma...,"[US20190194292A1-20190627-D00000.TIF, US201901...",Oncolytic viruses have been tested as agents f...
8,20190194323,ANTI-SIGLEC-7 ANTIBODIES FOR THE TREATMENT OF ...,The invention provides methods and composition...,1. A method of inhibiting proliferation of tu...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 provi...,"[US20190194323A1-20190627-D00000.TIF, US201901...","Siglec-7, also known as p75 or AIRM, is a memb..."
9,20190153042,ANTIMICROBIAL PEPTIDES DERIVED FROM HEPATITIS ...,A pharmaceutical composition comprising: (a) a...,1. A pharmaceutical composition comprising: (...,CROSS REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190153042A1-20190523-D00001.TIF, US201901...",The increase of drug-resistant pathogens cause...


In [16]:
# Repeat process for cross-reference section
ref_boundary_start = 0
ref_boundary_end = 0
ref_col = []

for index, row in patents_df2.iterrows():
    ref_boundary_start = re.search('CROSS-REFERENCE(([A-Z])*(\s))*', \
                                          str(row['description']), re.MULTILINE)
    ref_boundary_end = re.search('in their entirety', \
                                        str(row['description']), re.MULTILINE)
    
    if ref_boundary_start and ref_boundary_end:
        ref_col.append(find_inbetween_text(str(row['description']), ref_boundary_start.end(), \
                                             ref_boundary_end.start()))
    else:
        ref_col.append(' ')
        
print(ref_col[5])

This application claims priority to U.S. Provisional Application No. 62/383,330, filed Sep. 2, 2016, which is hereby incorporated by reference in its entirety. FIELD Provided herein are cyclic peptide analogs, pharmaceutical compositions comprising such compounds, and methods of treating cancer with such compounds. BACKGROUND Cancer is a serious and debilitating disease brought on by abnormal and unchecked cell division in a patient. Current treatment strategies include chemotherapy, radiation therapy, and surgery. These treatment options may be singular treatments or combined for a more effective regimen. Unfortunately, many patients do not respond well to current chemotherapeutic regimens or develop resistance after prolonged treatment. In addition, for many chemotherapeutics, there is a maximal lifetime level of drug that a patient may be administered. In this case, new drugs must be tried. Thus, there is a need for development of new and varied chemotherapeutic compounds to assist 

In [17]:
ref_df = pd.DataFrame({'cross_reference': ref_col})
ref_df

Unnamed: 0,cross_reference
0,
1,
2,
3,
4,
5,This application claims priority to U.S. Provi...
6,
7,
8,
9,


In [18]:
patents_df3 = pd.concat([patents_df2, ref_df], axis=1)
patents_df3

Unnamed: 0,id,invention_title,abstract,claims,description,drawings_description,drawings_file_paths,invention_background,cross_reference
0,20190151362,COMPOSITIONS AND METHODS OF CELLULAR IMMUNOTHE...,Disclosed herein are methods of treating a sub...,1. A method of treating a subject exhibiting ...,CROSS-REFERENCE This application is a continua...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",Cancer has a major impact on society in across...,
1,20190134132,NUTRITION BLEND FOR HEALTH BENEFITS IN ANIMALS,A method of minimizing fat accumulation in a g...,1. A method of minimizing fat accumulation in...,CROSS REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",The rates of obesity and overweight have incre...,
2,20190224135,USE OF PROCALCITONIN (PCT) IN RISK STRATIFICAT...,Subject of the present invention are assays an...,1. An in vitro method for prognosis for a pat...,Subject of the present invention is the in vit...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190224135A1-20190725-D00001.TIF, US201902...",Procalcitonin (PCT) has become a well-establis...,
3,20190153419,THROMBIN-THROMBOMODULIN FUSION PROTEINS AS A P...,Compositions and methods for regulating the bl...,1. A thrombin-thrombomodulin fusion protein c...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The patent o...,"[US20190153419A1-20190523-D00000.TIF, US201901...",,
4,20190169293,HANP-FC-CONTAINING MOLECULAR CONJUGATE,The present invention provides a conjugate com...,1. A conjugate comprising a hANP peptide bond...,TECHNICAL FIELD The present invention relates ...,BRIEF DESCRIPTION OF DRAWINGS FIG. 1 schematic...,"[US20190169293A1-20190606-D00000.TIF, US201901...",Human atrial natriuretic peptides (hANPs) are ...,
5,20190211060,CYCLIC PEPTIDE ANALOGS AND CONJUGATES THEREOF,"Provided are cyclic peptide analogs, conjugate...",1. A compound of Formula (I): or a salt t...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...",Cancer is a serious and debilitating disease b...,This application claims priority to U.S. Provi...
6,20190201310,ORAL CARE COMPOSITION,An aqueous composition with a higher-than-neut...,1. An oral care composition useful for treati...,CROSS-REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...","Microbes are found virtually everywhere, often...",
7,20190194292,Enveloped Virus Resistant to Complement Inacti...,A recombinant fusion protein is disclosed. The...,1. A fusion protein comprising: (a) a CD55 pe...,REFERENCE TO SEQUENCE LISTING The Sequence Lis...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1. Mamma...,"[US20190194292A1-20190627-D00000.TIF, US201901...",Oncolytic viruses have been tested as agents f...,
8,20190194323,ANTI-SIGLEC-7 ANTIBODIES FOR THE TREATMENT OF ...,The invention provides methods and composition...,1. A method of inhibiting proliferation of tu...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 provi...,"[US20190194323A1-20190627-D00000.TIF, US201901...","Siglec-7, also known as p75 or AIRM, is a memb...",
9,20190153042,ANTIMICROBIAL PEPTIDES DERIVED FROM HEPATITIS ...,A pharmaceutical composition comprising: (a) a...,1. A pharmaceutical composition comprising: (...,CROSS REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190153042A1-20190523-D00001.TIF, US201901...",The increase of drug-resistant pathogens cause...,


In [19]:
# Extract summary section
summary_boundary_start = 0
summary_boundary_end = 0
summary_col = []

for index, row in patents_df3.iterrows():
    summary_boundary_start = re.search('([A-Z])*SUMMARY(([A-Z])*(\s))*', \
                                          str(row['description']), re.MULTILINE)
    summary_boundary_end = re.search('([A-Z])*DESCRIPTION(([A-Z])*(\s))*', \
                                        str(row['description']), re.MULTILINE)
    
    if summary_boundary_start and summary_boundary_end:
        summary_col.append(find_inbetween_text(str(row['description']), summary_boundary_start.end(), \
                                             summary_boundary_end.start()))
    else:
        summary_col.append(' ')
        
print(summary_col[2])

The present invention is based on the surprising finding that in samples of patients with a primary, non-infectious disease, slightly elevated procalcitonin (PCT) levels (concentrations) have been detected at a large frequency and are of diagnostic relevance. Remarkably, the inventors have identified a large number of samples having serum levels above 0.03 ng/mL (26.0%) and 0.05 ng/mL (14.7%), respectively, from a total of 4997 samples of patients having a primary, non-infectious disease. Slightly elevated PCT levels relate to PCT levels in the range of from about 0.02 to 0.25 ng/mL, preferably between about 0.02 and 0.1 ng/mL. The presence of slightly elevated PCT levels may be indicative for the risk of a patient having a non-infectious primary disease to acquire a yet clinically unmanifested and/or yet asymptomatic further disease or medical condition. Such a further disease or medical condition may be related to a local infection or the local infection may facilitate, accelerate an

In [20]:
summary_df = pd.DataFrame({'summary': summary_col})
summary_df

Unnamed: 0,summary
0,There exists a pressing need for alternative a...
1,The present disclosure relates generally to ke...
2,The present invention is based on the surprisi...
3,
4,Technical Problem An object of the present inv...
5,"In one aspect, provided is a compound of Formu..."
6,The present invention is directed to compositi...
7,This invention provides a recombinant fusion p...
8,"The disclosure is based, in part, on the disco..."
9,"In one aspect, the invention relates to a phar..."


In [21]:
patents_df4 = pd.concat([patents_df3, summary_df], axis=1)
patents_df4

Unnamed: 0,id,invention_title,abstract,claims,description,drawings_description,drawings_file_paths,invention_background,cross_reference,summary
0,20190151362,COMPOSITIONS AND METHODS OF CELLULAR IMMUNOTHE...,Disclosed herein are methods of treating a sub...,1. A method of treating a subject exhibiting ...,CROSS-REFERENCE This application is a continua...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",Cancer has a major impact on society in across...,,There exists a pressing need for alternative a...
1,20190134132,NUTRITION BLEND FOR HEALTH BENEFITS IN ANIMALS,A method of minimizing fat accumulation in a g...,1. A method of minimizing fat accumulation in...,CROSS REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",The rates of obesity and overweight have incre...,,The present disclosure relates generally to ke...
2,20190224135,USE OF PROCALCITONIN (PCT) IN RISK STRATIFICAT...,Subject of the present invention are assays an...,1. An in vitro method for prognosis for a pat...,Subject of the present invention is the in vit...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190224135A1-20190725-D00001.TIF, US201902...",Procalcitonin (PCT) has become a well-establis...,,The present invention is based on the surprisi...
3,20190153419,THROMBIN-THROMBOMODULIN FUSION PROTEINS AS A P...,Compositions and methods for regulating the bl...,1. A thrombin-thrombomodulin fusion protein c...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The patent o...,"[US20190153419A1-20190523-D00000.TIF, US201901...",,,
4,20190169293,HANP-FC-CONTAINING MOLECULAR CONJUGATE,The present invention provides a conjugate com...,1. A conjugate comprising a hANP peptide bond...,TECHNICAL FIELD The present invention relates ...,BRIEF DESCRIPTION OF DRAWINGS FIG. 1 schematic...,"[US20190169293A1-20190606-D00000.TIF, US201901...",Human atrial natriuretic peptides (hANPs) are ...,,Technical Problem An object of the present inv...
5,20190211060,CYCLIC PEPTIDE ANALOGS AND CONJUGATES THEREOF,"Provided are cyclic peptide analogs, conjugate...",1. A compound of Formula (I): or a salt t...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...",Cancer is a serious and debilitating disease b...,This application claims priority to U.S. Provi...,"In one aspect, provided is a compound of Formu..."
6,20190201310,ORAL CARE COMPOSITION,An aqueous composition with a higher-than-neut...,1. An oral care composition useful for treati...,CROSS-REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...","Microbes are found virtually everywhere, often...",,The present invention is directed to compositi...
7,20190194292,Enveloped Virus Resistant to Complement Inacti...,A recombinant fusion protein is disclosed. The...,1. A fusion protein comprising: (a) a CD55 pe...,REFERENCE TO SEQUENCE LISTING The Sequence Lis...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1. Mamma...,"[US20190194292A1-20190627-D00000.TIF, US201901...",Oncolytic viruses have been tested as agents f...,,This invention provides a recombinant fusion p...
8,20190194323,ANTI-SIGLEC-7 ANTIBODIES FOR THE TREATMENT OF ...,The invention provides methods and composition...,1. A method of inhibiting proliferation of tu...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 provi...,"[US20190194323A1-20190627-D00000.TIF, US201901...","Siglec-7, also known as p75 or AIRM, is a memb...",,"The disclosure is based, in part, on the disco..."
9,20190153042,ANTIMICROBIAL PEPTIDES DERIVED FROM HEPATITIS ...,A pharmaceutical composition comprising: (a) a...,1. A pharmaceutical composition comprising: (...,CROSS REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190153042A1-20190523-D00001.TIF, US201901...",The increase of drug-resistant pathogens cause...,,"In one aspect, the invention relates to a phar..."


In [22]:
# Extract detailed description section
desc_boundary_start = 0
desc_boundary_end = 0
desc_col = []

for index, row in patents_df4.iterrows():
    desc_boundary_start = re.search('([A-Z])*DETAILED DESCRIPTION(([A-Z])*(\s))*', \
                                          str(row['description']), re.MULTILINE)
    desc_boundary_end = len(str(row['description']))
    
    if desc_boundary_start:
        desc_col.append(find_inbetween_text(str(row['description']), desc_boundary_start.end(), \
                                             desc_boundary_end))
    else:
        desc_col.append(' ')

print(desc_col[0])

The following description and examples illustrate embodiments of the disclosure in detail. It is to be understood that this disclosure is not limited to the particular embodiments described herein and as such can vary. Those of skill in the art will recognize that there are numerous variations and modifications of the disclosure, which are encompassed within its scope. Unless otherwise indicated, any embodiment can be combined with any other embodiment. As used herein, unless otherwise indicated, some inventive embodiments herein contemplate numerical ranges. A variety of aspects of this invention can be presented in a range format. It should be understood that the description in range format is merely for convenience and brevity and should not be construed as an inflexible limitation on the scope of the invention. Accordingly, the description of a range should be considered to have specifically disclosed all the possible subranges as well as individual numerical values within that ran

In [23]:
detailed_desc_df = pd.DataFrame({'detailed_description': desc_col})
detailed_desc_df

Unnamed: 0,detailed_description
0,The following description and examples illustr...
1,Definitions Some definitions are provided here...
2,The present invention relates to an in vitro m...
3,"Unless defined otherwise, all technical and sc..."
4,
5,"Definitions As used herein, reference to “abou..."
6,Useful basic (caustic) liquid compositions dis...
7,In accordance with the fusion protein of this ...
8,"Terminology As used in herein, the singular fo..."
9,The present invention is more particularly des...


In [24]:
patents_df5 = pd.concat([patents_df4, detailed_desc_df], axis=1)
patents_df5

Unnamed: 0,id,invention_title,abstract,claims,description,drawings_description,drawings_file_paths,invention_background,cross_reference,summary,detailed_description
0,20190151362,COMPOSITIONS AND METHODS OF CELLULAR IMMUNOTHE...,Disclosed herein are methods of treating a sub...,1. A method of treating a subject exhibiting ...,CROSS-REFERENCE This application is a continua...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",Cancer has a major impact on society in across...,,There exists a pressing need for alternative a...,The following description and examples illustr...
1,20190134132,NUTRITION BLEND FOR HEALTH BENEFITS IN ANIMALS,A method of minimizing fat accumulation in a g...,1. A method of minimizing fat accumulation in...,CROSS REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The novel fe...,"[US20190151362A1-20190523-D00000.TIF, US201901...",The rates of obesity and overweight have incre...,,The present disclosure relates generally to ke...,Definitions Some definitions are provided here...
2,20190224135,USE OF PROCALCITONIN (PCT) IN RISK STRATIFICAT...,Subject of the present invention are assays an...,1. An in vitro method for prognosis for a pat...,Subject of the present invention is the in vit...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190224135A1-20190725-D00001.TIF, US201902...",Procalcitonin (PCT) has become a well-establis...,,The present invention is based on the surprisi...,The present invention relates to an in vitro m...
3,20190153419,THROMBIN-THROMBOMODULIN FUSION PROTEINS AS A P...,Compositions and methods for regulating the bl...,1. A thrombin-thrombomodulin fusion protein c...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS The patent o...,"[US20190153419A1-20190523-D00000.TIF, US201901...",,,,"Unless defined otherwise, all technical and sc..."
4,20190169293,HANP-FC-CONTAINING MOLECULAR CONJUGATE,The present invention provides a conjugate com...,1. A conjugate comprising a hANP peptide bond...,TECHNICAL FIELD The present invention relates ...,BRIEF DESCRIPTION OF DRAWINGS FIG. 1 schematic...,"[US20190169293A1-20190606-D00000.TIF, US201901...",Human atrial natriuretic peptides (hANPs) are ...,,Technical Problem An object of the present inv...,
5,20190211060,CYCLIC PEPTIDE ANALOGS AND CONJUGATES THEREOF,"Provided are cyclic peptide analogs, conjugate...",1. A compound of Formula (I): or a salt t...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...",Cancer is a serious and debilitating disease b...,This application claims priority to U.S. Provi...,"In one aspect, provided is a compound of Formu...","Definitions As used herein, reference to “abou..."
6,20190201310,ORAL CARE COMPOSITION,An aqueous composition with a higher-than-neut...,1. An oral care composition useful for treati...,CROSS-REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1 is a 1...,"[US20190211060A1-20190711-D00001.TIF, US201902...","Microbes are found virtually everywhere, often...",,The present invention is directed to compositi...,Useful basic (caustic) liquid compositions dis...
7,20190194292,Enveloped Virus Resistant to Complement Inacti...,A recombinant fusion protein is disclosed. The...,1. A fusion protein comprising: (a) a CD55 pe...,REFERENCE TO SEQUENCE LISTING The Sequence Lis...,BRIEF DESCRIPTION OF THE FIGURES FIG. 1. Mamma...,"[US20190194292A1-20190627-D00000.TIF, US201901...",Oncolytic viruses have been tested as agents f...,,This invention provides a recombinant fusion p...,In accordance with the fusion protein of this ...
8,20190194323,ANTI-SIGLEC-7 ANTIBODIES FOR THE TREATMENT OF ...,The invention provides methods and composition...,1. A method of inhibiting proliferation of tu...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 provi...,"[US20190194323A1-20190627-D00000.TIF, US201901...","Siglec-7, also known as p75 or AIRM, is a memb...",,"The disclosure is based, in part, on the disco...","Terminology As used in herein, the singular fo..."
9,20190153042,ANTIMICROBIAL PEPTIDES DERIVED FROM HEPATITIS ...,A pharmaceutical composition comprising: (a) a...,1. A pharmaceutical composition comprising: (...,CROSS REFERENCE TO RELATED APPLICATIONS This i...,BRIEF DESCRIPTION OF THE DRAWINGS FIG. 1 shows...,"[US20190153042A1-20190523-D00001.TIF, US201901...",The increase of drug-resistant pathogens cause...,,"In one aspect, the invention relates to a phar...",The present invention is more particularly des...


<hr>
<br>

<a id='section-3'></a>

### - Append information about Chemical Structures in the patents

In [1]:
# Loading dependencies
import subprocess
import glob
import pandas as pd
import os

In [103]:
# Extract SMILES notation

# File counter
i = 0

# Loop through all folders and grab .tif image files
for folder in glob.glob('../Dataset/*'):
    
    if ([folder[13:-9] == item for item in patents_df['id']]):
        print('Recognizing structures in file', folder[11:])
        
        # Taking a subgroup of only 20 files for experimentation purposes
        if i <= 20:
    
            # Select only main tiff file (folder[11:]) and ignore supplementary ones
            for _file in glob.glob(folder + '/*.TIF'):

                    # Check if folder with the current document name exists
                    if not os.path.exists('../temp/chemical-names-smiles/' + folder[11:]):

                        # If folder does not exist, create it
                        os.makedirs('../temp/chemical-names-smiles/' + folder[11:])

                    subprocess.check_call(['osra', _file, \
                                           '-w ../temp/chemical-names-smiles/' + str(_file[11:-4]) + '-' + str(i) + '.txt'])
        else:
            break
        
    i += 1

Recognizing structures in file US20190151362A1-20190523
Recognizing structures in file US20190134132A1-20190509
Recognizing structures in file US20190224135A1-20190725
Recognizing structures in file US20190153419A1-20190523


CalledProcessError: Command '['osra', '../Dataset/US20190153419A1-20190523/US20190153419A1-20190523-D00000.TIF', '-w ../temp/chemical-names-smiles/US20190153419A1-20190523/US20190153419A1-20190523-D00000-3.txt']' died with <Signals.SIGABRT: 6>.

In [7]:
# Sample output
with open('../temp/chemical-names-smiles/US20190126075A1-20190502/US20190126075A1-20190502-D00003-17.txt', 'rb') as f:
    print(f.read())

b'N*SSC[C@H](CC(C)C)C\n'


In [99]:
# File counter
j = 0

smiles_col = []

# Loop through all folders and grab .tif image files
for folder in glob.glob('../temp/chemical-names-smiles/*'):
    
    print('Reading SMILES in file', folder[30:])
    smiles_for_one = []
    
    # Select only main tiff file (folder[11:]) and ignore supplementary ones
    for _file in glob.glob(folder + '/*.txt'):

        # Taking a subgroup of only 20 files for experimentation purposes
        if j <= 20:

            # Read each file and append each line that represents a compound
            # to an array
            one = ''

            with open(_file, 'r') as smiles_file:
                one = smiles_file.read()

            if one != '':
                smiles_for_one += one.split()

        else:
            break
            
    # Append to a global array
    if len(smiles_for_one) == 0 or smiles_for_one == None:
        smiles_col.append([''])
    else:
        smiles_col.append(smiles_for_one)
    
    j += 1

Reading SMILES in file US20190151362A1-20190523
Reading SMILES in file US20190224135A1-20190725
Reading SMILES in file US20190153419A1-20190523
Reading SMILES in file US20190169293A1-20190606
Reading SMILES in file US20190211060A1-20190711
Reading SMILES in file US20190194292A1-20190627
Reading SMILES in file US20190194323A1-20190627
Reading SMILES in file US20190153042A1-20190523
Reading SMILES in file US20190161517A1-20190530
Reading SMILES in file US20190224268A1-20190725
Reading SMILES in file US20190136205A1-20190509
Reading SMILES in file US20190175555A1-20190613
Reading SMILES in file US20190194262A1-20190627
Reading SMILES in file US20190151335A1-20190523
Reading SMILES in file US20190160016A1-20190530
Reading SMILES in file US20190126075A1-20190502
Reading SMILES in file US20190133908A1-20190509
Reading SMILES in file US20190201323A1-20190704


In [57]:
smiles_df = pd.DataFrame({'chemical_compounds_smiles': smiles_col})
smiles_df

Unnamed: 0,chemical_compounds_smiles
0,"[C=C1CC/[C]/1=C\1/CC2[C@@H](C1)C(C2)*C1*#CCC1,..."
1,"[CC1CC2C1C1*C(C3C2C13)C, C*(NCCOCC*(OCCn1nnc2c..."
2,[C/C=C(/[C@H]1OC(=O)C(C)(C)NC(=O)C(NC(=O)CN(C)...
3,"[CC****C(C(C(C(C)*)*)*)N, CCC[C@H](C***[C+]1C$..."
4,"[*CC(C1CC(CC1(C)C1(I)CCC(C1)(C)C)(*)C)O, CCC1(..."
5,"[CC(C(=O)CCCCCN1C(=O)CC(C1=O)C(C)C)C, C*SCC(=O..."
6,"[CCC(C1C/C/1=C/C/C=C/1\[C@H]2C1CCC2)O, C1CC23C..."
7,"[*=NN(/*=N\I)C[C@@H](c1ccccc1)OC(=O)N, C1CC2C(..."
8,"[CC1CC*C1, *O[C]1(=*)CCC2C1(C)CC(c1ccc3c(c1)CC..."
9,[N*SSC[C@H](CC(C)C)C]


In [100]:
print(len(smiles_col))

18


<hr>
<br>

<a id='section-4'></a>

### - Text Preprocessing

In [None]:
def preprocessText(text):
    """
    This function operates on any piece of text and applies
    transformation so it is "clean" and ready for the next step
    in the NLP pipeline
    """

    # Remove unwanted characters
    unwanted_chars = set(["@", "+", '/', "'", '"', '\\', '', '\\n', '\n',
                          '?', '#', '%', '$', '&', ';', '!', ';', ':', "*", "_", "="])
    
    for char in unwanted_chars:
        text = text.replace(char, '')

    # Convert all text into lowercase
    text = text.lower()

    return text