In [36]:
import requests
import json
import pandas as pd
import os
import openai

In [3]:
with open("/Users/julia_patsiukova/Downloads/api_keys/api_lens_org.txt", "r") as file:
    lens_api_key = file.read().strip()

os.environ["LENS_API_KEY"] = lens_api_key

url = 'https://api.lens.org/patent/search'

In [6]:
query = {
    "query": {
        "bool": {
            "must": [
                {
                    "query_string": {
                        "query": "\"ASO\" OR \"antisense oligonucleotides\"",
                        "fields": ["title", "abstract", "claims"],
                        "default_operator": "OR"
                    }
                }
            ]
        }
    },
    "size": 13
}

headers = {
    'Authorization': f'Bearer {lens_api_key}',
    'Content-Type': 'application/json'
}

response = requests.post(url, headers=headers, data=json.dumps(query))

if response.status_code == 200:
    data = response.json()
else:
    print(f"Error: {response.status_code} - {response.text}")

In [21]:
# the json structure of the outcome
def print_json_structure(data, indent=0):
    if isinstance(data, dict):
        for key in data:
            print('  ' * indent + str(key))
            print_json_structure(data[key], indent + 1)
    elif isinstance(data, list) and len(data) > 0:
        print('  ' * indent + '[Array of {}]'.format(type(data[0]).__name__))
        print_json_structure(data[0], indent + 1)
    else:
        print('  ' * indent + str(type(data).__name__))

if response.status_code == 200:
    data = response.json()
    print_json_structure(data)
else:
    print(f"Error: {response.status_code} - {response.text}")

total
  int
max_score
  float
data
  [Array of dict]
    lens_id
      str
    jurisdiction
      str
    doc_number
      str
    kind
      str
    date_published
      str
    doc_key
      str
    docdb_id
      int
    lang
      str
    biblio
      publication_reference
        jurisdiction
          str
        doc_number
          str
        kind
          str
        date
          str
      application_reference
        jurisdiction
          str
        doc_number
          str
        kind
          str
        date
          str
      priority_claims
        claims
          [Array of dict]
            jurisdiction
              str
            doc_number
              str
            kind
              str
            date
              str
            sequence
              int
        earliest_claim
          date
            str
      invention_title
        [Array of dict]
          text
            str
          lang
            str
      parties
        applicants

In [8]:
data_list = []

for patent in data.get('data', []):
    lens_id = patent.get('lens_id', 'N/A')

    title = patent.get('biblio', {}).get('invention_title', [{}])[0].get('text', 'N/A')

    abstract = patent.get('abstract', [{}])[0].get('text', 'N/A')

    claims = " ".join(patent.get('claims', [{}])[0].get('claims', [{}])[0].get('claim_text', ["N/A"]))

    description = patent.get('description', {}).get('text', 'N/A')

    data_list.append({
        'Lens ID': lens_id,
        'Title': title,
        'Abstract': abstract,
        'Claims': claims,
        'Description': description
    })

df = pd.DataFrame(data_list)

In [12]:
len(df['Description'][3])

9530

In [9]:
# example of the created df
df

Unnamed: 0,Lens ID,Title,Abstract,Claims,Description
0,006-252-632-930-02X,Antisense oligonucleotides targeting non-codin...,The invention discloses antisense oligonucleot...,,
1,035-902-563-142-209,Antisense oligonucleotide (ASO) gene inhibitio...,Embodiments of the present invention relate ge...,,
2,073-575-549-953-147,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,
3,088-236-812-481-174,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,CLAIMS 1. A method of treating a patient diagn...,Antisense Oligonucleotide (ASO) Gene Inhibitio...
4,093-656-511-507-299,ANTISENSE OLIGONUCLEOTIDES,Provided herein include conditionally activata...,"WHAT IS CLAIMED IS: 1. An oligonucleotide, com...",ANTISENSE OLIGONUCLEOTIDES CROSS-REFERENCE TO ...
5,103-752-500-267-462,Antisense oligonucleotides and uses thereof,Disclosed herein are novel single strand antis...,,
6,114-832-383-274-557,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,
7,117-058-033-153-332,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,
8,148-957-118-252-691,Antisense Oligonucleotide (ASO) Gene Inhibitio...,Embodiments of the invention related generally...,1 . A method of treating a patient diagnosed w...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...
9,156-638-061-645-268,INHIBICION GENICA Y TRATAMIENTO MEDIANTE SECUE...,Embodiments of the invention related generally...,,


### GenAI

In [32]:
import openai
with open("/Users/julia_patsiukova/Downloads/api_keys/openai_api_key.txt", "r") as file:
    openai_api_key = file.read().strip()

os.environ["OPENAI_API_KEY"] = openai_api_key
MODEL = "gpt-4o-mini"

columns_to_add = [
    'target_gene', 'target_exon', 'target_variant', 'cell_line',
    'transfection_approach', 'aso_type', 'oligonucleotide_sequence', 'efficiency'
]
for col in columns_to_add:
    df[col] = None

for index, row in df.iterrows():
    description = row['Description']

    if not description.strip():
        for col in columns_to_add:
            df.at[index, col] = 'No text'
        continue

    try:
        completion = openai.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert bioinformatician who is capable of analyzing scientific literature about antisense oligonucleotides."
                },
                {
                    "role": "user",
                    "content": f"""Read this text and retrieve the information about if any:
                                - name of the target gene (sequence)
                                - name of the target exon
                                - name of the target variant
                                - cell line
                                - transfection approach
                                - ASO type
                                - oligonucleotide sequence
                                - efficiency

                                If there is no text to analyze put 'No text'. If there is a text but no information about some bullet points put 'N/A' in corresponding column.

                                Text: {description}
                                """
                },
            ]
        )

        completion_json = json.loads(completion.to_json())

        response = completion_json["choices"][0]["message"]["content"]

        lines = response.split("\n")
        header = lines[0].split("|")[1:-1]
        data_lines = lines[2:]

        if data_lines:
            first_row = data_lines[0].split("|")[1:-1]
            first_row = [cell.strip() for cell in first_row]
            for col, value in zip(columns_to_add, first_row):
                df.at[index, col] = value
        else:
            for col in columns_to_add:
                df.at[index, col] = 'N/A'

    except Exception as e:
        print(f"Error processing row with Lens ID {row['Lens ID']}: {e}")
        for col in columns_to_add:
            df.at[index, col] = 'Error'

In [33]:
df

Unnamed: 0,Lens ID,Title,Abstract,Claims,Description,target_gene,target_exon,target_variant,cell_line,transfection_approach,aso_type,oligonucleotide_sequence,efficiency
0,006-252-632-930-02X,Antisense oligonucleotides targeting non-codin...,The invention discloses antisense oligonucleot...,,,,,,,,,,
1,035-902-563-142-209,Antisense oligonucleotide (ASO) gene inhibitio...,Embodiments of the present invention relate ge...,,,,,,,,,,
2,073-575-549-953-147,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,,,,,,,,,
3,088-236-812-481-174,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,CLAIMS 1. A method of treating a patient diagn...,Antisense Oligonucleotide (ASO) Gene Inhibitio...,JAK2,,V617F,HEL,,Phosphorothioate MOE backbone,5’-UCUCCAGAUUAUGAACUAU-3’ (SEQ ID NO 3),Approximately 50% decrease in JAK2 protein lev...
4,093-656-511-507-299,ANTISENSE OLIGONUCLEOTIDES,Provided herein include conditionally activata...,"WHAT IS CLAIMED IS: 1. An oligonucleotide, com...",ANTISENSE OLIGONUCLEOTIDES CROSS-REFERENCE TO ...,MAPT (microtubule-associated protein tau),,"SEQ ID NO: 169, SEQ ID NO: 170",HCT116 colorectal carcinoma,Lipofectamine 2000,Stem-loop,SEQ ID NO: 162 (ATTTCCAAATTCACTTTTAC),21.5 nM IC50 (Seq1-Control 1)
5,103-752-500-267-462,Antisense oligonucleotides and uses thereof,Disclosed herein are novel single strand antis...,,,,,,,,,,
6,114-832-383-274-557,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,,,,,,,,,
7,117-058-033-153-332,ANTISENSE OLIGONUCLEOTIDE (ASO) GENE INHIBITIO...,Embodiments of the invention related generally...,,,,,,,,,,
8,148-957-118-252-691,Antisense Oligonucleotide (ASO) Gene Inhibitio...,Embodiments of the invention related generally...,1 . A method of treating a patient diagnosed w...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...,JAK2,,V617F,SET-2,Incubation with ASO,ASO-T-JAK2,SEQ ID NO 4: 5′-CAAAGAAAGACUAAGGAAA-3′<br>SEQ ...,"Significant reduction in RLUs (average < 2,000..."
9,156-638-061-645-268,INHIBICION GENICA Y TRATAMIENTO MEDIANTE SECUE...,Embodiments of the invention related generally...,,,,,,,,,,


In [35]:
print(completion.choices[0].message.content)

| Target Gene | Target Exon | Target Variant | Cell Line      | Transfection Approach | ASO Type      | Oligonucleotide Sequence                      | Efficiency |
|-------------|-------------|----------------|-----------------|-----------------------|----------------|-----------------------------------------------|------------|
| JAK2        | N/A         | V617F          | SET-2, HEL, CMK | N/A                   | ASO-T-JAK2     | 5’-CAAAGAAAGACUAAGGA-3’ (SEQ ID NO 4)       | Significant reduction in cell viability       |
|             |             |                |                 |                       |                | 5’-CA A AGA A AGACU A AGGA A A-3’ (SEQ ID NO 5)|            |
| IGHMBP2     | N/A         | C31401 A       | Fibroblast      | N/A                   | ASO-T-IGHMBP2  | UCUUCCCCCUGUGGAAGUG (SEQ ID NO 8)           | N/A        |
