Code of notebook [NCBI-scraper-simple](https://github.com/Lady3mlnm/NCBI-scraper-simple/blob/main/NCBI-scraping.ipynb)<br />
To get the list of variations for a gene: open [https://www.ncbi.nlm.nih.gov/clinvar/](https://www.ncbi.nlm.nih.gov/clinvar/), search for "insensitivity to pain SCN9A", and download the results as "UI List".

In [1]:
from datetime import datetime
from pathlib import Path
from time import sleep

import IPython.display as ipd
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# cap the width of a displayed DataFrame column at 60 characters
pd.set_option('display.max_colwidth', 60)

In [3]:
# read the list of variation ids from file, one id per line
with open('input/UI List -test.txt', 'r') as fh:
    # Blank lines (for example an empty line at the end of the file) are skipped;
    # otherwise an empty id would later be requested as '.../clinvar/variation//'.
    ls_variations = [line.strip() for line in fh if line.strip()]

print('Number of variations to process:', len(ls_variations))
print(ls_variations[:5])

Number of variations to process: 4
['530654', '331895', '471143', '892607']


<br />
<br />

In [4]:
def _split_alleles(alleles, rsid):
    """
    Split an allele string such as 'C>G / C>T' into reference and alternative alleles.

    Returns a tuple (ref_allele, alt_alleles); several values are joined with ', '.
    Every pair is expected to be exactly 3 characters of the form 'X>Y'. As soon as
    one pair does not match, a message is printed and ('-', '-') is returned for the
    whole string, so the placeholders are never mixed with real alleles.
    """
    refs = []
    alts = []
    for pair in alleles.split(' / '):
        if len(pair) != 3:
            print(f"WARNING: rsid = {rsid}: problem with splitting of allele string, length of pair is {len(pair)} instead of expected 3")
            print(f"Alleles: {pair[:30]}{'…' if len(pair)>30 else ''}")
            return '-', '-'
        try:
            al_1, al_2 = pair.split('>')
        except ValueError as ex:
            print(f'CAUGHT ERROR: rsid = {rsid}: {ex}')
            return '-', '-'

        # each distinct reference allele is listed once
        if al_1 not in refs:
            refs.append(al_1)
            if len(refs) > 1:
                print(f'For rsid = {rsid}, there are more than one reference allele')
        alts.append(al_2)

    return ', '.join(refs), ', '.join(alts)


def handle_rsid(rsid, url_rsid, variation, url_variation):
    """
    Download the page of one rsid and pack selected values into a pandas Series.

    Parameters
    ----------
    rsid : str or int
        Identifier of the SNP; used in log messages and stored in the result.
    url_rsid : str
        Address of the page that is downloaded and parsed.
    variation : str
        Variation id the rsid belongs to; stored in the result unchanged.
    url_variation : str
        Address of the variation page; stored in the result unchanged.

    Returns
    -------
    pandas.Series
        Keys: rsid, organism, position, ref_genome, ref_allele, alt_alleles,
        variation_type, clinical_significance, gene_consequence, publications,
        url_rsid, variation, url_variation.
        A value that is not present on the page is reported as '-'.

    Notes
    -----
    The request is repeated every 3 seconds, without an upper limit, until the
    server answers with status code 200.
    The function is not hardened against an unexpected page layout: if the summary
    block (or its second half) is not found, the lookup fails with an exception.
    """
    # Send the request to the server. If there is a problem, try again.
    response = requests.get(url_rsid)
    while response.status_code != 200:
        print(f'WARNING: rsid = {rsid}, response_code = {response.status_code}')
        sleep(3)
        response = requests.get(url_rsid)

    # parse response-object to BeautifulSoup-object
    soup = BeautifulSoup(response.text, 'html.parser')

    # preliminary extraction: the two halves of the summary block that hold the values
    summary_box = soup.find('div', attrs={'class': 'summary-box usa-grid-full'})
    ls_dl = summary_box.find_all('dl', attrs={'class': 'usa-width-one-half'})

    # Placeholders for values that may be absent from the page. '-' is the same
    # marker that is used when the allele string cannot be parsed.
    organism = pos = ref_genome = '-'
    ref_allele = alt_alleles = '-'
    variation_type = '-'
    clinical_significance = gene_consequence = publications = '-'

    # left half of the data block on the page
    for el_dt, el_dd in zip(ls_dl[0].find_all('dt'), ls_dl[0].find_all('dd')):
        label = el_dt.text.strip()
        if label == 'Organism':
            organism = el_dd.text.strip()
        elif label == 'Position':
            ls_span = el_dd.find_all('span')
            pos = ls_span[0].text.strip()
            ref_genome = ls_span[1].text.strip(" ( )")
        elif label == 'Alleles':
            ref_allele, alt_alleles = _split_alleles(el_dd.text.strip(), rsid)
        elif label == 'Variation Type':
            # only the first line of the cell is kept
            variation_type = el_dd.text.lstrip().split('\n')[0]

    # right half of the data block on the page
    for el_dt, el_dd in zip(ls_dl[1].find_all('dt'), ls_dl[1].find_all('dd')):
        label = el_dt.text.strip()
        if label == 'Clinical Significance':
            clinical_significance = el_dd.text.strip()
        elif label == 'Gene : Consequence':
            gene_consequence = '; '.join(tag.text.strip() for tag in el_dd.find_all())
        elif label == 'Publications':
            publications = ' '.join(st.strip() for st in el_dd.text.strip().split('\n'))

    # pack variables into Pandas series
    return pd.Series({
        'rsid': rsid,
        'organism': organism,
        'position': pos[3:] if pos.startswith('chr') else pos,   # drop the 'chr' prefix
        'ref_genome': ref_genome,
        'ref_allele': ref_allele,
        'alt_alleles': alt_alleles,
        'variation_type': variation_type,
        'clinical_significance': clinical_significance,
        'gene_consequence': gene_consequence,
        'publications': publications,
        'url_rsid': url_rsid,
        'variation': variation,
        'url_variation': url_variation
    })

In [5]:
def handle_variation(variation:str):
    """
    Download the ClinVar page of one variation and scrape the rsid it refers to.

    Parameters
    ----------
    variation : str
        Variation id; it is appended to the ClinVar variation address.

    Returns
    -------
    pandas.Series or None
        The result of handle_rsid() for the first sibling of the 'dt.hashelp'
        element whose text starts with 'dbSNP: rs'. None if the page has no such
        element or no such sibling.

    Notes
    -----
    The request is repeated every 3 seconds, without an upper limit, until the
    server answers with status code 200.
    """
    url_variation = 'https://www.ncbi.nlm.nih.gov/clinvar/variation/' + variation + '/'

    # Send the request to the server. If there is a problem, try again.
    response = requests.get(url_variation)
    while response.status_code != 200:
        print(f'WARNING: variation = {variation}, response_code = {response.status_code}')
        sleep(3)
        response = requests.get(url_variation)

    # parse response-object to BeautifulSoup-object
    soup = BeautifulSoup(response.text, 'html.parser')

    rsid = None
    url_rsid = None

    # element that is the starting point for the search; may be absent from the page
    start = soup.find('dt', attrs={'class': 'hashelp'})

    # walk over the following siblings until the rsid is found or the siblings run out
    el = start.find_next_sibling() if start is not None else None
    while el is not None:
        text = el.text.strip()
        if text.startswith('dbSNP: rs'):
            rsid = text[7:]                     # cut off the 'dbSNP: ' prefix
            url_rsid = el.find('a')['href']
            break
        el = el.find_next_sibling()

    if rsid is None:
        return None
    return handle_rsid(rsid, url_rsid, variation, url_variation)
    
# ser = handle_variation(ls_variations[1])
# ser

In [6]:
# One Series per successfully scraped variation is collected here and the table
# is built once after the loop: DataFrame.append no longer exists in pandas 2.x,
# and growing a DataFrame row by row copies it on every step.
rows = []

# print header for preview inside this Jupyter notebook
print("  #   variation rsid         ref_genome ref_allele   alt_alleles  gene_consequence")

for i, variation in enumerate(ls_variations, start=1):
    ser = handle_variation(variation)
    if ser is not None:
        print(f"{i:3}   {variation:7}  {ser.rsid:12}  {ser.ref_genome:10}     {ser.ref_allele:3}          {ser.alt_alleles:7}  {ser.gene_consequence}")
        ser.name = i          # the position in the input list becomes the row label
        rows.append(ser)
    else:
        # no rsid was found for this variation
        print(f"{i:3}     —")

    # pause between variations so the server is not flooded with requests
    sleep(3)

df = pd.DataFrame(rows)
df

  #   variation rsid         ref_genome ref_allele   alt_alleles  gene_consequence
  1     —
  2   331895   rs186838828   GRCh38.p13     T            A        SCN1A-AS1 : Intron Variant; SCN9A : 3 Prime UTR Variant
Alleles: delT
  3   471143   rs1553473041  GRCh38.p13     -            -        SCN9A : Frameshift Variant; SCN1A-AS1 : Intron Variant
  4   892607   rs372210358   GRCh38.p13     C            G        SCN9A : Synonymous Variant; SCN1A-AS1 : Intron Variant


Unnamed: 0,rsid,organism,position,ref_genome,ref_allele,alt_alleles,variation_type,clinical_significance,gene_consequence,publications,url_rsid,variation,url_variation
2,rs186838828,Homo sapiens,2:166195246,GRCh38.p13,T,A,SNV,Reported in ClinVar,SCN1A-AS1 : Intron Variant; SCN9A : 3 Prime UTR Variant,0 citations,https://www.ncbi.nlm.nih.gov/snp/rs186838828,331895,https://www.ncbi.nlm.nih.gov/clinvar/variation/331895/
3,rs1553473041,Homo sapiens,2:166199288,GRCh38.p13,-,-,Deletion,Reported in ClinVar,SCN9A : Frameshift Variant; SCN1A-AS1 : Intron Variant,0 citations,https://www.ncbi.nlm.nih.gov/snp/rs1553473041,471143,https://www.ncbi.nlm.nih.gov/clinvar/variation/471143/
4,rs372210358,Homo sapiens,2:166199392,GRCh38.p13,C,G,SNV,Reported in ClinVar,SCN9A : Synonymous Variant; SCN1A-AS1 : Intron Variant,0 citations,https://www.ncbi.nlm.nih.gov/snp/rs372210358,892607,https://www.ncbi.nlm.nih.gov/clinvar/variation/892607/


In [8]:
# Save the table. The index column gets the header '#' through `index_label`,
# so `df` itself is left unchanged and the cell can be re-run safely.
output_path = Path("output/info from variants.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)   # to_csv does not create missing folders
df.to_csv(output_path, index_label='#')

<br />

In [9]:
# play beep to denote completion of the program
SAMPLE_RATE = 10000     # samples per second
BEEP_FREQUENCY = 400    # Hz
BEEP_SECONDS = 1        # duration of the sound

# manually generated sound: a pure sine wave
beep = np.sin(2 * np.pi * BEEP_FREQUENCY * np.arange(SAMPLE_RATE * BEEP_SECONDS) / SAMPLE_RATE)

# The Audio object must be displayed explicitly: only the last expression of a
# cell is rendered automatically, and here the cell ends with the print below.
ipd.display(ipd.Audio(beep, rate=SAMPLE_RATE, autoplay=True))

# time stamp of the moment the notebook finished
print(datetime.now().strftime("%B %d, %H:%M:%S"))

October 16, 14:23:14
