In [3]:
#1
import requests

# Ask the EBI Proteins API for every annotated feature of accession P04637.
response = requests.get(
    "https://www.ebi.ac.uk/proteins/api/features",
    headers={"Accept": "application/json"},
    params={"accession": "P04637"},
)

if not response.ok:
    print("Ошибка запроса:", response.status_code)
else:
    data = response.json()
    # The reply is a list; the features of the requested entry are read from
    # its first element.
    features = data[0]['features']

    # Reduce the feature records to their distinct "type" values, sorted.
    unique_types = sorted({feature['type'] for feature in features})
    print("Уникальные типы  features:")
    for feature_type in unique_types:
        print("-", feature_type)



Уникальные типы  features:
- BINDING
- CHAIN
- COMPBIAS
- CROSSLNK
- DNA_BIND
- HELIX
- MOD_RES
- MOTIF
- MUTAGEN
- REGION
- SITE
- STRAND
- TURN
- VARIANT
- VAR_SEQ


In [4]:
#2
import requests

def get_protein_sequence(accession):
    """Fetch the amino-acid sequence string of a protein from the EBI Proteins API.

    Args:
        accession: accession inserted into the ``/proteins/{accession}`` URL.

    Returns:
        The nested ``sequence.sequence`` string of the JSON reply (an empty
        string when that field is absent), or None after printing an error
        message when the HTTP response is not OK.
    """
    reply = requests.get(
        f"https://www.ebi.ac.uk/proteins/api/proteins/{accession}",
        headers={"Accept": "application/json"},
    )

    # Guard clause: report the failure and bail out early.
    if not reply.ok:
        print(f"Ошибка при запросе {accession}: {reply.status_code}")
        return None

    payload = reply.json()
    sequence_block = payload.get("sequence", {})
    return sequence_block.get("sequence", "")

# Fetch both sequences (APP = P05067, Tau = P10636).
seq_APP = get_protein_sequence("P05067")
seq_TAU = get_protein_sequence("P10636")

# Compare only when both lookups produced a non-empty sequence.
if seq_APP and seq_TAU:
    len_APP, len_TAU = len(seq_APP), len(seq_TAU)

    print(f"Длина APP (P05067): {len_APP} аминокислот")
    print(f"Длина Tau (P10636): {len_TAU} аминокислот")

    # The sign of the difference decides which protein is reported as longer.
    length_gap = len_APP - len_TAU
    if length_gap > 0:
        print(f"APP длиннее Tau на {length_gap} аминокислот.")
    elif length_gap < 0:
        print(f"Tau длиннее APP на {-length_gap} аминокислот.")
    else:
        print("Оба белка имеют одинаковую длину.")


Длина APP (P05067): 770 аминокислот
Длина Tau (P10636): 758 аминокислот
APP длиннее Tau на 12 аминокислот.


In [23]:
#3
import requests
import pandas as pd

def get_protein_info(accession):
    """Fetch a protein's recommended name and sequence length from the EBI Proteins API.

    Args:
        accession: accession inserted into the ``/proteins/{accession}`` URL.

    Returns:
        A dict with the keys ``accession``, ``protein_name`` and
        ``sequence_length``. ``protein_name`` falls back to ``"N/A"`` when the
        ``protein.recommendedName.fullName.value`` path has no value, and
        ``sequence_length`` is the length of ``sequence.sequence`` (0 when
        absent). When the HTTP response is not OK an error is printed and
        both fields are None.
    """
    reply = requests.get(
        f"https://www.ebi.ac.uk/proteins/api/proteins/{accession}",
        headers={"Accept": "application/json"},
    )

    if not reply.ok:
        print(f"Ошибка при запросе {accession}: {reply.status_code}")
        return {"accession": accession, "protein_name": None, "sequence_length": None}

    payload = reply.json()

    # Recommended full name of the protein.
    recommended = payload.get("protein", {}).get("recommendedName", {})
    full_name = recommended.get("fullName", {}).get("value", "N/A")

    # Residue string; only its length is reported.
    residues = payload.get("sequence", {}).get("sequence", "")

    return dict(
        accession=accession,
        protein_name=full_name,
        sequence_length=len(residues),
    )


# Five proteins of choice: p53, APP, Tau, hemoglobin subunit beta, BRCA1.
accessions = ["P04637", "P05067", "P10636", "P68871", "P38398"]

# Look up every accession in order and tabulate the collected records.
proteins_data = list(map(get_protein_info, accessions))
df = pd.DataFrame(proteins_data)
print(df)




  accession                                 protein_name  sequence_length
0    P04637                   Cellular tumor antigen p53              393
1    P05067               Amyloid-beta precursor protein              770
2    P10636           Microtubule-associated protein tau              758
3    P68871                      Hemoglobin subunit beta              147
4    P38398  Breast cancer type 1 susceptibility protein             1863


In [10]:
#4
import requests

# Step 1: ask PubMed (NCBI E-utilities ESearch) for the IDs of matching articles.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    'db': 'pubmed',
    'term': 'COVID-19 variants',
    'retmode': 'json',
    'retmax': 10,
    # "pub_date" is the documented PubMed key for a descending sort by
    # publication date. The previous value 'pub+date' only works when typed
    # into a raw URL ('+' meaning a space); requests percent-encodes the '+'
    # as %2B, so the server received an unrecognised sort key.
    'sort': 'pub_date'
}

response = requests.get(base_url, params=params)
response.raise_for_status()  # fail with a clear HTTP error instead of a JSON decode error
data = response.json()

id_list = data['esearchresult']['idlist']
print("Найденные PubMed ID:", id_list)

# Step 2: resolve the IDs to article titles with ESummary.
summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

if not id_list:
    # With no IDs there is nothing to summarise; calling ESummary with an
    # empty id would not yield the 'result' object read below.
    print("Статьи не найдены.")
else:
    params_summary = {
        'db': 'pubmed',
        'id': ','.join(id_list),
        'retmode': 'json'
    }

    response_summary = requests.get(summary_url, params=params_summary)
    response_summary.raise_for_status()
    data_summary = response_summary.json()

    print("\n10 самых последних статей по запросу 'COVID-19 variants':\n")
    # 'uids' lists the record keys; each record is stored under its own uid.
    for uid in data_summary['result']['uids']:
        record = data_summary['result'][uid]
        title = record.get('title', 'Нет заголовка')
        print(f"{uid} — {title}")


Найденные PubMed ID: ['41194062', '41193798', '41193785', '41191496', '41191293', '41190654', '41188882', '41188857', '41188743', '41188711']

10 самых последних статей по запросу 'COVID-19 variants':

41194062 — Dutch participatory surveillance framework for evaluating evolutionary changes on SARS-CoV-2 affecting rapid diagnostic test sensitivity in 2022 - 2023.
41193798 — Assessing phylogenetic confidence at pandemic scales.
41193785 — Organoids in respiratory virus research: advances and perspectives.
41191496 — Fuel-Free Rolosense: Viral Sensing Using Diffusional Particle Tracking.
41191293 — "Just be honest with us": A qualitative analysis of Canadians' public trust during the COVID-19 pandemic.
41190654 — Tradeoffs in viral fitness driven by alternative entry pathways.
41188882 — Development of broadly neutralizing antibodies against Omicron variants from existing neutralizing antibodies in clinical trials.
41188857 — Differential immune profiles in elderly patients with non-seve

In [22]:
#5
import requests

def get_protein_name(accession):
    """Return the recommended full name of a protein from the EBI Proteins API.

    Args:
        accession: accession inserted into the ``/proteins/{accession}`` URL.

    Returns:
        The ``protein.recommendedName.fullName.value`` string on HTTP 200, or
        ``"Название не найдено"`` when that value is missing or empty.
        Returns None after printing a message for HTTP 404 and for any other
        non-200 status. (In the recorded run of this notebook the malformed
        accession "XYZ123" was answered with 400, not 404, so it took the
        generic error branch.)
    """
    response = requests.get(
        f"https://www.ebi.ac.uk/proteins/api/proteins/{accession}",
        headers={"Accept": "application/json"},
    )
    status = response.status_code

    # Failure paths first: a dedicated message for 404, a generic one otherwise.
    if status == 404:
        print(f"Белок с accession {accession} не найден (404).")
        return None
    if status != 200:
        print(f"Ошибка при запросе {accession}: код {status}")
        return None

    payload = response.json()
    recommended = payload.get("protein", {}).get("recommendedName", {})
    full_name = recommended.get("fullName", {}).get("value")
    return full_name if full_name else "Название не найдено"

# Two valid accessions (p53, APP) followed by a non-existent ID.
for demo_accession in ("P04637", "P05067", "XYZ123"):
    print(get_protein_name(demo_accession))


Cellular tumor antigen p53
Amyloid-beta precursor protein
Ошибка при запросе XYZ123: код 400
None


In [21]:
#6
import requests
import os

# Seed the input file with one accession per line:
# p53, APP, Tau, hemoglobin beta, BRCA1.
seed_accessions = ("P04637", "P05067", "P10636", "P68871", "P38398")
with open("accessions.txt", "w") as f:
    f.writelines(f"{seed}\n" for seed in seed_accessions)
    
def get_protein_info(accession):
    """Fetch a protein's recommended name and taxonomy ID from the EBI Proteins API.

    Args:
        accession: accession inserted into the ``/proteins/{accession}`` URL.

    Returns:
        A ``(protein_name, taxid)`` tuple. On HTTP 200, ``protein_name`` is
        ``protein.recommendedName.fullName.value`` and ``taxid`` is
        ``organism.taxonomy``; each falls back to ``"N/A"`` when missing.
        On 404 or any other status a message is printed and
        ``(None, None)`` is returned.
    """
    url = f"https://www.ebi.ac.uk/proteins/api/proteins/{accession}"
    headers = {"Accept": "application/json"}
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()

        # Recommended full name of the protein.
        protein_name = (
            data.get("protein", {})
                .get("recommendedName", {})
                .get("fullName", {})
                .get("value", "N/A")
        )

        # Taxonomy ID. The entry stores it under organism -> taxonomy; the
        # earlier lookup of a "texid" key never matched, so every protein
        # was reported with the "N/A" placeholder.
        taxid = data.get("organism", {}).get("taxonomy", "N/A")

        return protein_name, taxid

    elif response.status_code == 404:
        print(f"Белок {accession} не найден.")
        return None, None
    else:
        print(f"Ошибка {response.status_code} при запросе {accession}.")
        return None, None


# Read the accession list back, dropping blank lines and surrounding whitespace.
with open("accessions.txt", "r") as f:
    accessions = [stripped for stripped in (raw.strip() for raw in f) if stripped]

# Keep a tab-separated row only for accessions where both values are truthy.
results = []
for acc in accessions:
    name, taxid = get_protein_info(acc)
    if not (name and taxid):
        continue
    results.append("\t".join([acc, str(name), str(taxid)]))

# Header row first, then one line per collected protein.
with open("proteins_info.txt", "w") as f:
    f.write("Accession\tProteinName\tTaxID\n")
    f.writelines(row + "\n" for row in results)


In [25]:
#7
import requests
import xml.etree.ElementTree as ET

# 1️⃣ Поиск последних статей по запросу "COVID-19 variants"
# Step 1: search PubMed (ESearch) for articles matching "COVID-19 variants".
search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
search_params = {
    "db": "pubmed",
    "term": "COVID-19 variants",
    "retmode": "json",
    "retmax": 10,
    # Documented PubMed key for "newest publication date first". The former
    # "pub+date" was percent-encoded by requests ('+' -> %2B) and therefore
    # did not reach the server as a recognised sort key.
    "sort": "pub_date"
}

search_response = requests.get(search_url, params=search_params)
search_response.raise_for_status()  # surface HTTP errors before decoding JSON
data = search_response.json()
id_list = data["esearchresult"]["idlist"]

# Step 2: fetch each record as XML (EFetch) and list its authors.
fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

for pmid in id_list:
    fetch_params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml"
    }
    fetch_response = requests.get(fetch_url, params=fetch_params)
    fetch_response.raise_for_status()

    # Parse the raw bytes so the XML declaration, not requests' charset guess,
    # decides how non-ASCII author names are decoded.
    root = ET.fromstring(fetch_response.content)
    authors = []
    for author in root.findall(".//Author"):
        last = author.findtext("LastName")
        fore = author.findtext("ForeName")
        # Keep authors that have only one name part, and fall back to the
        # collective (group) name, instead of silently dropping them.
        full_name = " ".join(part for part in (fore, last) if part)
        if not full_name:
            full_name = author.findtext("CollectiveName")
        if full_name:
            authors.append(full_name)

    print(f"\nPMID {pmid}:")
    if authors:
        for a in authors:
            print(" -", a)
    else:
        print(" (нет данных об авторах)")



PMID 41196059:
 - John M Powers
 - Sarah R Leist
 - Naveenchandra Suryadevara
 - Seth J Zost
 - Elad Binshtein
 - Anfal Abdelgadir
 - Michael L Mallory
 - Caitlin E Edwards
 - Kendra L Gully
 - Miranda L Hubbard
 - Mark R Zweigart
 - Alexis B Bailey
 - Timothy P Sheahan
 - James E Crowe
 - Stephanie A Montgomery
 - Jack R Harkema
 - Ralph S Baric

PMID 41194062:
 - Eva Kozanli
 - Wanda Han
 - Tara Smit
 - Jordy de Bakker
 - Mansoer Elahi
 - Ryanne Jaarsma
 - Gesa Carstens
 - Albert Jan van Hoek
 - Dirk Eggink

PMID 41193798:
 - Nicola De Maio
 - Nhan Ly-Trong
 - Samuel Martin
 - Bui Quang Minh
 - Nick Goldman

PMID 41193785:
 - Xingling Li
 - Haiqing Xiao
 - Ming Zhou
 - Chuanlai Yang
 - Xinyi Yang
 - Tong Cheng
 - Lunzhi Yuan
 - Ningshao Xia

PMID 41191496:
 - Selma Piranej
 - Krista Jackson
 - Luona Zhang
 - Jacob Kæstel-Hansen
 - Frank Sommerhage
 - David DeRoo
 - Nikos S Hatzakis
 - Khalid Salaita

PMID 41191293:
 - Nazeem Muhajarine
 - Cory Neudorf
 - Fionnuala Braun
 - Khatira M

In [None]:
#8 bash
# Download the P04637 entry from the EBI Proteins API and keep only the
# output lines that mention "phosphorylation" (case-insensitive).
# NOTE(review): if the API answers with single-line JSON, a match prints the
# whole document - confirm this is the intended result.
curl --silent "https://www.ebi.ac.uk/proteins/api/proteins/P04637" \
  | grep --ignore-case "phosphorylation"

In [None]:
#9 bash
# Download the Q8WZ42 entry, isolate its "sequence":"..." string field,
# drop the key and the quotes, and wrap the residues at 80 columns.
entry_url="https://www.ebi.ac.uk/proteins/api/proteins/Q8WZ42"
curl --silent "$entry_url" \
  | grep -o '"sequence":"[^"]*"' \
  | cut -d ':' -f 2 \
  | tr -d '"' \
  | fold -w 80

In [30]:
#11
import requests
import pandas as pd

# Gene whose human UniProtKB entries are collected.
gene = "APOE"

# UniProt search: exact gene-name match restricted to organism_id 9606.
url = "https://rest.uniprot.org/uniprotkb/search"
params = dict(
    query=f"gene_exact:{gene} AND organism_id:9606",
    fields="accession,id,protein_name,organism_name",
    format="json",
    size=100,
)

response = requests.get(url, params=params)
if not response.ok:
    raise SystemExit(f"Ошибка запроса: {response.status_code} — {response.text}")

data = response.json()

# Flatten every search hit into one table row.
proteins = []
for record in data.get("results", []):
    acc = record.get("primaryAccession", "")
    recommended = record.get("proteinDescription", {}).get("recommendedName", {})
    proteins.append(dict(
        Gene=gene,
        Accession=acc,
        UniProt_ID=record.get("uniProtkbId", ""),
        Protein_Name=recommended.get("fullName", {}).get("value", "N/A"),
        Organism=record.get("organism", {}).get("scientificName", ""),
        Link=f"https://www.uniprot.org/uniprotkb/{acc}",
    ))

# Save the full table as CSV and preview the first rows.
df = pd.DataFrame(proteins)
df.to_csv(f"{gene}_proteins.csv", index=False, encoding="utf-8")
print(df.head())


   Gene   Accession        UniProt_ID      Protein_Name      Organism  \
0  APOE      P02649        APOE_HUMAN  Apolipoprotein E  Homo sapiens   
1  APOE  A0A0S2Z3D5  A0A0S2Z3D5_HUMAN  Apolipoprotein E  Homo sapiens   
2  APOE      E7ERP7      E7ERP7_HUMAN  Apolipoprotein E  Homo sapiens   
3  APOE      H0Y7L5      H0Y7L5_HUMAN  Apolipoprotein E  Homo sapiens   
4  APOE      J9ZVQ3      J9ZVQ3_HUMAN  Apolipoprotein E  Homo sapiens   

                                           Link  
0      https://www.uniprot.org/uniprotkb/P02649  
1  https://www.uniprot.org/uniprotkb/A0A0S2Z3D5  
2      https://www.uniprot.org/uniprotkb/E7ERP7  
3      https://www.uniprot.org/uniprotkb/H0Y7L5  
4      https://www.uniprot.org/uniprotkb/J9ZVQ3  


In [34]:
#12
import requests
import pandas as pd

# Accessions to look up in a single search request (the list could hold up to 100).
accessions = ["P04637", "P05067", "P10636", "P68871", "P38398"]
# Combine them into one query: accession:A OR accession:B OR ...
query = " OR ".join(map("accession:{}".format, accessions))
url = "https://rest.uniprot.org/uniprotkb/search"
params = dict(
    query=query,
    fields="accession,id,protein_name,length,organism_name",
    format="json",
    size=100,
)

response = requests.get(url, params=params)
if not response.ok:
    raise SystemExit(f"Ошибка запроса: {response.status_code} — {response.text}")

data = response.json()

# Flatten every hit into one table row; rows follow the order of the reply.
proteins = []
for record in data.get("results", []):
    acc = record.get("primaryAccession", "")
    recommended = record.get("proteinDescription", {}).get("recommendedName", {})
    proteins.append(dict(
        Accession=acc,
        UniProt_ID=record.get("uniProtkbId", ""),
        Protein_Name=recommended.get("fullName", {}).get("value", "N/A"),
        Organism=record.get("organism", {}).get("scientificName", ""),
        Length=record.get("sequence", {}).get("length", ""),
        Link=f"https://www.uniprot.org/uniprotkb/{acc}",
    ))

df = pd.DataFrame(proteins)
print(df)



  Accession   UniProt_ID                                 Protein_Name  \
0    P68871    HBB_HUMAN                      Hemoglobin subunit beta   
1    P38398  BRCA1_HUMAN  Breast cancer type 1 susceptibility protein   
2    P10636    TAU_HUMAN           Microtubule-associated protein tau   
3    P04637    P53_HUMAN                   Cellular tumor antigen p53   
4    P05067     A4_HUMAN               Amyloid-beta precursor protein   

       Organism  Length                                      Link  
0  Homo sapiens     147  https://www.uniprot.org/uniprotkb/P68871  
1  Homo sapiens    1863  https://www.uniprot.org/uniprotkb/P38398  
2  Homo sapiens     758  https://www.uniprot.org/uniprotkb/P10636  
3  Homo sapiens     393  https://www.uniprot.org/uniprotkb/P04637  
4  Homo sapiens     770  https://www.uniprot.org/uniprotkb/P05067  


In [36]:
#15
import requests

def get_first_protein_for_gene(gene_name):
    """Report the first human UniProtKB hit for a gene and hand the gene name back.

    Sends one UniProt search (``gene_exact:<gene_name> AND organism_id:9606``,
    limited to a single result) and prints the hit's UniProt ID, accession
    and recommended name.

    Args:
        gene_name: gene symbol inserted into the search query.

    Returns:
        ``gene_name`` itself when a hit was found, so the caller can use it as
        a PubMed search term; the accession is only printed, not returned.
        None, after printing a message, when the request fails or nothing
        matches.
    """
    reply = requests.get(
        "https://rest.uniprot.org/uniprotkb/search",
        params={
            "query": f"gene_exact:{gene_name} AND organism_id:9606",
            "fields": "accession,id,protein_name",
            "format": "json",
            "size": 1,
        },
    )
    if not reply.ok:
        print(f"Ошибка UniProt API: {reply.status_code}")
        return None

    hits = reply.json().get("results", [])
    if not hits:
        print(f"Белков для гена {gene_name} не найдено.")
        return None

    top_hit = hits[0]
    recommended = top_hit.get("proteinDescription", {}).get("recommendedName", {})
    report_lines = (
        f" Найден белок для гена {gene_name}:",
        f"  UniProt ID: {top_hit.get('uniProtkbId')}",
        f"  Accession:  {top_hit.get('primaryAccession')}",
        f"  Название:   {recommended.get('fullName', {}).get('value', 'N/A')}\n",
    )
    for report_line in report_lines:
        print(report_line)

    # Articles are searched by gene name, so that is what gets returned.
    return gene_name


def search_pubmed_articles(query, retmax=10):
    """Search PubMed via NCBI ESearch and return the matching article IDs.

    Args:
        query: search term sent as the ESearch ``term`` parameter.
        retmax: maximum number of IDs to request (defaults to 10, the value
            that used to be hard-coded).

    Returns:
        The list of PubMed ID strings from ``esearchresult.idlist``, newest
        publication date first. An empty list is returned when the field is
        missing, or after printing a message when the HTTP response is not OK.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": retmax,
        # "pub_date" is the documented PubMed sort key. The former "pub+date"
        # is only valid inside a hand-written URL ('+' meaning a space);
        # requests percent-encodes the '+' as %2B, so it was not recognised.
        "sort": "pub_date"
    }
    response = requests.get(url, params=params)
    if not response.ok:
        print(f"Ошибка PubMed API: {response.status_code}")
        return []

    data = response.json()
    id_list = data.get("esearchresult", {}).get("idlist", [])
    return id_list


# Interactive driver: read a gene name, confirm it has a human UniProt hit,
# then list PubMed article IDs for it.
gene = input("Введите название гена: ").strip()

gene_query = get_first_protein_for_gene(gene)
if gene_query:
    article_ids = search_pubmed_articles(gene_query)
    if not article_ids:
        print("Статьи не найдены.")
    else:
        found_count = len(article_ids)
        print(f" Найдено {found_count} статей по гену {gene_query}:")
        print(", ".join(article_ids))


Введите название гена:  TP53


 Найден белок для гена TP53:
  UniProt ID: S4R334_HUMAN
  Accession:  S4R334
  Название:   Cellular tumor antigen p53

 Найдено 10 статей по гену TP53:
41196374, 41196329, 41195731, 41195522, 41194226, 41194193, 41194031, 41193733, 41193463, 41193453


In [None]:
# 16 bash
# Save the P04637 entry to protein.json, then print its "accession" field
# as a raw (unquoted) string.
curl --silent --output protein.json "https://www.ebi.ac.uk/proteins/api/proteins/P04637"
jq --raw-output '.accession' protein.json

In [37]:
#17
# Запрос 1 - функции (features) белка BRCA1
import requests

# Request 1 - annotated feature types of BRCA1 (accession P38398), EBI Proteins API.
url = "https://www.ebi.ac.uk/proteins/api/features"
params = {"accession": "P38398"}
headers = {"Accept": "application/json"}

response = requests.get(url, params=params, headers=headers)
response.raise_for_status()  # stop on an HTTP error instead of failing inside .json()
data = response.json()

# The features of the requested entry are read from the first list element.
unique_features = sorted(set(f["type"] for f in data[0]["features"]))
print("Типы аннотированных участков BRCA1:")
for ftype in unique_features:
    print("-", ftype)

# Request 2 - protein name and sequence length, UniProt REST API.
url = "https://rest.uniprot.org/uniprotkb/P38398"
headers = {"Accept": "application/json"}

response = requests.get(url, headers=headers)
response.raise_for_status()
data = response.json()

name = (
    data.get("proteinDescription", {})
    .get("recommendedName", {})
    .get("fullName", {})
    .get("value", "N/A")
)
length = data.get("sequence", {}).get("length", "N/A")

print(f"Белок: {name}\nДлина последовательности: {length} аминокислот")

# Request 3 - PubMed articles that mention BRCA1, NCBI ESearch.
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "pubmed",
    "term": "BRCA1 AND human",
    "retmode": "json",
    "retmax": 5,
    # Documented PubMed sort key for "newest publication date first". The
    # former "pub+date" was percent-encoded by requests ('+' -> %2B) and so
    # was not a recognised sort value on the server side.
    "sort": "pub_date"
}

response = requests.get(url, params=params)
response.raise_for_status()
articles = response.json()["esearchresult"]["idlist"]
print("Последние статьи о BRCA1:", articles)



Типы аннотированных участков BRCA1:
- CHAIN
- COMPBIAS
- CONFLICT
- CROSSLNK
- DOMAIN
- HELIX
- MOD_RES
- MUTAGEN
- REGION
- STRAND
- TURN
- VARIANT
- VAR_SEQ
- ZN_FING
Белок: Breast cancer type 1 susceptibility protein
Длина последовательности: 1863 аминокислот
Последние статьи о BRCA1: ['41194282', '41194193', '41192265', '41188832', '41186660']
