# Script d’extraction — CSV + API NCBI

#### A) Télécharger SEER depuis Zenodo

In [3]:
import requests
import zipfile
import io
import os
import pandas as pd

url = "https://zenodo.org/api/records/5120960/files-archive"

download_folder = r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Interim"
os.makedirs(download_folder, exist_ok=True)

# télécharger le zip
r = requests.get(url, timeout=60)
r.raise_for_status()

# lire le zip en mémoire
z = zipfile.ZipFile(io.BytesIO(r.content))

print("Fichiers dans l'archive :")
print(z.namelist())

# extraire tout
z.extractall(download_folder)

Fichiers dans l'archive :
['SEER Breast Cancer Dataset.docx', 'SEER Breast Cancer Dataset .csv']


In [5]:
csv_path = r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Interim\SEER Breast Cancer Dataset .csv"

df = pd.read_csv(csv_path)
print(df.shape)
print(df.head())

(4024, 16)
   Age                                              Race   \
0   43  Other (American Indian/AK Native, Asian/Pacifi...   
1   47  Other (American Indian/AK Native, Asian/Pacifi...   
2   67                                              White   
3   46                                              White   
4   63                                              White   

                   Marital Status  Unnamed: 3 T Stage  N Stage 6th Stage  \
0  Married (including common law)         NaN       T2      N3      IIIC   
1  Married (including common law)         NaN       T2      N2      IIIA   
2  Married (including common law)         NaN       T2      N1       IIB   
3                        Divorced         NaN       T1      N1       IIA   
4  Married (including common law)         NaN       T2      N2      IIIA   

                                 Grade   A Stage  Tumor Size Estrogen Status  \
0  Moderately differentiated; Grade II  Regional          40        Positive   
1  Mo

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     4024 non-null   int64  
 1   Race                    4024 non-null   object 
 2   Marital Status          4024 non-null   object 
 3   Unnamed: 3              0 non-null      float64
 4   T Stage                 4024 non-null   object 
 5   N Stage                 4024 non-null   object 
 6   6th Stage               4024 non-null   object 
 7   Grade                   4024 non-null   object 
 8   A Stage                 4024 non-null   object 
 9   Tumor Size              4024 non-null   int64  
 10  Estrogen Status         4024 non-null   object 
 11  Progesterone Status     4024 non-null   object 
 12  Regional Node Examined  4024 non-null   int64  
 13  Reginol Node Positive   4024 non-null   int64  
 14  Survival Months         4024 non-null   

#### B) Enrichir les biomarqueurs via NCBI E-utilities (Gene)

In [9]:
import time
import requests
import pandas as pd
import os
from typing import Optional

In [10]:
BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
EMAIL = "leilamourid001@gmail.com"
API_KEY = None


def ncbi_esearch_gene_id(symbol: str) -> Optional[str]:
    params = {
        "db": "gene",
        "term": f"{symbol}[Sym] AND Homo sapiens[Organism]",
        "retmode": "json",
        "email": EMAIL,
    }
    if API_KEY:
        params["api_key"] = API_KEY

    r = requests.get(BASE + "esearch.fcgi", params=params, timeout=30)
    r.raise_for_status()
    ids = r.json().get("esearchresult", {}).get("idlist", [])
    return ids[0] if ids else None


def ncbi_esummary_gene(gene_id: str, fallback_symbol: str) -> dict:
    params = {"db": "gene", "id": gene_id, "retmode": "json", "email": EMAIL}
    if API_KEY:
        params["api_key"] = API_KEY

    r = requests.get(BASE + "esummary.fcgi", params=params, timeout=30)
    r.raise_for_status()

    result = r.json().get("result", {})
    data = result.get(gene_id, {})

    symbol = data.get("nomenclaturesymbol") or data.get("name") or fallback_symbol

    return {
        "biomarker_symbol": fallback_symbol,
        "ncbi_gene_id": gene_id,
        "official_name": data.get("name"),
        "description": data.get("description"),
        "ncbi_symbol": symbol,
    }


biomarkers = ["TP53", "BRCA1", "BRCA2", "EGFR", "KRAS", "BRAF", "PIK3CA", "PTEN", "ALK", "ERBB2"]

rows = []
for sym in biomarkers:
    try:
        gid = ncbi_esearch_gene_id(sym)
        if gid:
            rows.append(ncbi_esummary_gene(gid, fallback_symbol=sym))
        else:
            rows.append({
                "biomarker_symbol": sym,
                "ncbi_gene_id": None,
                "official_name": None,
                "description": None,
                "ncbi_symbol": None,
            })

        time.sleep(0.35)

    except requests.RequestException as e:
        rows.append({
            "biomarker_symbol": sym,
            "ncbi_gene_id": None,
            "official_name": None,
            "description": f"ERROR: {e}",
            "ncbi_symbol": None,
        })


ref = pd.DataFrame(rows)

out_path = r"C:\Projet_filrouge\oncobio_decision_analytics\Data\Processed\biomarker_reference.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)

ref.to_csv(out_path, index=False)
print("Saved:", out_path)

ref.head()


Saved: C:\Projet_filrouge\oncobio_decision_analytics\Data\Processed\biomarker_reference.csv


Unnamed: 0,biomarker_symbol,ncbi_gene_id,official_name,description,ncbi_symbol
0,TP53,7157,TP53,tumor protein p53,TP53
1,BRCA1,672,BRCA1,BRCA1 DNA repair associated,BRCA1
2,BRCA2,675,BRCA2,BRCA2 DNA repair associated,BRCA2
3,EGFR,1956,EGFR,epidermal growth factor receptor,EGFR
4,KRAS,3845,KRAS,"KRAS proto-oncogene, GTPase",KRAS


In [11]:
ref.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   biomarker_symbol  10 non-null     object
 1   ncbi_gene_id      10 non-null     object
 2   official_name     10 non-null     object
 3   description       10 non-null     object
 4   ncbi_symbol       10 non-null     object
dtypes: object(5)
memory usage: 528.0+ bytes


In [12]:
ref.shape

(10, 5)