In [1]:
import requests
import pandas as pd
import json


In [14]:

pd.set_option("display.max_columns", None)
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/jiwon/Desktop/research/AI-coscientist/code


In [7]:
def fetch_civic_v2_final_fixed():
    url = "https://civicdb.org/api/graphql"
    
    # ... on Variant 구문을 사용하여 인터페이스 내 gene 필드에 접근합니다.
    query = """
    {
      evidenceItems(diseaseId: 146, evidenceType: PREDICTIVE, status: ACCEPTED) {
        nodes {
          id
          evidenceLevel
          significance
          molecularProfile {
            name
            variants {
              ... on Variant {
                name
                gene {
                  name
                }
              }
            }
          }
          therapies {
            name
          }
        }
      }
    }
    """
    
    response = requests.post(url, json={'query': query})
    res_json = response.json()

    if 'errors' in res_json:
        print("Detailed Errors:", json.dumps(res_json['errors'], indent=2))
        return None

    nodes = res_json['data']['evidenceItems']['nodes']
    
    extracted = []
    for n in nodes:
        mp = n['molecularProfile']
        # variants 리스트에서 유전자 정보 추출
        gene_name = "N/A"
        variant_name = "N/A"
        
        if mp['variants']:
            v = mp['variants'][0]
            variant_name = v.get('name', 'N/A')
            if 'gene' in v and v['gene']:
                gene_name = v['gene'].get('name', 'N/A')
        
        extracted.append({
            'Gene': gene_name,
            'Variant': variant_name,
            'Molecular_Profile': mp['name'],
            'Drug_Therapy': ", ".join([t['name'] for t in n['therapies']]),
            'Significance': n['significance'],
            'Level': n['evidenceLevel']
        })
    
    df = pd.DataFrame(extracted)
    print(f">>> Successfully fetched {len(df)} records for BRCA.")
    return df

# 실행
df_civic = fetch_civic_v2_final_fixed()
if df_civic is not None:
    print(df_civic.head())

Detailed Errors: [
  {
    "message": "Field 'gene' doesn't exist on type 'Variant'",
    "locations": [
      {
        "line": 13,
        "column": 17
      }
    ],
    "path": [
      "query",
      "evidenceItems",
      "nodes",
      "molecularProfile",
      "variants",
      "... on Variant",
      "gene"
    ],
    "extensions": {
      "code": "undefinedField",
      "typeName": "Variant",
      "fieldName": "gene"
    }
  }
]


In [None]:
# CIViC 정기 데이터 릴리즈 URL (Evidence Items)
url = "https://civicdb.org/downloads/nightly/nightly-ClinicalEvidenceSummaries.tsv"

# 데이터 불러오기
df_all = pd.read_csv(url, sep='\t')
print(df_all.shape)


In [11]:
print(df_all.shape)
df_all.columns

(4765, 25)


Index(['molecular_profile', 'molecular_profile_id', 'disease', 'doid',
       'phenotypes', 'therapies', 'therapy_interaction_type', 'evidence_type',
       'evidence_direction', 'evidence_level', 'significance',
       'evidence_statement', 'citation_id', 'source_type', 'asco_abstract_id',
       'citation', 'nct_ids', 'rating', 'evidence_status', 'evidence_id',
       'variant_origin', 'last_review_date', 'evidence_civic_url',
       'molecular_profile_civic_url', 'is_flagged'],
      dtype='object')

In [15]:
df_all.head()

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,phenotypes,therapies,therapy_interaction_type,evidence_type,evidence_direction,evidence_level,significance,evidence_statement,citation_id,source_type,asco_abstract_id,citation,nct_ids,rating,evidence_status,evidence_id,variant_origin,last_review_date,evidence_civic_url,molecular_profile_civic_url,is_flagged
0,JAK2 V617F,64,Lymphoid Leukemia,1037.0,,,,Diagnostic,Supports,B,Negative,JAK2 V617F is not associated with lymphoid leu...,16081687,PubMed,,"Levine et al., 2005",,4.0,accepted,1,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/1,https://civicdb.org/links/molecular_profiles/64,False
1,PDGFRA D842V,99,Gastrointestinal Stromal Tumor,9253.0,,,,Diagnostic,Supports,B,Negative,GIST tumors harboring PDGFRA D842V mutation ar...,15146165,PubMed,,"Lasota et al., 2004",,3.0,accepted,2,Somatic,2023-01-09 21:46:27 UTC,https://civicdb.org/links/evidence_items/2,https://civicdb.org/links/molecular_profiles/99,False
2,DNMT3A R882,32,Acute Myeloid Leukemia,9119.0,,,,Diagnostic,Supports,B,Positive,Young AML patients (<60 years old) with DNMT3A...,22490330,PubMed,,"Ribeiro et al., 2012",,3.0,accepted,4,Somatic,2023-01-09 21:46:25 UTC,https://civicdb.org/links/evidence_items/4,https://civicdb.org/links/molecular_profiles/32,False
3,JAK2 V617F,64,Chronic Myeloid Leukemia,8552.0,,,,Diagnostic,Supports,B,Positive,JAK2 V617F is associated with myeloid malignan...,16081687,PubMed,,"Levine et al., 2005",,4.0,accepted,5,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/5,https://civicdb.org/links/molecular_profiles/64,False
4,JAK2 V617F,64,Chronic Myeloid Leukemia,8552.0,,,,Diagnostic,Supports,B,Positive,JAK2 V617F is associated with myeloid neoplasm...,16081687,PubMed,,"Levine et al., 2005",,4.0,accepted,6,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/6,https://civicdb.org/links/molecular_profiles/64,False


In [24]:
# df_all.to_csv('../data/civic_Pan_cancer.csv',index=False)



In [23]:
df_all['disease'].value_counts(dropna=False)
# df_all[df_all['disease']=='Colorectal Cancer']

disease
Von Hippel-Lindau Disease         626
Lung Non-small Cell Carcinoma     443
Colorectal Cancer                 349
Chronic Myeloid Leukemia          319
Cancer                            226
                                 ... 
Renal Wilms' Tumor                  1
Peritoneal Carcinoma                1
Malignant Anus Melanoma             1
Gliosarcoma                         1
Pancreatic Endocrine Carcinoma      1
Name: count, Length: 327, dtype: int64

In [17]:
# BRCA(Breast Cancer) 데이터만 필터링
df_civic = df_all[df_all['disease'] == 'Breast Cancer'].copy()
df_civic.to_csv('../data/civic_breast_cancer.csv',index=False)
print(df_civic.shape)
df_civic.head()
# 필요한 컬럼만 선택 (지원님 데이터와 합치기 위함)
# df_civic = df_civic[['gene', 'variant', 'drugs', 'clinical_significance', 'evidence_level']]
# print(f">>> {len(df_civic)} records loaded from TSV.")

(199, 25)


Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,phenotypes,therapies,therapy_interaction_type,evidence_type,evidence_direction,evidence_level,significance,evidence_statement,citation_id,source_type,asco_abstract_id,citation,nct_ids,rating,evidence_status,evidence_id,variant_origin,last_review_date,evidence_civic_url,molecular_profile_civic_url,is_flagged
191,CCND1 Overexpression,20,Breast Cancer,1612.0,,,,Diagnostic,Supports,B,Positive,Cyclin D1 overexpression is associated with th...,15961768,PubMed,,"Arnold et al., 2005",,4.0,accepted,221,,2023-01-09 21:46:25 UTC,https://civicdb.org/links/evidence_items/221,https://civicdb.org/links/molecular_profiles/20,False
199,AKT1 E17K,4,Breast Cancer,1612.0,,Akt Inhibitor MK2206,,Predictive,Does Not Support,D,Sensitivity/Response,Breast cancer cell lines with the AKT1 E17K mu...,23888070,PubMed,,"Beaver et al., 2013",,3.0,accepted,231,Somatic,2023-01-09 21:46:24 UTC,https://civicdb.org/links/evidence_items/231,https://civicdb.org/links/molecular_profiles/4,False
209,ERBB2 L755S,39,Breast Cancer,1612.0,,Lapatinib,,Predictive,Supports,D,Resistance,The L755S mutation was shown to confer resista...,23220880,PubMed,,"Bose et al., 2013",,5.0,accepted,241,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/241,https://civicdb.org/links/molecular_profiles/39,False
210,ESR1 L536Q,46,Breast Cancer,1612.0,,Hormone Therapy,,Predictive,Supports,D,Resistance,MCF7 cell lines harboring the L536Q mutation i...,24185512,PubMed,,"Toy et al., 2013",,3.0,accepted,242,Somatic,2026-01-06 19:23:29 UTC,https://civicdb.org/links/evidence_items/242,https://civicdb.org/links/molecular_profiles/46,False
211,ESR1 D538G,47,Breast Cancer,1612.0,,Hormone Therapy,,Predictive,Supports,D,Resistance,MCF7 cell lines harboring the D538G mutation i...,24185512,PubMed,,"Toy et al., 2013",,3.0,accepted,243,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/243,https://civicdb.org/links/molecular_profiles/47,False


In [12]:
df_civic['molecular_profile'].value_counts(dropna=False)

molecular_profile
FGFR1 Amplification                                                                                                                             17
PIK3CA Mutation                                                                                                                                  7
PIK3CA E542K                                                                                                                                     6
PTEN Loss                                                                                                                                        6
ERBB2 Amplification                                                                                                                              5
                                                                                                                                                ..
PIK3CA E545K OR PIK3CA E545Q OR PIK3CA E545A OR PIK3CA E545G OR PIK3CA E545V OR PIK3CA E545D OR PIK3

In [16]:
df_civic['therapies'].value_counts(dropna=False)


therapies
NaN                         49
Neratinib                   10
Pictilisib                   7
Trastuzumab                  7
Tamoxifen                    6
                            ..
Elacestrant                  1
Fulvestrant,Capivasertib     1
Erdafitinib                  1
Pemigatinib                  1
Futibatinib                  1
Name: count, Length: 78, dtype: int64

In [18]:
# df_tp53 = df_civic[df_civic["molecular_profile"].astype(str).str.contains("TP53", na=False)]
df_civic[df_civic["molecular_profile"].astype(str).str.contains("TP53", na=False)]


Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,phenotypes,therapies,therapy_interaction_type,evidence_type,evidence_direction,evidence_level,significance,evidence_statement,citation_id,source_type,asco_abstract_id,citation,nct_ids,rating,evidence_status,evidence_id,variant_origin,last_review_date,evidence_civic_url,molecular_profile_civic_url,is_flagged
275,TP53 R175H,116,Breast Cancer,1612.0,,Doxorubicin,,Predictive,Supports,D,Sensitivity/Response,Breast tumors in a mouse model with R172H muta...,22698404,PubMed,,"Jackson et al., 2012",,3.0,accepted,319,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/319,https://civicdb.org/links/molecular_profiles/116,False
331,TP53 R175H,116,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,Breast cancer patients who harbor R175H mutati...,16489069,PubMed,,"Olivier et al., 2006",,3.0,accepted,389,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/389,https://civicdb.org/links/molecular_profiles/116,False
332,TP53 R248Q,117,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,Breast cancer patients who harbor a R248Q muta...,16489069,PubMed,,"Olivier et al., 2006",,3.0,accepted,390,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/390,https://civicdb.org/links/molecular_profiles/117,False
333,TP53 R248Q,117,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,In breast cancer patients harboring TP53 mutat...,9569050,PubMed,,"Berns et al., 1998",,3.0,accepted,391,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/391,https://civicdb.org/links/molecular_profiles/117,False
334,TP53 R248W,118,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,In breast cancer patients harboring R248W muta...,16489069,PubMed,,"Olivier et al., 2006",,3.0,accepted,392,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/392,https://civicdb.org/links/molecular_profiles/118,False
335,TP53 Mutation,222,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,In breast cancer patients harboring TP53 mutat...,9569050,PubMed,,"Berns et al., 1998",,3.0,accepted,393,Somatic,2025-12-16 09:25:48 UTC,https://civicdb.org/links/evidence_items/393,https://civicdb.org/links/molecular_profiles/222,False
336,TP53 R273C,121,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,Breast cancer patients who harbor R273C mutati...,16489069,PubMed,,"Olivier et al., 2006",,3.0,accepted,395,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/395,https://civicdb.org/links/molecular_profiles/121,False
337,TP53 R273C,121,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,In breast cancer patients harboring TP53 mutat...,9569050,PubMed,,"Berns et al., 1998",,3.0,accepted,396,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/396,https://civicdb.org/links/molecular_profiles/121,False
338,TP53 R273H,122,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,Breast cancer patients who harbor R273H mutati...,16489069,PubMed,,"Olivier et al., 2006",,3.0,accepted,397,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/397,https://civicdb.org/links/molecular_profiles/122,False
339,TP53 R273H,122,Breast Cancer,1612.0,,,,Prognostic,Supports,B,Poor Outcome,In breast cancer patients harboring TP53 mutat...,9569050,PubMed,,"Berns et al., 1998",,3.0,accepted,398,Somatic,2023-01-09 21:46:28 UTC,https://civicdb.org/links/evidence_items/398,https://civicdb.org/links/molecular_profiles/122,False
