In [2]:
import pandas as pd
df = pd.read_csv("https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz", 
                 compression='gzip', 
                 sep='\t', 
                 low_memory=False)

print("الشكل العام للبيانات:")
df.head()

الشكل العام للبيانات:


Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity,SCVsForAggregateGermlineClassification,SCVsForAggregateSomaticClinicalImpact,SCVsForAggregateOncogenicityClassification
0,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic/Likely pathogenic,1,"Dec 17, 2024",397704705,...,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-,SCV001451119|SCV005622007|SCV005909190,-,-
1,15041,Indel,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,9907,AP5Z1,HGNC:22197,Pathogenic/Likely pathogenic,1,"Dec 17, 2024",397704705,...,TGCTGTAAACTGTAACTGTAAA,-,-,-,-,-,-,SCV001451119|SCV005622007|SCV005909190,-,-
2,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,G,-,-,-,-,-,-,SCV000020156,-,-
3,15042,Deletion,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,G,-,-,-,-,-,-,SCV000020156,-,-
4,15043,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,A,-,-,-,-,-,-,SCV000020157,-,-


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7452145 entries, 0 to 7452144
Data columns (total 43 columns):
 #   Column                                      Dtype 
---  ------                                      ----- 
 0   #AlleleID                                   int64 
 1   Type                                        object
 2   Name                                        object
 3   GeneID                                      int64 
 4   GeneSymbol                                  object
 5   HGNC_ID                                     object
 6   ClinicalSignificance                        object
 7   ClinSigSimple                               int64 
 8   LastEvaluated                               object
 9   RS# (dbSNP)                                 int64 
 10  nsv/esv (dbVar)                             object
 11  RCVaccession                                object
 12  PhenotypeIDS                                object
 13  PhenotypeList                             

In [3]:
# -*- coding: utf-8 -*-
"""
Extract Benign / Likely benign variants from ClinVar's variant_summary.txt.gz
- Keeps ONLY: Oncogenicity, Name, Type, OriginSimple, Chromosome, ReferenceAllele, AlternateAllele
- Chunked processing to save RAM
"""

import re
from datetime import datetime
from typing import Optional

import pandas as pd


URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"


OUT_TSV = f"clinvar_benign_likebenign_{datetime.now():%Y%m%d}.tsv"  


CHUNK = 500_000  


USECOLS = [
    "Name", "Type", "OriginSimple", "Chromosome",
    "ReferenceAllele", "AlternateAllele", "ClinicalSignificance",
]

def map_oncogenicity(clinsig: str) -> Optional[str]:
    """
    يحوّل ClinicalSignificance النصي إلى Oncogenicity مستهدفة:
    - 'Benign' أو 'Likely benign' فقط
    - يستبعد أي سجلات فيها كلمات غير مرغوبة (pathogenic/VUS/conflicting/..)
    """
    if not isinstance(clinsig, str):
        return None

    text = clinsig.strip().lower()
    
    text = re.sub(r'[\/,;|]+', ' ', text)
    text = re.sub(r'[_\s]+', ' ', text).strip()

    
    bad_terms = (
        "pathogenic", "likely pathogenic", "uncertain significance",
        "conflicting", "risk factor", "drug response", "protective",
        "affects", "association", "not provided"
    )
    if any(b in text for b in bad_terms):
        return None

    has_likely = ("likely benign" in text) or ("likely_benign" in text)
    has_benign = re.search(r'(^|[^a-z])benign([^a-z]|$)', text) is not None

    
    if has_benign and has_likely:
        return "Benign"
    if has_likely:
        return "Likely benign"
    if has_benign:
        return "Benign"

    return None

def process():
    total_in = total_out = 0
    first_write = True

    reader = pd.read_csv(
        URL,
        sep="\t",
        compression="gzip",
        usecols=USECOLS,
        chunksize=CHUNK,
        dtype=str,
        low_memory=False
    )

    for chunk in reader:
        total_in += len(chunk)

        
        chunk["ClinicalSignificance"] = chunk["ClinicalSignificance"].fillna("").str.strip()

        
        chunk["Oncogenicity"] = chunk["ClinicalSignificance"].apply(map_oncogenicity)

       
        df = chunk[chunk["Oncogenicity"].notna()].copy()

        
        df_out = df[[
            "Oncogenicity", "Name", "Type", "OriginSimple",
            "Chromosome", "ReferenceAllele", "AlternateAllele"
        ]]

        if len(df_out):
            df_out.to_csv(
                OUT_TSV,
                sep="\t",
                index=False,
                mode="w" if first_write else "a",
                header=first_write
            )
            first_write = False
            total_out += len(df_out)

        print(f"Processed {total_in:,} rows -> kept {total_out:,} benign/LB rows so far...")

    print(f"\nDONE. Wrote {total_out:,} rows to {OUT_TSV}")

if __name__ == "__main__":
    process()


Processed 500,000 rows -> kept 142,346 benign/LB rows so far...
Processed 1,000,000 rows -> kept 275,037 benign/LB rows so far...
Processed 1,500,000 rows -> kept 532,384 benign/LB rows so far...
Processed 2,000,000 rows -> kept 740,324 benign/LB rows so far...
Processed 2,500,000 rows -> kept 980,874 benign/LB rows so far...
Processed 3,000,000 rows -> kept 1,288,142 benign/LB rows so far...
Processed 3,500,000 rows -> kept 1,471,697 benign/LB rows so far...
Processed 4,000,000 rows -> kept 1,652,686 benign/LB rows so far...
Processed 4,500,000 rows -> kept 1,683,299 benign/LB rows so far...
Processed 5,000,000 rows -> kept 1,867,054 benign/LB rows so far...
Processed 5,500,000 rows -> kept 2,206,895 benign/LB rows so far...
Processed 6,000,000 rows -> kept 2,280,911 benign/LB rows so far...
Processed 6,500,000 rows -> kept 2,332,934 benign/LB rows so far...
Processed 7,000,000 rows -> kept 2,505,172 benign/LB rows so far...
Processed 7,448,531 rows -> kept 2,545,075 benign/LB rows so

In [6]:
import pandas as pd
df_kn = pd.read_csv("clinvar_benign_likebenign_20250809.tsv", sep="\t", dtype=str)
df_kn.head()
df_kn["Oncogenicity"].value_counts()


Benign    2545075
Name: Oncogenicity, dtype: int64

In [2]:
import pandas as pd
df_kn = pd.read_csv("clinvar_benign_likebenign_20250809.tsv", sep="\t", dtype=str)
df_kn

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele
0,Benign,NM_000410.4(HFE):c.892+48G>A,single nucleotide variant,germline,6,na,na
1,Benign,NM_000410.4(HFE):c.892+48G>A,single nucleotide variant,germline,6,na,na
2,Benign,NC_000007.14:g.128775556C>T,single nucleotide variant,germline,7,na,na
3,Benign,NC_000007.14:g.128775556C>T,single nucleotide variant,germline,7,na,na
4,Benign,NM_000355.4(TCN2):c.776G>C (p.Arg259Pro),single nucleotide variant,germline,22,na,na
...,...,...,...,...,...,...,...
2545070,Benign,NM_015338.6(ASXL1):c.4469T>C (p.Leu1490Ser),single nucleotide variant,germline,20,na,na
2545071,Benign,NM_004006.3(DMD):c.3202G>A (p.Asp1068Asn),single nucleotide variant,unknown,X,na,na
2545072,Benign,NM_004006.3(DMD):c.3202G>A (p.Asp1068Asn),single nucleotide variant,unknown,X,na,na
2545073,Benign,NM_001257291.2(SLC9A7):c.935C>A (p.Thr312Asn),single nucleotide variant,germline,X,na,na


In [7]:
df_kn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2545075 entries, 0 to 2545074
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Oncogenicity     object
 1   Name             object
 2   Type             object
 3   OriginSimple     object
 4   Chromosome       object
 5   ReferenceAllele  object
 6   AlternateAllele  object
dtypes: object(7)
memory usage: 135.9+ MB


In [9]:
print(df_saratani['Oncogenicity'].value_counts())

Oncogenic    663330
Name: Oncogenicity, dtype: int64


In [8]:
print(df_kn['ReferenceAllele'].value_counts())

na       2545059
CTG            2
AT             2
G              2
C              2
CAG            2
-              1
A              1
TTG            1
AAAAT          1
AAAA           1
T              1
Name: ReferenceAllele, dtype: int64


In [10]:

"""
Extract Oncogenic / Likely oncogenic variants from ClinVar's variant_summary.txt.gz
Outputs columns aligned with your training:
Oncogenicity, Name, Type, OriginSimple, Chromosome, ReferenceAllele, AlternateAllele
"""

import re
from datetime import datetime
from typing import Optional
import pandas as pd


URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"


OUT_TSV = f"clinvar_oncogenic_likely_{datetime.now():%Y%m%d}.tsv"


CHUNK = 500_000 


USECOLS = [
    "Name", "Type", "OriginSimple", "Chromosome",
    "ReferenceAllele", "AlternateAllele", "ClinicalSignificance",
]

def map_oncogenicity_pos(clinsig: str) -> Optional[str]:
    """
    Map ClinVar ClinicalSignificance to {Oncogenic, Likely oncogenic}
    Excludes benign/uncertain/conflicting/risk/drug/protective/association/affects/not provided.
    """
    if not isinstance(clinsig, str):
        return None

    text = clinsig.strip().lower()
  
    text = re.sub(r'[\/,;|]+', ' ', text)
    text = re.sub(r'[_\s]+', ' ', text).strip()

    
    bad_terms = (
        "uncertain significance", "conflicting", "not provided",
        "risk factor", "drug response", "protective",
        "association", "affects"
    )
    if any(b in text for b in bad_terms):
        return None
  
    if "benign" in text:
        return None

    has_likely = ("likely pathogenic" in text) or ("likely_pathogenic" in text)
   
    has_path = re.search(r'(^|[^a-z])pathogenic([^a-z]|$)', text) is not None

    if has_path and has_likely:
        return "Oncogenic"
    if has_path:
        return "Oncogenic"
    if has_likely:
        return "Likely oncogenic"
    return None

def process():
    total_in = total_out = 0
    first_write = True

    reader = pd.read_csv(
        URL,
        sep="\t",
        compression="gzip",
        usecols=USECOLS,
        chunksize=CHUNK,
        dtype=str,
        low_memory=False
    )

    for chunk in reader:
        total_in += len(chunk)

        chunk["ClinicalSignificance"] = chunk["ClinicalSignificance"].fillna("").str.strip()
        chunk["Oncogenicity"] = chunk["ClinicalSignificance"].apply(map_oncogenicity_pos)

        df = chunk[chunk["Oncogenicity"].notna()].copy()

        df_out = df[[
            "Oncogenicity", "Name", "Type", "OriginSimple",
            "Chromosome", "ReferenceAllele", "AlternateAllele"
        ]]

        if len(df_out):
            df_out.to_csv(
                OUT_TSV,
                sep="\t",
                index=False,
                mode="w" if first_write else "a",
                header=first_write
            )
            first_write = False
            total_out += len(df_out)

        print(f"Processed {total_in:,} rows -> kept {total_out:,} oncogenic/Likely oncogenic rows so far...")

    print(f"\nDONE. Wrote {total_out:,} rows to {OUT_TSV}")

if __name__ == "__main__":
    process()


Processed 500,000 rows -> kept 111,807 oncogenic/Likely oncogenic rows so far...
Processed 1,000,000 rows -> kept 211,369 oncogenic/Likely oncogenic rows so far...
Processed 1,500,000 rows -> kept 252,494 oncogenic/Likely oncogenic rows so far...
Processed 2,000,000 rows -> kept 316,718 oncogenic/Likely oncogenic rows so far...
Processed 2,500,000 rows -> kept 357,866 oncogenic/Likely oncogenic rows so far...
Processed 3,000,000 rows -> kept 388,277 oncogenic/Likely oncogenic rows so far...
Processed 3,500,000 rows -> kept 433,578 oncogenic/Likely oncogenic rows so far...
Processed 4,000,000 rows -> kept 461,955 oncogenic/Likely oncogenic rows so far...
Processed 4,500,000 rows -> kept 474,724 oncogenic/Likely oncogenic rows so far...
Processed 5,000,000 rows -> kept 530,529 oncogenic/Likely oncogenic rows so far...
Processed 5,500,000 rows -> kept 564,313 oncogenic/Likely oncogenic rows so far...
Processed 6,000,000 rows -> kept 589,112 oncogenic/Likely oncogenic rows so far...
Proces

In [4]:
import pandas as pd
df_saratani = pd.read_csv("clinvar_oncogenic_likely_20250810.tsv", sep="\t", dtype=str)
df_saratani

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele
0,Oncogenic,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,Indel,germline,7,na,na
1,Oncogenic,NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...,Indel,germline,7,na,na
2,Oncogenic,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),Deletion,germline,7,na,na
3,Oncogenic,NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs),Deletion,germline,7,na,na
4,Oncogenic,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),single nucleotide variant,germline,11,na,na
...,...,...,...,...,...,...,...
663325,Oncogenic,NM_002181.4(IHH):c.1018G>A (p.Val340Met),single nucleotide variant,germline,2,na,na
663326,Oncogenic,NM_004239.4(TRIP11):c.5272C>T (p.Gln1758Ter),single nucleotide variant,germline,14,na,na
663327,Oncogenic,NM_004239.4(TRIP11):c.5272C>T (p.Gln1758Ter),single nucleotide variant,germline,14,na,na
663328,Oncogenic,NM_000376.3(VDR):c.379G>T (p.Glu127Ter),single nucleotide variant,germline,12,na,na


In [5]:
df_saratani.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 663330 entries, 0 to 663329
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Oncogenicity     663330 non-null  object
 1   Name             663330 non-null  object
 2   Type             663330 non-null  object
 3   OriginSimple     663330 non-null  object
 4   Chromosome       663330 non-null  object
 5   ReferenceAllele  663330 non-null  object
 6   AlternateAllele  663330 non-null  object
dtypes: object(7)
memory usage: 35.4+ MB


In [6]:
print(df_saratani['Oncogenicity'].value_counts())

Oncogenic    663330
Name: Oncogenicity, dtype: int64


In [7]:
print(df_saratani['ReferenceAllele'].value_counts())


na                             663246
-                                  65
CGCGGGGCGGGG                        2
T                                   2
ATTCT                               2
GGCCTG                              2
CAG                                 2
C                                   2
GGC                                 2
GGAAAGCATCTCTGGCTCACCATGTAA         1
GAGTTACAATTTCGATG                   1
CC                                  1
G                                   1
A                                   1
Name: ReferenceAllele, dtype: int64


In [2]:
import pandas as pd
import numpy as np

BENIGN_TSV = "clinvar_benign_likebenign_20250809.tsv"
ONCO_TSV   = "clinvar_oncogenic_likely_20250810.tsv"

SAMPLE_SIZE = 600_000
RANDOM_SEED = 42

# --- load benign ---
ben = pd.read_csv(BENIGN_TSV, sep="\t", low_memory=False)
if len(ben) > SAMPLE_SIZE:
    ben = ben.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
ben["y"] = 0

# --- load oncogenic ---
onc = pd.read_csv(ONCO_TSV, sep="\t", low_memory=False)
if len(onc) > SAMPLE_SIZE:
    onc = onc.sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
onc["y"] = 1

# --- merge + shuffle ---
df = pd.concat([ben, onc], ignore_index=True)
df = df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

print(df.shape, df["y"].value_counts())

# --- save to file ---
df.to_csv("clinvar_balanced_600k_600k.tsv", sep="\t", index=False)
print("✅ Wrote balanced sample to clinvar_balanced_600k_600k.tsv")


(1200000, 8) 0    600000
1    600000
Name: y, dtype: int64
✅ Wrote balanced sample to clinvar_balanced_600k_600k.tsv


In [2]:
ben.info()

NameError: name 'ben' is not defined

In [11]:
onc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600000 entries, 11813 to 24438
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Oncogenicity     600000 non-null  object
 1   Name             600000 non-null  object
 2   Type             600000 non-null  object
 3   OriginSimple     600000 non-null  object
 4   Chromosome       600000 non-null  object
 5   ReferenceAllele  600000 non-null  object
 6   AlternateAllele  600000 non-null  object
 7   y                600000 non-null  int64 
dtypes: int64(1), object(7)
memory usage: 41.2+ MB


In [3]:
df.info()

NameError: name 'df' is not defined

In [4]:
df.head()

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele,y
0,Benign,NM_002645.4(PIK3C2A):c.1721T>C (p.Val574Ala),single nucleotide variant,germline,11,na,na,0
1,Benign,NM_004082.5(DCTN1):c.2754C>T (p.Pro918=),single nucleotide variant,germline,2,na,na,0
2,Benign,NM_017799.4(TMEM260):c.2036C>T (p.Pro679Leu),single nucleotide variant,germline,14,na,na,0
3,Oncogenic,NM_000085.5(CLCNKB):c.1846-1G>A,single nucleotide variant,germline,1,na,na,1
4,Benign,NM_000188.3(HK1):c.375+15C>T,single nucleotide variant,germline,10,na,na,0


In [5]:
print(df['y'].value_counts())

0    600000
1    600000
Name: y, dtype: int64


In [1]:
import pandas as pd
df = pd.read_csv("clinvar_balanced_600k_600k.tsv", 
                  
                 sep='\t', 
                 low_memory=False)

print("الشكل العام للبيانات:")
df.head()

الشكل العام للبيانات:


Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele,y
0,Benign,NM_002645.4(PIK3C2A):c.1721T>C (p.Val574Ala),single nucleotide variant,germline,11,na,na,0
1,Benign,NM_004082.5(DCTN1):c.2754C>T (p.Pro918=),single nucleotide variant,germline,2,na,na,0
2,Benign,NM_017799.4(TMEM260):c.2036C>T (p.Pro679Leu),single nucleotide variant,germline,14,na,na,0
3,Oncogenic,NM_000085.5(CLCNKB):c.1846-1G>A,single nucleotide variant,germline,1,na,na,1
4,Benign,NM_000188.3(HK1):c.375+15C>T,single nucleotide variant,germline,10,na,na,0


In [2]:
import re

def parse_hgvs(name):
    """
    Extract ref and alt alleles from HGVS-like strings.
    Works for SNVs, splice site notations, etc.
    """
    if not isinstance(name, str):
        return None, None
    
    # simple SNV pattern: c.XXXXREF>ALT
    m = re.search(r"c\.[0-9+-]*([ACGT])>([ACGT])", name)
    if m:
        return m.group(1), m.group(2)
    
    # fallback: if it's like c.1234delA / c.1234insG
    m = re.search(r"c\.[0-9+-]*del([ACGT]+)", name)
    if m:
        return m.group(1), "-"
    m = re.search(r"c\.[0-9+-]*ins([ACGT]+)", name)
    if m:
        return "-", m.group(1)
    
    return None, None

df[["Ref_from_Name", "Alt_from_Name"]] = df["Name"].apply(lambda x: pd.Series(parse_hgvs(x)))


In [3]:
df.head()

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele,y,Ref_from_Name,Alt_from_Name
0,Benign,NM_002645.4(PIK3C2A):c.1721T>C (p.Val574Ala),single nucleotide variant,germline,11,na,na,0,T,C
1,Benign,NM_004082.5(DCTN1):c.2754C>T (p.Pro918=),single nucleotide variant,germline,2,na,na,0,C,T
2,Benign,NM_017799.4(TMEM260):c.2036C>T (p.Pro679Leu),single nucleotide variant,germline,14,na,na,0,C,T
3,Oncogenic,NM_000085.5(CLCNKB):c.1846-1G>A,single nucleotide variant,germline,1,na,na,1,G,A
4,Benign,NM_000188.3(HK1):c.375+15C>T,single nucleotide variant,germline,10,na,na,0,C,T


In [6]:
print(df['Ref_from_Name'].value_counts())

C                   303968
G                   286318
T                   141360
A                   132075
-                        8
CT                       1
ACCGCTCGGCCCCCAG         1
TTCC                     1
Name: Ref_from_Name, dtype: int64


In [7]:
print(df['Alt_from_Name'].value_counts())

T     301843
A     249540
G     160349
C     151764
-        235
TG         1
Name: Alt_from_Name, dtype: int64


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864224 entries, 0 to 864223
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Oncogenicity     864224 non-null  object 
 1   Name             864224 non-null  object 
 2   Type             864224 non-null  object 
 3   OriginSimple     864224 non-null  object 
 4   Chromosome       864224 non-null  object 
 5   ReferenceAllele  863816 non-null  object 
 6   AlternateAllele  864164 non-null  object 
 7   y                864224 non-null  int64  
 8   ref_len          863816 non-null  float64
 9   alt_len          864164 non-null  float64
 10  is_del           864224 non-null  int8   
 11  is_ins           864224 non-null  int8   
 12  is_snv           864224 non-null  int8   
 13  is_mnv           864224 non-null  int8   
 14  is_transition    864224 non-null  int64  
 15  indel_len        864224 non-null  int32  
dtypes: float64(2), int32(1), int64(2), int

In [9]:
print(df['OriginSimple'].value_counts())

germline            1130303
unknown               63832
germline/somatic       2234
somatic                2099
not provided           1473
not applicable           59
Name: OriginSimple, dtype: int64


In [12]:
print(df['Oncogenicity'].value_counts())

Benign       551425
Oncogenic    312799
Name: Oncogenicity, dtype: int64


In [13]:
df.head()

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele,y,ref_len,alt_len,is_del,is_ins,is_snv,is_mnv,is_transition,indel_len
0,Benign,NM_002645.4(PIK3C2A):c.1721T>C (p.Val574Ala),single nucleotide variant,germline,11,T,C,0,1.0,1.0,0,0,1,0,1,0
1,Benign,NM_004082.5(DCTN1):c.2754C>T (p.Pro918=),single nucleotide variant,germline,2,C,T,0,1.0,1.0,0,0,1,0,1,0
2,Benign,NM_017799.4(TMEM260):c.2036C>T (p.Pro679Leu),single nucleotide variant,germline,14,C,T,0,1.0,1.0,0,0,1,0,1,0
3,Oncogenic,NM_000085.5(CLCNKB):c.1846-1G>A,single nucleotide variant,germline,1,G,A,1,1.0,1.0,0,0,1,0,1,0
4,Benign,NM_000188.3(HK1):c.375+15C>T,single nucleotide variant,germline,10,C,T,0,1.0,1.0,0,0,1,0,1,0


In [17]:
print(df['ref_len'].value_counts())

1.0     863798
3.0          6
2.0          3
5.0          2
12.0         2
16.0         1
27.0         1
4.0          1
6.0          1
17.0         1
Name: ref_len, dtype: int64


In [18]:
print(df['AlternateAllele'].value_counts())

T                       301849
A                       249545
G                       160356
C                       151767
-                          612
CA                           4
AATTAAGGTATA                 2
GG                           2
TG                           2
TCCCGGGTTCAAGCGATTCT         2
TTTCCGACAAAGGT               2
ATAAATCACTTAGAGATGT          2
AC                           2
GTGG                         2
TGACATCAGTCCGGGCAC           2
TC                           2
CTA                          2
CACAAAGTG                    2
TTGAA                        2
TGG                          1
AGTTACC                      1
ACC                          1
CTTA                         1
TCT                          1
Name: AlternateAllele, dtype: int64


In [21]:
print(df['ReferenceAllele'].value_counts())

C                              303972
G                              286320
T                              141362
A                              132075
-                                  69
CAG                                 3
ATTCT                               2
CGCGGGGCGGGG                        2
GGC                                 2
GGAAAGCATCTCTGGCTCACCATGTAA         1
CC                                  1
GGCCTG                              1
TTCC                                1
CTG                                 1
ACCGCTCGGCCCCCAG                    1
CT                                  1
AT                                  1
GAGTTACAATTTCGATG                   1
Name: ReferenceAllele, dtype: int64


In [16]:
print(df['ReferenceAllele'].value_counts())

C                              303972
G                              286320
T                              141362
A                              132075
-                                  69
CAG                                 3
ATTCT                               2
CGCGGGGCGGGG                        2
GGC                                 2
GGAAAGCATCTCTGGCTCACCATGTAA         1
CC                                  1
GGCCTG                              1
TTCC                                1
CTG                                 1
ACCGCTCGGCCCCCAG                    1
CT                                  1
AT                                  1
GAGTTACAATTTCGATG                   1
Name: ReferenceAllele, dtype: int64


In [22]:
import re, numpy as np, pandas as pd

# ---------------------------
# 1) Normalize parsed alleles
# ---------------------------
def norm_token(x):
    if x is None: return np.nan
    s = str(x).strip()
    if s.lower() in {"", "na", "nan", "none"}:
        return np.nan
    return s.upper()


df["ReferenceAllele"] = df["ReferenceAllele"].map(norm_token)
df["AlternateAllele"] = df["AlternateAllele"].map(norm_token)

# ------------------------------------------------
# 2) Robust HGVS parser (c. level) from Name col
#    handles SNV, del, ins, delins (basic cases)
# ------------------------------------------------
snv_re     = re.compile(r"c\.[0-9]+[+-]?[0-9]*([ACGT])>([ACGT])", re.I)
del_re     = re.compile(r"c\.[0-9]+(?:_[0-9]+)?del([ACGT]+)?", re.I)   # 'del' with optional ref bases
ins_re     = re.compile(r"c\.[0-9]+(?:_[0-9]+)?ins([ACGT]+)", re.I)
delins_re  = re.compile(r"c\.[0-9]+(?:_[0-9]+)?delins([ACGT]+)", re.I)

def parse_hgvs_alleles(name: str):
    if not isinstance(name, str): return (np.nan, np.nan, None)
    m = snv_re.search(name)
    if m:  # SNV
        return (m.group(1).upper(), m.group(2).upper(), "SNV")
    m = delins_re.search(name)
    if m:
        alt = m.group(1).upper()
        # 'delins' means some unknown ref replaced by ALT; mark as DELINS (ref unknown length)
        return ("-", alt, "DELINS")
    m = del_re.search(name)
    if m:
        ref = m.group(1).upper() if m.group(1) else "-"  # sometimes ClinVar omits explicit ref
        return (ref, "-", "DEL")
    m = ins_re.search(name)
    if m:
        alt = m.group(1).upper()
        return ("-", alt, "INS")
    return (np.nan, np.nan, None)

# Only fill where alleles are still missing
need_ref = df["ReferenceAllele"].isna()
need_alt = df["AlternateAllele"].isna()
parsed = df.loc[need_ref | need_alt, "Name"].apply(parse_hgvs_alleles)
if len(parsed):
    pf = pd.DataFrame(list(parsed), index=parsed.index, columns=["_ref","_alt","_kind"])
    fill_ref = need_ref & pf["_ref"].notna()
    fill_alt = need_alt & pf["_alt"].notna()
    df.loc[fill_ref, "ReferenceAllele"] = pf.loc[fill_ref, "_ref"]
    df.loc[fill_alt, "AlternateAllele"] = pf.loc[fill_alt, "_alt"]
    df["_hgvs_kind"] = pf["_kind"]
else:
    df["_hgvs_kind"] = None

# ------------------------------------------------------
# 3) Validate alleles are A/C/G/T strings or a single '-'
# ------------------------------------------------------
valid = set("ACGT")
def clean_allele(x):
    if pd.isna(x): return np.nan
    s = str(x).upper()
    if s == "-": return s
    if all(ch in valid for ch in s):
        return s
    return np.nan  # drop weird tokens

df["ReferenceAllele"] = df["ReferenceAllele"].map(clean_allele)
df["AlternateAllele"] = df["AlternateAllele"].map(clean_allele)

# ----------------------------------------
# 4) Consistency + derived variant features
# ----------------------------------------
df["ref_len"] = df["ReferenceAllele"].str.len()
df["alt_len"] = df["AlternateAllele"].str.len()

# invalid cases
df["invalid_same"] = (df["ReferenceAllele"].notna() & (df["ReferenceAllele"] == df["AlternateAllele"]))
df["invalid_both_missing"] = df["ReferenceAllele"].isna() & df["AlternateAllele"].isna()

# shape flags
df["is_del"] = ((df["AlternateAllele"] == "-") & df["ReferenceAllele"].notna()).astype("int8")
df["is_ins"] = ((df["ReferenceAllele"] == "-") & df["AlternateAllele"].notna()).astype("int8")
df["is_snv"] = ((df["ref_len"] == 1) & (df["alt_len"] == 1)).fillna(False).astype("int8")
df["is_mnv"] = ((df["ref_len"] > 1) & (df["alt_len"] > 1)).fillna(False).astype("int8")
df["is_delins"] = ((df["_hgvs_kind"] == "DELINS") & df["AlternateAllele"].notna()).astype("int8")

# transitions only for SNV
transitions = {("A","G"),("G","A"),("C","T"),("T","C")}
df["is_transition"] = [
    int(snv and ( (r,a) in transitions ))
    for snv, r, a in zip(df["is_snv"], df["ReferenceAllele"], df["AlternateAllele"])
]

# indel length (0 for SNV/MNV)
df["indel_len"] = np.where(
    df["is_del"].astype(bool) | df["is_ins"].astype(bool),
    (df["ref_len"].fillna(0) - df["alt_len"].fillna(0)).abs(),
    0
).astype("int32")

# -------------------------------------------
# 5) Optional: functional flags from Name (p.)
# -------------------------------------------
# missense: p.Val574Ala, synonymous: p.Pro918=, nonsense: p.Tyr123*, frameshift: p.Lys10fs
is_missense_re   = re.compile(r"\bp\.[A-Z][a-z]{2}[0-9]+(?!TER|STOP|X)[A-Z][a-z]{2}\b", re.I)
is_syn_re       = re.compile(r"\bp\.[A-Z][a-z]{2}[0-9]+=")
is_nonsense_re   = re.compile(r"\bp\.[A-Z][a-z]{2}[0-9]+(\*|TER|STOP|X)\b", re.I)
is_frameshift_re= re.compile(r"\bp\.[A-Z][a-z]{2}[0-9]+fs", re.I)

# splice-proximal core (±1 or ±2)
splice_core_re  = re.compile(r"c\.[0-9]+([+-][12])", re.I)

df["is_missense"]   = df["Name"].astype(str).str.contains(is_missense_re).astype("int8")
df["is_synonymous"] = df["Name"].astype(str).str.contains(is_syn_re).astype("int8")
df["is_nonsense"]   = df["Name"].astype(str).str.contains(is_nonsense_re).astype("int8")
df["is_frameshift"] = df["Name"].astype(str).str.contains(is_frameshift_re).astype("int8")
# fix the splice regex warning (and robust to NaNs)
df["is_splice_core"] = df["Name"].astype(str).str.contains(splice_core_re, regex=True, na=False).astype("int8")
# compact SNV category (A>C etc.) for SNVs only
df["snv_pair"] = np.where(
    (df["is_snv"]==1) & df["ReferenceAllele"].notna() & df["AlternateAllele"].notna(),
    df["ReferenceAllele"].astype(str) + ">" + df["AlternateAllele"].astype(str),
    "NA"
)

# --------------------------------
# 6) Filter clearly invalid rows
# --------------------------------
before = len(df)
df = df[~df["invalid_both_missing"] & ~df["invalid_same"]].reset_index(drop=True)
after = len(df)

# --------------------------------
# 7) Diagnostics
# --------------------------------
print("Dropped rows (missing/same alleles):", before - after)
print("\nAlternateAllele top values:")
print(df["AlternateAllele"].value_counts(dropna=False).head(20))
print("\nReferenceAllele top values:")
print(df["ReferenceAllele"].value_counts(dropna=False).head(20))

print("\nVariant shape counts:")
print(pd.DataFrame({
    "SNV": df["is_snv"].sum(),
    "MNV": df["is_mnv"].sum(),
    "DEL": df["is_del"].sum(),
    "INS": df["is_ins"].sum(),
    "DELINS": df["is_delins"].sum()
}, index=["count"]).T)

print("\nFunctional flags (from Name):")
print(pd.DataFrame({
    "missense": df["is_missense"].sum(),
    "synonymous": df["is_synonymous"].sum(),
    "nonsense": df["is_nonsense"].sum(),
    "frameshift": df["is_frameshift"].sum(),
    "splice_core(±1/±2)": df["is_splice_core"].sum(),
}, index=["count"]).T)


  df["is_splice_core"] = df["Name"].astype(str).str.contains(splice_core_re, regex=True, na=False).astype("int8")


Dropped rows (missing/same alleles): 0

AlternateAllele top values:
T                       301849
A                       249546
G                       160356
C                       151767
-                          612
NaN                         59
CA                           4
CACAAAGTG                    2
AC                           2
GG                           2
TG                           2
TCCCGGGTTCAAGCGATTCT         2
TTTCCGACAAAGGT               2
CTA                          2
ATAAATCACTTAGAGATGT          2
AATTAAGGTATA                 2
GTGG                         2
TTGAA                        2
TGACATCAGTCCGGGCAC           2
TC                           2
Name: AlternateAllele, dtype: int64

ReferenceAllele top values:
C                              303972
G                              286320
T                              141362
A                              132075
NaN                               408
-                                  69
CAG                

In [23]:
df.head(10)

Unnamed: 0,Oncogenicity,Name,Type,OriginSimple,Chromosome,ReferenceAllele,AlternateAllele,y,ref_len,alt_len,...,_hgvs_kind,invalid_same,invalid_both_missing,is_delins,is_missense,is_synonymous,is_nonsense,is_frameshift,is_splice_core,snv_pair
0,Benign,NM_002645.4(PIK3C2A):c.1721T>C (p.Val574Ala),single nucleotide variant,germline,11,T,C,0,1.0,1.0,...,,False,False,0,1,0,0,0,0,T>C
1,Benign,NM_004082.5(DCTN1):c.2754C>T (p.Pro918=),single nucleotide variant,germline,2,C,T,0,1.0,1.0,...,,False,False,0,0,1,0,0,0,C>T
2,Benign,NM_017799.4(TMEM260):c.2036C>T (p.Pro679Leu),single nucleotide variant,germline,14,C,T,0,1.0,1.0,...,,False,False,0,1,0,0,0,0,C>T
3,Oncogenic,NM_000085.5(CLCNKB):c.1846-1G>A,single nucleotide variant,germline,1,G,A,1,1.0,1.0,...,,False,False,0,0,0,0,0,1,G>A
4,Benign,NM_000188.3(HK1):c.375+15C>T,single nucleotide variant,germline,10,C,T,0,1.0,1.0,...,,False,False,0,0,0,0,0,1,C>T
5,Oncogenic,NM_001042492.3(NF1):c.4892T>G (p.Leu1631Ter),single nucleotide variant,germline,17,T,G,1,1.0,1.0,...,,False,False,0,1,0,0,0,0,T>G
6,Benign,NM_021250.4(LILRA5):c.55G>A (p.Val19Met),single nucleotide variant,germline,19,G,A,0,1.0,1.0,...,,False,False,0,1,0,0,0,0,G>A
7,Benign,NM_003985.6(TNK1):c.1299G>A (p.Ser433=),single nucleotide variant,germline,17,G,A,0,1.0,1.0,...,,False,False,0,0,1,0,0,0,G>A
8,Benign,NM_001277115.2(DNAH11):c.11352C>T (p.Phe3784=),single nucleotide variant,germline,7,C,T,0,1.0,1.0,...,,False,False,0,0,1,0,0,0,C>T
9,Oncogenic,NM_001267550.2(TTN):c.79162G>T (p.Gly26388Ter),single nucleotide variant,germline,2,G,T,1,1.0,1.0,...,,False,False,0,1,0,0,0,0,G>T


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864224 entries, 0 to 864223
Data columns (total 26 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Oncogenicity          864224 non-null  object 
 1   Name                  864224 non-null  object 
 2   Type                  864224 non-null  object 
 3   OriginSimple          864224 non-null  object 
 4   Chromosome            864224 non-null  object 
 5   ReferenceAllele       863816 non-null  object 
 6   AlternateAllele       864165 non-null  object 
 7   y                     864224 non-null  int64  
 8   ref_len               863816 non-null  float64
 9   alt_len               864165 non-null  float64
 10  is_del                864224 non-null  int8   
 11  is_ins                864224 non-null  int8   
 12  is_snv                864224 non-null  int8   
 13  is_mnv                864224 non-null  int8   
 14  is_transition         864224 non-null  int64  
 15  

In [25]:
import os, json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# features (no leakage)
cat_cols = ["Type","OriginSimple","Chromosome","snv_pair"]
num_cols = ["ref_len","alt_len","indel_len","is_del","is_ins","is_snv","is_mnv",
            "is_transition","is_missense","is_synonymous","is_nonsense","is_frameshift","is_splice_core"]

# tidy dtypes
for c in cat_cols:
    df[c] = df[c].astype("string")
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df[num_cols] = df[num_cols].fillna(0)

y = df["y"].astype(int).values

# vocab + z-stats
def build_vocab(series):
    uniq = ["UNK"] + sorted({str(x) for x in series.fillna("UNK")})
    return {v:i for i,v in enumerate(uniq)}

cat_maps = {c: build_vocab(df[c]) for c in cat_cols}
num_stats = {c: (float(df[c].mean()), float(df[c].std() or 1.0)) for c in num_cols}

# encoders
def encode_cats(frame):
    mats = []
    for c in cat_cols:
        vocab = cat_maps[c]
        mats.append(frame[c].fillna("UNK").astype(str).map(lambda v: vocab.get(v, 0)).astype("int64").values)
    return np.vstack(mats).T

def zscore_nums(frame):
    arrs = []
    for c in num_cols:
        m, s = num_stats[c]
        arrs.append(((frame[c].values - m) / (s if s!=0 else 1.0)).astype("float32"))
    return np.vstack(arrs).T

X_cat_all = encode_cats(df)
X_num_all = zscore_nums(df)
print("Shapes:", X_cat_all.shape, X_num_all.shape, y.shape)

# save artifacts for later deployment
os.makedirs("mutonco/artifacts", exist_ok=True)
with open("mutonco/artifacts/cat_maps.json","w",encoding="utf-8") as f: json.dump(cat_maps,f)
with open("mutonco/artifacts/num_stats.json","w",encoding="utf-8") as f: json.dump(num_stats,f)


Device: cuda
Shapes: (864224, 4) (864224, 13) (864224,)


In [76]:
# --- save to file ---
df.to_csv("clinvar_final2.tsv", sep="\t", index=False)
print("✅ Wrote balanced sample to clinvar_final2.tsv")

✅ Wrote balanced sample to clinvar_final2.tsv


In [47]:
df = df.drop(columns=["is_delins","is_frameshift","is_snv","is_mnv","ref_len","alt_len","indel_len","invalid_same","invalid_both_missing","Oncogenicity","is_del","is_ins"])


In [88]:
df = df.drop(columns=["is_del","is_ins"])


In [63]:
df.to_excel('C:/Users/tahas/clinvar_final2.xlsx', index=False)

In [93]:
print(df['is_missense'].value_counts())
print(df['is_nonsense'].value_counts())
print(df['is_splice_core'].value_counts())
print(df['snv_pair'].value_counts())
print(df['Type'].value_counts())
print(df['OriginSimple'].value_counts())
print(df['Chromosome'].value_counts())
print(df['ReferenceAllele'].value_counts())
print(df['AlternateAllele'].value_counts())
print(df['is_transition'].value_counts())

0    557785
1    279597
Name: is_missense, dtype: int64
0    837382
Name: is_nonsense, dtype: int64
0    650889
1    186493
Name: is_splice_core, dtype: int64
C>T    211437
G>A    177139
T>C     85970
A>G     84236
G>T     58263
C>G     44344
C>A     41067
G>C     40625
T>G     27561
T>A     23312
A>T     22309
A>C     21104
->T         4
->A         3
->C         3
->G         2
NA          2
C>-         1
Name: snv_pair, dtype: Int64
single nucleotide variant    837382
Name: Type, dtype: Int64
germline    837382
Name: OriginSimple, dtype: Int64
2     82640
1     72571
17    51384
11    48985
3     44758
X     43819
16    43414
19    41260
7     40225
5     39434
9     38186
12    37705
6     36233
15    31734
10    29219
8     27555
4     26623
14    25244
22    17356
13    16909
20    16800
18    14720
21    10272
Y       320
na       16
Name: Chromosome, dtype: Int64
C    296849
G    276027
T    136843
A    127649
-        14
Name: ReferenceAllele, dtype: int64
T       292013
A    

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 837382 entries, 0 to 837381
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Name             837382 non-null  object
 1   Type             837382 non-null  string
 2   OriginSimple     837382 non-null  string
 3   Chromosome       837382 non-null  string
 4   ReferenceAllele  837382 non-null  object
 5   AlternateAllele  837382 non-null  object
 6   y                837382 non-null  int64 
 7   is_transition    837382 non-null  int64 
 8   is_missense      837382 non-null  int8  
 9   is_nonsense      837382 non-null  int8  
 10  is_splice_core   837382 non-null  int8  
 11  snv_pair         837382 non-null  string
dtypes: int64(2), int8(3), object(3), string(4)
memory usage: 59.9+ MB


In [85]:
print(df['OriginSimple'].value_counts())
print(df['Type'].value_counts())

germline    837382
Name: OriginSimple, dtype: Int64
single nucleotide variant    837382
Name: Type, dtype: Int64


In [79]:
# احتفظ فقط بالسجلات التي OriginSimple فيها "germline" (مع تطبيع المسافات/الحروف)
mask = df["OriginSimple"].astype(str).str.strip().str.lower().eq("germline")
kept = int(mask.sum()); dropped = int(len(df) - kept)

df = df.loc[mask].reset_index(drop=True)

print(f"Kept {kept:,} germline rows; dropped {dropped:,} others.")
print(df["OriginSimple"].value_counts(dropna=False))



Kept 837,876 germline rows; dropped 26,348 others.
germline    837876
Name: OriginSimple, dtype: Int64


In [83]:
# احتفظ فقط بالسجلات التي OriginSimple فيها "germline" (مع تطبيع المسافات/الحروف)
mask = df["Type"].astype(str).str.strip().str.lower().eq("single nucleotide variant")
kept = int(mask.sum()); dropped = int(len(df) - kept)

df = df.loc[mask].reset_index(drop=True)

print(f"Kept {kept:,} germline rows; dropped {dropped:,} others.")
print(df["Type"].value_counts(dropna=False))


Kept 837,382 germline rows; dropped 494 others.
single nucleotide variant    837382
Name: Type, dtype: Int64
