In [1]:
import pandas as pd

# Load the file (handle gzip if it's compressed)
#df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/Cosmic_MutantCensus_v99_T2T.liftover.tsv", sep='\t', dtype=str, low_memory=False)
df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/Cosmic_MutantCensus_v99_hg38.tsv", sep='\t', dtype=str, low_memory=False)
# Show the first few rows
print(df.head())

# Show number of columns
#print("Number of columns:", len(df.rows))

# Show column names to see if any are merged
print("Columns:", df.columns.tolist())


  GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID  \
0       EP300     COSG106033    ENST00000263253.8      COSS2910551   
1       EP300     COSG106033    ENST00000263253.8      COSS1480254   
2       EP300     COSG106033    ENST00000263253.8      COSS1480254   
3       EP300     COSG106033    ENST00000263253.8      COSS1483802   
4       EP300     COSG106033    ENST00000263253.8      COSS1759437   

  SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID  \
0      B00301        COSO30135332       COSV105839731       COSM10048954   
1     PD6100a        COSO27985146        COSV54325277        COSM4385252   
2     PD6100a        COSO27985146        COSV54325289        COSM4385257   
3     PD6272a        COSO27985048        COSV54340902        COSM4385251   
4     2334188        COSO29915333        COSV54325300          COSM97816   

  MUTATION_ID MUTATION_CDS  ... GENOME_STOP STRAND PUBMED_PMID  \
0   100225268    c.5350C>T  ...    41177061      +    33

In [3]:
import csv

# Open your TSV and VCF files
with open("/lustre/scratch126/casm/team274sb/lr26/T2T/Cosmic_MutantCensus_v99_hg38.tsv", 'r') as tsv_file, open('cosmic_hg38.vcf', 'w') as vcf_file:
    tsv_reader = csv.DictReader(tsv_file, delimiter='\t')    
    
    # Extract unique chromosomes (contigs)
    contigs = set()
    initial_missing_ref_alt = 0
    filtered_missing_ref_alt = 0
    missing_chromosome = 0
    
    # Write the VCF header
    vcf_file.write('##fileformat=VCFv4.2\n')
    vcf_file.write('##source=Cosmic\n')
    
    # Add contigs (chromosomes) to the header
    for row in tsv_reader:
        contigs.add(row['CHROMOSOME'])
        
        # Count initial missing REF/ALT alleles (before filtering)
        if not row['GENOMIC_WT_ALLELE'] or not row['GENOMIC_MUT_ALLELE']:
            initial_missing_ref_alt += 1
        
        # Check for missing CHROMOSOME
        if not row['CHROMOSOME']:
            missing_chromosome += 1
    
    # Rewind to the beginning of the file for the second pass
    tsv_file.seek(0)
    tsv_reader = csv.DictReader(tsv_file, delimiter='\t')
    
    # Write the contig and INFO annotations
    for contig in contigs:
        vcf_file.write(f'##contig=<ID={contig}>\n')
    
    # Create INFO annotations for all columns in the TSV
    for field in tsv_reader.fieldnames:
        vcf_file.write(f'##INFO=<ID={field},Number=1,Type=String,Description="{field}">\n')
    
    vcf_file.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
    
    # Write the VCF data rows
    for row in tsv_reader:
        # Skip rows with missing CHROMOSOME
        if not row['CHROMOSOME']:
            continue  # Skip the row
        
        ref_allele = row['GENOMIC_WT_ALLELE']
        alt_allele = row['GENOMIC_MUT_ALLELE']

        # Replace missing REF and ALT alleles with 'N'
        # Skip if both REF and ALT are missing
        if not ref_allele and not alt_allele:
            filtered_missing_ref_alt += 1
            continue

        # Handle missing REF or ALT individually
        if not ref_allele:
            filtered_missing_ref_alt += 1
            continue  # or optionally set to 'N'
        if not alt_allele:
            filtered_missing_ref_alt += 1
            continue  # or optionally set to 'N'

        # Optionally skip synonymous mutations where REF == ALT
        if ref_allele == alt_allele:
            continue
        
        chrom = row['CHROMOSOME']
        pos = row['GENOME_START']  # Assuming single base mutation; if not, use GENOME_STOP
        variant_id = row['MUTATION_ID']
        
        # Create the INFO field dynamically by including all columns from TSV
        info_fields = []
        for column in tsv_reader.fieldnames:
            # Add each column as a key-value pair in the INFO field
            info_fields.append(f"{column}={row[column]}")
        
        # Combine all fields into a single INFO string, separating by semicolons
        info = ';'.join(info_fields)
        
        # Write the VCF data line
        vcf_file.write(f'{chrom}\t{pos}\t{variant_id}\t{ref_allele}\t{alt_allele}\t.\t.\t{info}\n')
    
    # Print the counts of missing data
    print(f'Initial missing REF/ALT alleles (before filtering out CHROM): {initial_missing_ref_alt}')
    print(f'Filtered missing REF/ALT alleles (after filtering out CHROM): {filtered_missing_ref_alt}')
    print(f'Missing CHROMOSOME rows filtered out: {missing_chromosome}')



Initial missing REF/ALT alleles (before filtering out CHROM): 297613
Filtered missing REF/ALT alleles (after filtering out CHROM): 160355
Missing CHROMOSOME rows filtered out: 137258


In [4]:
import pandas as pd

# Load the VCF file into a DataFrame, skipping the metadata lines (those starting with '##')
column_names = [
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT' # Adjust with your column names
]

df = pd.read_csv('cosmic_hg38.vcf', sep='\t', comment='#', header=None, names=column_names)

df['CHROM'] = df['CHROM'].astype(str)
# Standardize chromosome names for T2T liftover (this is assuming you're working with T2T's naming conventions)
def standardize_chrom(chrom):
    if chrom == 'X':
        return 'chrX'
    elif chrom == 'Y':
        return 'chrY'
    elif chrom == 'MT':
        return 'chrMT'
    elif chrom.isdigit():  # For numeric chromosomes, add 'chr' prefix
        return f'chr{chrom}'
    else:
        return chrom  # Leave other non-standard names as is

# Apply the transformation to the 'CHROM' column
df['CHROM'] = df['CHROM'].apply(standardize_chrom)

# Verify the unique chromosome names after the transformation
unique_chromosomes = df['CHROM'].unique()

# Print the unique chromosome names after the conversion
print("Unique Chromosome Names After Standardization:")
for chrom in sorted(unique_chromosomes):
    print(chrom)


  df = pd.read_csv('cosmic_hg38.vcf', sep='\t', comment='#', header=None, names=column_names)


Unique Chromosome Names After Standardization:
chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY


In [5]:
# Save the updated DataFrame to a new VCF file
df.to_csv('cosmic_chr_hg38.vcf', sep='\t', index=False, header=True)


In [29]:
import pandas as pd

# Define the column names based on the VCF format
column_names = [
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']

# Load the VCF file into a DataFrame, skipping metadata lines (those starting with '##')
df = pd.read_csv('cosmic_chr_hg38.vcf', sep='\t', comment='#', header=None, names=column_names)

# Check for non-integer values in the 'POS' column
df['POS'] = pd.to_numeric(df['POS'], errors='coerce')  # This will turn non-numeric values into NaN

# Filter out rows where 'POS' is NaN (i.e., invalid positions)
df = df.dropna(subset=['POS'])

# Convert 'POS' to integers after filtering
df['POS'] = df['POS'].astype(int)

# Optionally, save the cleaned VCF to a new file
df.to_csv('cleaned_cosmic_chr_hg38.vcf', sep='\t', index=False, header=True)

# Verify the cleaned DataFrame
print("Cleaned DataFrame:")
print(df.head())


  df = pd.read_csv('cosmic_chr_hg38.vcf', sep='\t', comment='#', header=None, names=column_names)


Cleaned DataFrame:
   CHROM       POS         ID REF ALT QUAL FILTER  \
1  chr22  41177061  100225268   C   T    .      .   
2  chr22  41158476  100218818   C   T    .      .   
3  chr22  41168840  100225274   G   A    .      .   
4  chr22  41155108  100225277   A   G    .      .   
5  chr22  41168849  100225279   G   T    .      .   

                                                INFO FORMAT  
1  GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...    NaN  
2  GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...    NaN  
3  GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...    NaN  
4  GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...    NaN  
5  GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...    NaN  


In [30]:
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
1,chr22,41177061,100225268,C,T,.,.,GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...,
2,chr22,41158476,100218818,C,T,.,.,GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...,
3,chr22,41168840,100225274,G,A,.,.,GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...,
4,chr22,41155108,100225277,A,G,.,.,GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...,
5,chr22,41168849,100225279,G,T,.,.,GENE_SYMBOL=EP300;COSMIC_GENE_ID=COSG106033;TR...,
...,...,...,...,...,...,...,...,...,...
1769470,chr7,140753336,183002032,A,T,.,.,GENE_SYMBOL=BRAF;COSMIC_GENE_ID=COSG99109;TRAN...,
1769471,chr7,140753336,183002032,A,T,.,.,GENE_SYMBOL=BRAF;COSMIC_GENE_ID=COSG99109;TRAN...,
1769472,chr7,140808039,183023301,T,C,.,.,GENE_SYMBOL=BRAF;COSMIC_GENE_ID=COSG99109;TRAN...,
1769473,chr7,140753336,183002032,A,T,.,.,GENE_SYMBOL=BRAF;COSMIC_GENE_ID=COSG99109;TRAN...,


In [21]:
import pandas as pd

# Load the file (handle gzip if it's compressed)
df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/Cosmic_MutantCensus_v99_T2T.liftover.tsv", sep='\t', dtype=str, low_memory=False)

# Show the first few rows
print(df.head())

# Show number of columns
print("Number of columns:", len(df.columns))

# Show column names to see if any are merged
print("Columns:", df.columns.tolist())

# Load GFF3 file and extract gene-level coordinates
def load_gff3_coords(gff3_path):
    # Read GFF3 file and parse relevant columns
    df = pd.read_csv(gff3_path, sep='\t', comment='#', header=None)
    
    # Assign column names based on GFF3 format
    df.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    
    # Filter to include only genes (feature = 'gene')
    df_genes = df[df['feature'] == 'gene']
    
    # Extract gene name from the attributes field (it's typically stored as 'gene=GENE_NAME')
    # Try multiple common attribute formats for gene names
    # Extract gene name more robustly, accounting for possible variations in the attribute field
    df_genes['gene_name'] = df_genes['attribute'].str.extract(r'(?:gene=|gene_name=|Name=|exon=)([^;]+)', expand=False).str.strip()

    
    # Return relevant columns (gene_name, seqname, start, end)
    return df_genes[['gene_name', 'seqname', 'start', 'end']]

# Load annotation from GFF3
gff3_gene_coords = load_gff3_coords("sorted_liftoff.gff3")

# Optionally convert start/end to integers
gff3_gene_coords["start"] = gff3_gene_coords["start"].astype(int)
gff3_gene_coords["end"] = gff3_gene_coords["end"].astype(int)

# Check if any chromosome names are missing in the mutation file
print("Rows with missing chromosome information:", df[df["CHROMOSOME"].isna()])

# Fix chromosome names by removing 'chr' prefix from both the DataFrame and the GFF3 annotations
df["CHROMOSOME"] = df["CHROMOSOME"].str.replace(r'^chr', '', regex=True)
gff3_gene_coords["seqname"] = gff3_gene_coords["seqname"].str.replace(r'^chr', '', regex=True)

# Merge your original DataFrame with the GFF3 annotations based on gene names
df_annotated = df.merge(gff3_gene_coords, left_on="GENE_SYMBOL", right_on="gene_name", how="left")

# Fill missing fields (like chromosome and genomic start/stop) with the merged data
df["CHROMOSOME"] = df["CHROMOSOME"].fillna(df_annotated["seqname"])
df["GENOME_START"] = df["GENOME_START"].fillna(df_annotated["start"])
df["GENOME_STOP"] = df["GENOME_STOP"].fillna(df_annotated["end"])

# Check how many rows still have missing chromosome information after merging
missing_chromosome = df[df['CHROMOSOME'].isna()]
print(f"Rows without matches (missing chromosome): {len(missing_chromosome)}")
print(missing_chromosome[['GENE_SYMBOL', 'CHROMOSOME', 'GENOME_START', 'GENOME_STOP']])

# Optionally drop helper columns (like 'gene_name') that were only used for merging
df_final = df.drop(columns=["gene_name"], errors="ignore")

# Optionally drop rows with missing critical info, if applicable
# Be careful with dropping rows, especially if you need to keep all rows with mutation data
df_final_cleaned = df_final.dropna(subset=["CHROMOSOME", "GENOME_START", "GENOME_STOP"])

# Preview the result
print(df_final_cleaned[["GENE_SYMBOL", "CHROMOSOME", "GENOME_START", "GENOME_STOP"]].head())




  GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID  \
0       EP300     COSG106033    ENST00000263253.8      COSS2910551   
1       EP300     COSG106033    ENST00000263253.8      COSS1480254   
2       EP300     COSG106033    ENST00000263253.8      COSS1480254   
3       EP300     COSG106033    ENST00000263253.8      COSS1483802   
4       EP300     COSG106033    ENST00000263253.8      COSS1759437   

  SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOMIC_MUTATION_ID LEGACY_MUTATION_ID  \
0      B00301        COSO30135332       COSV105839731       COSM10048954   
1     PD6100a        COSO27985146        COSV54325277        COSM4385252   
2     PD6100a        COSO27985146        COSV54325289        COSM4385257   
3     PD6272a        COSO27985048        COSV54340902        COSM4385251   
4     2334188        COSO29915333        COSV54325300          COSM97816   

  MUTATION_ID MUTATION_CDS  ... GENOME_STOP STRAND PUBMED_PMID  \
0   100225268    c.5350C>T  ...    41177061      +    33

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genes['gene_name'] = df_genes['attribute'].str.extract(r'(?:gene=|gene_name=|Name=|exon=)([^;]+)', expand=False).str.strip()


Rows with missing chromosome information:         GENE_SYMBOL COSMIC_GENE_ID TRANSCRIPT_ACCESSION COSMIC_SAMPLE_ID  \
63968          MTOR      COSG77957    ENST00000361445.8      COSS2759210   
63969         MED12     COSG113013    ENST00000374080.7      COSS2780970   
63970          MTOR      COSG77957    ENST00000361445.8      COSS2860468   
63971         MED12     COSG113013    ENST00000374080.7      COSS2491708   
63972       SMARCD1      COSG71965    ENST00000394963.8      COSS2690120   
...             ...            ...                  ...              ...   
1892107        NPM1     COSG103724    ENST00000296930.9      COSS1669939   
1892108        BRAF      COSG99109    ENST00000646891.1      COSS2561829   
1892109        BRAF      COSG99109    ENST00000646891.1      COSS1045755   
1892110        BRAF      COSG99109    ENST00000646891.1      COSS2587770   
1892111        BRAF      COSG99109    ENST00000646891.1      COSS1687483   

        SAMPLE_NAME COSMIC_PHENOTYPE_ID GENOM

In [23]:
# If necessary, save the cleaned data to a new file
df_final_cleaned.to_csv("Cosmic_MutantCensus_v99_T2T.liftover.cleaned.tsv", sep='\t', index=False)

In [25]:
missing_genes = df_final_cleaned[df_final_cleaned['CHROMOSOME'].isna()]['GENE_SYMBOL'].unique()
print(f"Missing genes: {missing_genes[:10]}")  # Preview a few missing genes

Missing genes: []


In [None]:
print(df[["GENE_SYMBOL", "CHROMOSOME", "GENOME_START", "GENOME_STOP"]])

In [None]:
# Check how many rows have missing chromosome info after the merge
unmatched_rows = df_final[df_final['CHROMOSOME'].isna()]
print(f"Rows without matches (missing chromosome): {len(unmatched_rows)}")
# Preview some of the unmatched rows
print(unmatched_rows[['GENE_SYMBOL', 'CHROMOSOME', 'GENOME_START', 'GENOME_STOP']])

# Check which gene symbols are missing from the GFF3 annotation
missing_genes = df_annotated[df_annotated['seqname'].isna()]['GENE_SYMBOL'].unique()
print(f"Missing genes: {missing_genes[:10]}")  # Preview a few missing genes


In [None]:
print(df_final["CHROMOSOME"].unique())

In [None]:
df_final.to_csv("Cosmic_MutantCensus_v99_T2T.liftover.annotated.tsv", sep="\t", index=False)

In [27]:
import pandas as pd
import gzip

# Step 1: Define reference and file paths
reference = "T2T-CHM13v2.0.fa"
tsv_input = "Cosmic_MutantCensus_v99_T2T.liftover.cleaned.tsv"
vcf_output = "Cosmic_MutantCensus_v99_T2T.liftover.cleaned.vcf"

# Step 2: Read metadata headers from text files
with open("info_definitions.txt") as f:
    info_defs = f.read().splitlines()

with open("contig_definitions.txt") as f:
    contig_defs = f.read().splitlines()

# Step 3: Build VCF header
vcf_lines = [
    "##fileformat=VCFv4.2",
    f"##reference={reference}"
] + contig_defs + info_defs + [
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
]

# Step 4: Load corrected data
df = pd.read_csv(tsv_input, sep="\t", dtype=str).fillna(".")
# Keep only rows with valid CHROMOSOME and GENOME_START
df = df[(df["CHROMOSOME"] != ".") & (df["GENOME_START"] != ".")]

# Step 5: Define the exact column names for INFO
info_fields = [
    'GENE_SYMBOL', 'COSMIC_GENE_ID', 'TRANSCRIPT_ACCESSION', 'COSMIC_SAMPLE_ID',
    'SAMPLE_NAME', 'COSMIC_PHENOTYPE_ID', 'GENOMIC_MUTATION_ID', 'LEGACY_MUTATION_ID',
    'MUTATION_ID', 'MUTATION_CDS', 'MUTATION_AA', 'MUTATION_DESCRIPTION',
    'MUTATION_ZYGOSITY', 'LOH', 'CHROMOSOME', 'GENOME_START', 'GENOME_STOP',
    'STRAND', 'PUBMED_PMID', 'COSMIC_STUDY_ID', 'HGVSP', 'HGVSC', 'HGVSG',
    'GENOMIC_WT_ALLELE', 'GENOMIC_MUT_ALLELE', 'MUTATION_SOMATIC_STATUS'
]

# Step 6: Generate VCF body lines
for _, row in df.iterrows():
    chrom = row["CHROMOSOME"]
    pos = int(float(row["GENOME_START"]))  # VCF position must be int
    vid = row["LEGACY_MUTATION_ID"]
    ref = row["GENOMIC_WT_ALLELE"]
    alt = row["GENOMIC_MUT_ALLELE"]

    # Fix the SAMPLE_NAME field by replacing semicolons with commas (only in the SAMPLE_NAME field)
    sample_name = row["SAMPLE_NAME"]
    if pd.notna(sample_name) and ";" in sample_name:
        sample_name = sample_name.replace(";", ",")  # Replace semicolons with commas in SAMPLE_NAME only

    # Only include non-null INFO fields
    info = [f"{col}={row[col]}" for col in info_fields if pd.notna(row[col]) and row[col] != "."]
    
    # Check if SAMPLE_NAME is already in info and replace it, otherwise append it
    info = [f"SAMPLE_NAME={sample_name}" if 'SAMPLE_NAME' in i.split('=')[0] else i for i in info]

    info_str = ";".join(info)

    vcf_line = f"{chrom}\t{pos}\t{vid}\t{ref}\t{alt}\t.\t.\t{info_str}"
    vcf_lines.append(vcf_line)

# Step 7: Write to plain VCF
with open(vcf_output, "w") as f:
    f.write("\n".join(vcf_lines) + "\n")

# Step 8: Compress VCF with gzip (optional)
with open(vcf_output, "rb") as f_in, gzip.open(vcf_output + ".gz", "wb") as f_out:
    f_out.writelines(f_in)



In [29]:
# Open the VCF file and fix the header line
with open("Cosmic_MutantCensus_v99_T2T.liftover.cleaned.vcf", "r") as file:
    lines = file.readlines()

# Open the file to write the corrected content
with open("Cosmic_MutantCensus_v99_T2T.liftover.cleaned.fixed.vcf", "w") as file:
    for line in lines:
        if line.startswith("#CHROM"):  # If it's the header line
            # Replace spaces with tabs (to be safe, use split and join to reassemble with tabs)
            fixed_line = "\t".join(line.split())  # This replaces all spaces with tabs
            file.write(fixed_line + "\n")
        else:
            file.write(line)

print("Header corrected and saved to new VCF file.")


Header corrected and saved to new VCF file.


In [3]:
import pandas as pd

In [2]:
df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_tumor_somatic_new_filtered.vcf",
                 sep="\t",
                 comment="#",
                 dtype="str",
                 header=None,
                 names=["chr","pos","id","ref","alt","qual","filter","info","format","sample"])
df

NameError: name 'pd' is not defined

In [41]:
print("INFO field of first variant:")
print(df.loc[0, "info"])
df_filtered = df.copy()

INFO field of first variant:
GENE_SYMBOL=SKI;COSMIC_GENE_ID=COSG101216;TRANSCRIPT_ACCESSION=ENST00000378536.4;COSMIC_SAMPLE_ID=COSS2634847;COSMIC_PHENOTYPE_ID=COSO28245255;GENOMIC_MUTATION_ID=COSV66015002;LEGACY_MUTATION_ID=COSN24852744;MUTATION_ID=118426552;MUTATION_CDS=c.970-13470_970-13469del;MUTATION_AA=p.?;MUTATION_DESCRIPTION=intron_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=2289508;GENOME_STOP=2289509;STRAND=+;PUBMED_PMID;COSMIC_STUDY_ID=COSU379;HGVSP;HGVSC=ENST00000378536.4:c.970-13470_970-13469del;HGVSG=1:g.2289508_2289509del;GENOMIC_WT_ALLELE=GG;GENOMIC_MUT_ALLELE=N


In [42]:
def parse_sample_format(format_str, sample_str):
    """
    Parses FORMAT and SAMPLE fields of a VCF into a dictionary
    Handles numeric parsing and multi-allelic values robustly
    """
    keys = format_str.split(":")
    values = sample_str.split(":")
    sample_dict = dict(zip(keys, values))

    # Parse GT
    sample_dict["GT"] = sample_dict.get("GT", "")

    # Parse DP
    try:
        sample_dict["DP"] = int(sample_dict.get("DP", 0))
    except ValueError:
        sample_dict["DP"] = 0

    # Parse AD
    ad_val = sample_dict.get("AD", "0")
    if "," in ad_val:
        try:
            ad_list = [int(a) for a in ad_val.split(",") if a.isdigit()]
            sample_dict["AD"] = sum(ad_list[1:])  # Only ALT allele depth(s)
        except:
            sample_dict["AD"] = 0
    else:
        sample_dict["AD"] = int(ad_val) if ad_val.isdigit() else 0

    # Parse VAF (handle multiple values by taking max)
    vaf_val = sample_dict.get("VAF", "0.0")
    try:
        if "," in vaf_val:
            vaf_list = [float(v) for v in vaf_val.split(",") if v]
            sample_dict["VAF"] = max(vaf_list) if vaf_list else 0.0
        else:
            sample_dict["VAF"] = float(vaf_val)
    except:
        sample_dict["VAF"] = 0.0

    return sample_dict


def get_genotype_from_GT(gt_str):
    """
    Determines genotype type based on GT field
    """
    alleles = gt_str.replace('|', '/').split('/')
    if len(alleles) == 2:
        if alleles[0] == alleles[1]:
            return "homozygous"
        else:
            return "heterozygous"
    return "unknown"

def extract_gt(sample_str, format_str):
    keys = format_str.split(":")
    values = sample_str.split(":")
    format_dict = dict(zip(keys, values))
    gt = format_dict.get("GT", "")
    return get_genotype_from_GT(gt)

def filter_variant_by_sample(sample_dict, min_dp=10, min_ad=3, min_vaf=0.1):
    """
    Apply basic quality filters on depth, allele depth, and VAF
    """
    dp = sample_dict.get("DP", 0)
    ad = sample_dict.get("AD", 0)
    vaf = sample_dict.get("VAF", 0.0)
    
    return (dp >= min_dp) and (ad >= min_ad) and (vaf >= min_vaf)

def get_cosmic_annotation(info_str):
    """
    Parses a COSMIC-style INFO field and returns a dictionary of annotations.
    This assumes the INFO string is formatted as key=value;key=value;...
    """
    info_dict = {}
    for entry in info_str.split(";"):
        if "=" in entry:
            key, value = entry.split("=", 1)
            info_dict[key] = value
    return info_dict


def extract_gene(annotation):
    return annotation.get("GENE_SYMBOL")

def extract_protein_position(annotation):
    return annotation.get("MUTATION_AA")

def extract_mutation_description(annotation):
    return annotation.get("MUTATION_DESCRIPTION") or annotation.get("rsid")


In [43]:
# Extract sample info
df_filtered["sample_info"] = df_filtered.apply(lambda row: parse_sample_format(row["format"], row["sample"]), axis=1)

# Add specific fields as separate columns
df_filtered["DP"]  = df_filtered["sample_info"].apply(lambda x: x["DP"])
df_filtered["AD"]  = df_filtered["sample_info"].apply(lambda x: x["AD"])
df_filtered["VAF"] = df_filtered["sample_info"].apply(lambda x: x["VAF"])
df_filtered["GT"]  = df_filtered["sample_info"].apply(lambda x: x["GT"])
df_filtered["genotype"] = df_filtered["GT"].apply(get_genotype_from_GT)

# Apply filtering
df_filtered["pass_sample_filters"] = df_filtered["sample_info"].apply(filter_variant_by_sample)

# Extract VEP annotations
df_filtered["cosmic_ann"] = df_filtered["info"].apply(get_cosmic_annotation)

df_filtered["gene"] = df_filtered["cosmic_ann"].apply(extract_gene)
df_filtered["protein_position"] = df_filtered["cosmic_ann"].apply(extract_protein_position)
df_filtered["mutation_description"] = df_filtered["cosmic_ann"].apply(extract_mutation_description)


In [44]:
df_filtered

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,1725425,.,TG,T,0,refCall,GENE_SYMBOL=SKI;COSMIC_GENE_ID=COSG101216;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:29:25:4:0.16:DV,...,25,4,0.160000,0/0,homozygous,True,"{'GENE_SYMBOL': 'SKI', 'COSMIC_GENE_ID': 'COSG...",SKI,p.?,intron_variant
1,chr1,2805598,.,G,A,0.7,refCall,GENE_SYMBOL=PRDM16;COSMIC_GENE_ID=COSG80881;TR...,GT:GQ:DP:AD:VAF:C,./.:8:5:2:0.4:DV,...,5,2,0.400000,./.,homozygous,False,"{'GENE_SYMBOL': 'PRDM16', 'COSMIC_GENE_ID': 'C...",PRDM16,p.?,intron_variant
2,chr1,2881539,.,TC,T,0,refCall,GENE_SYMBOL=PRDM16;COSMIC_GENE_ID=COSG80881;TR...,GT:GQ:DP:AD:VAF:C,0/0:55:24:3:0.125:DV,...,24,3,0.125000,0/0,homozygous,True,"{'GENE_SYMBOL': 'PRDM16', 'COSMIC_GENE_ID': 'C...",PRDM16,p.?,intron_variant
3,chr1,7079705,.,A,G,0.1,refCall,GENE_SYMBOL=CAMTA1;COSMIC_GENE_ID=COSG61199;TR...,GT:GQ:DP:AD:VAF:C,./.:17:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,"{'GENE_SYMBOL': 'CAMTA1', 'COSMIC_GENE_ID': 'C...",CAMTA1,p.?,intron_variant
4,chr1,7125264,.,G,A,0,refCall,GENE_SYMBOL=CAMTA1;COSMIC_GENE_ID=COSG61199;TR...,GT:GQ:DP:AD:VAF:C,0/0:29:22:3:0.136364:DV,...,22,3,0.136364,0/0,homozygous,True,"{'GENE_SYMBOL': 'CAMTA1', 'COSMIC_GENE_ID': 'C...",CAMTA1,p.?,intron_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,chr9,96931334,.,CT,C,0,refCall,GENE_SYMBOL=NTRK2;COSMIC_GENE_ID=COSG88009;TRA...,GT:GQ:DP:AD:VAF:C,0/0:50:26:3:0.115385:DV,...,26,3,0.115385,0/0,homozygous,True,"{'GENE_SYMBOL': 'NTRK2', 'COSMIC_GENE_ID': 'CO...",NTRK2,p.?,intron_variant
277,chr9,142100533,.,GT,G,0,refCall,GENE_SYMBOL=FNBP1;COSMIC_GENE_ID=COSG106586;TR...,GT:GQ:DP:AD:VAF:C,0/0:53:31:4:0.129032:DV,...,31,4,0.129032,0/0,homozygous,True,"{'GENE_SYMBOL': 'FNBP1', 'COSMIC_GENE_ID': 'CO...",FNBP1,p.?,intron_variant
278,chr9,143086761,.,C,A,0,refCall,GENE_SYMBOL=ABL1;COSMIC_GENE_ID=COSG106650;TRA...,GT:GQ:DP:AD:VAF:C,0/0:46:19:3:0.157895:DV,...,19,3,0.157895,0/0,homozygous,True,"{'GENE_SYMBOL': 'ABL1', 'COSMIC_GENE_ID': 'COS...",ABL1,p.?,intron_variant
279,chrX,66015092,.,CA,C,0,refCall,GENE_SYMBOL=AR;COSMIC_GENE_ID=COSG86691;TRANSC...,GT:GQ:DP:AD:VAF:C,0/0:21:29:7:0.241379:DV,...,29,7,0.241379,0/0,homozygous,True,"{'GENE_SYMBOL': 'AR', 'COSMIC_GENE_ID': 'COSG8...",AR,p.?,intron_variant


In [45]:
df_intronless = df_filtered[df_filtered["mutation_description"] != "intron_variant"]
df_intronless

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
8,chr1,147188363,.,A,AG,0,refCall,GENE_SYMBOL=BCL9;COSMIC_GENE_ID=COSG114870;TRA...,GT:GQ:DP:AD:VAF:C,0/0:60:34:4:0.117647:DV,...,34,4,0.117647,0/0,homozygous,True,"{'GENE_SYMBOL': 'BCL9', 'COSMIC_GENE_ID': 'COS...",BCL9,p.P517Lfs*5,
35,chr12,27548271,.,CT,C,0,refCall,GENE_SYMBOL=PPFIBP1;COSMIC_GENE_ID=COSG71462;T...,GT:GQ:DP:AD:VAF:C,0/0:60:27:3:0.111111:DV,...,27,3,0.111111,0/0,homozygous,True,"{'GENE_SYMBOL': 'PPFIBP1', 'COSMIC_GENE_ID': '...",PPFIBP1,p.F530*,
44,chr13,47676495,.,GA,G,0,refCall,GENE_SYMBOL=RB1;COSMIC_GENE_ID=COSG77480;TRANS...,GT:GQ:DP:AD:VAF:C,0/0:59:22:3:0.136364:DV,...,22,3,0.136364,0/0,homozygous,True,"{'GENE_SYMBOL': 'RB1', 'COSMIC_GENE_ID': 'COSG...",RB1,p.K616Rfs*36,
55,chr16,9797423,.,G,A,22,PASS,GENE_SYMBOL=GRIN2A;COSMIC_GENE_ID=COSG108203;T...,GT:GQ:DP:AD:VAF:C,0/1:22:37:17:0.459:P,...,37,17,0.459000,0/1,heterozygous,True,"{'GENE_SYMBOL': 'GRIN2A', 'COSMIC_GENE_ID': 'C...",GRIN2A,p.T1212M,missense_variant
67,chr17,32320390,.,AT,A,0,refCall,GENE_SYMBOL=NF1;COSMIC_GENE_ID=COSG113820;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:47:31:4:0.129032:DV,...,31,4,0.129032,0/0,homozygous,True,"{'GENE_SYMBOL': 'NF1', 'COSMIC_GENE_ID': 'COSG...",NF1,p.?,3_prime_UTR_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,chr3,198586274,.,C,T,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:29:13:2:0.153846:DV,...,13,2,0.153846,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.P4176S,missense_variant
239,chr6,160011545,.,C,A,1.2,refCall,GENE_SYMBOL=EZR;COSMIC_GENE_ID=COSG78273;TRANS...,GT:GQ:DP:AD:VAF:C,./.:6:16:2:0.125:DV,...,16,2,0.125000,./.,homozygous,False,"{'GENE_SYMBOL': 'EZR', 'COSMIC_GENE_ID': 'COSG...",EZR,p.?,3_prime_UTR_variant
246,chr7,88114554,.,A,C,0.4,refCall,GENE_SYMBOL=GRM3;COSMIC_GENE_ID=COSG74476;TRAN...,GT:GQ:DP:AD:VAF:C,./.:11:23:3:0.130435:DV,...,23,3,0.130435,./.,homozygous,True,"{'GENE_SYMBOL': 'GRM3', 'COSMIC_GENE_ID': 'COS...",GRM3,p.?,3_prime_UTR_variant
270,chr8,117984445,.,TA,T,30.3,PASS,GENE_SYMBOL=RAD21;COSMIC_GENE_ID=COSG63146;TRA...,GT:GQ:DP:AD:VAF:C,1/1:3:40:27:0.675:DV,...,40,27,0.675000,1/1,homozygous,True,"{'GENE_SYMBOL': 'RAD21', 'COSMIC_GENE_ID': 'CO...",RAD21,p.?,"intron_variant,splice_region_variant"


In [46]:
df_syn = df_intronless[df_intronless["mutation_description"] != "synonymous_variant"]
df_syn

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
8,chr1,147188363,.,A,AG,0,refCall,GENE_SYMBOL=BCL9;COSMIC_GENE_ID=COSG114870;TRA...,GT:GQ:DP:AD:VAF:C,0/0:60:34:4:0.117647:DV,...,34,4,0.117647,0/0,homozygous,True,"{'GENE_SYMBOL': 'BCL9', 'COSMIC_GENE_ID': 'COS...",BCL9,p.P517Lfs*5,
35,chr12,27548271,.,CT,C,0,refCall,GENE_SYMBOL=PPFIBP1;COSMIC_GENE_ID=COSG71462;T...,GT:GQ:DP:AD:VAF:C,0/0:60:27:3:0.111111:DV,...,27,3,0.111111,0/0,homozygous,True,"{'GENE_SYMBOL': 'PPFIBP1', 'COSMIC_GENE_ID': '...",PPFIBP1,p.F530*,
44,chr13,47676495,.,GA,G,0,refCall,GENE_SYMBOL=RB1;COSMIC_GENE_ID=COSG77480;TRANS...,GT:GQ:DP:AD:VAF:C,0/0:59:22:3:0.136364:DV,...,22,3,0.136364,0/0,homozygous,True,"{'GENE_SYMBOL': 'RB1', 'COSMIC_GENE_ID': 'COSG...",RB1,p.K616Rfs*36,
55,chr16,9797423,.,G,A,22,PASS,GENE_SYMBOL=GRIN2A;COSMIC_GENE_ID=COSG108203;T...,GT:GQ:DP:AD:VAF:C,0/1:22:37:17:0.459:P,...,37,17,0.459000,0/1,heterozygous,True,"{'GENE_SYMBOL': 'GRIN2A', 'COSMIC_GENE_ID': 'C...",GRIN2A,p.T1212M,missense_variant
67,chr17,32320390,.,AT,A,0,refCall,GENE_SYMBOL=NF1;COSMIC_GENE_ID=COSG113820;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:47:31:4:0.129032:DV,...,31,4,0.129032,0/0,homozygous,True,"{'GENE_SYMBOL': 'NF1', 'COSMIC_GENE_ID': 'COSG...",NF1,p.?,3_prime_UTR_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,chr3,198586274,.,C,T,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:29:13:2:0.153846:DV,...,13,2,0.153846,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.P4176S,missense_variant
239,chr6,160011545,.,C,A,1.2,refCall,GENE_SYMBOL=EZR;COSMIC_GENE_ID=COSG78273;TRANS...,GT:GQ:DP:AD:VAF:C,./.:6:16:2:0.125:DV,...,16,2,0.125000,./.,homozygous,False,"{'GENE_SYMBOL': 'EZR', 'COSMIC_GENE_ID': 'COSG...",EZR,p.?,3_prime_UTR_variant
246,chr7,88114554,.,A,C,0.4,refCall,GENE_SYMBOL=GRM3;COSMIC_GENE_ID=COSG74476;TRAN...,GT:GQ:DP:AD:VAF:C,./.:11:23:3:0.130435:DV,...,23,3,0.130435,./.,homozygous,True,"{'GENE_SYMBOL': 'GRM3', 'COSMIC_GENE_ID': 'COS...",GRM3,p.?,3_prime_UTR_variant
270,chr8,117984445,.,TA,T,30.3,PASS,GENE_SYMBOL=RAD21;COSMIC_GENE_ID=COSG63146;TRA...,GT:GQ:DP:AD:VAF:C,1/1:3:40:27:0.675:DV,...,40,27,0.675000,1/1,homozygous,True,"{'GENE_SYMBOL': 'RAD21', 'COSMIC_GENE_ID': 'CO...",RAD21,p.?,"intron_variant,splice_region_variant"


In [47]:
df3 = df_syn[df_syn["mutation_description"] != "3_prime_UTR_variant"]
df3

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
8,chr1,147188363,.,A,AG,0,refCall,GENE_SYMBOL=BCL9;COSMIC_GENE_ID=COSG114870;TRA...,GT:GQ:DP:AD:VAF:C,0/0:60:34:4:0.117647:DV,...,34,4,0.117647,0/0,homozygous,True,"{'GENE_SYMBOL': 'BCL9', 'COSMIC_GENE_ID': 'COS...",BCL9,p.P517Lfs*5,
35,chr12,27548271,.,CT,C,0,refCall,GENE_SYMBOL=PPFIBP1;COSMIC_GENE_ID=COSG71462;T...,GT:GQ:DP:AD:VAF:C,0/0:60:27:3:0.111111:DV,...,27,3,0.111111,0/0,homozygous,True,"{'GENE_SYMBOL': 'PPFIBP1', 'COSMIC_GENE_ID': '...",PPFIBP1,p.F530*,
44,chr13,47676495,.,GA,G,0,refCall,GENE_SYMBOL=RB1;COSMIC_GENE_ID=COSG77480;TRANS...,GT:GQ:DP:AD:VAF:C,0/0:59:22:3:0.136364:DV,...,22,3,0.136364,0/0,homozygous,True,"{'GENE_SYMBOL': 'RB1', 'COSMIC_GENE_ID': 'COSG...",RB1,p.K616Rfs*36,
55,chr16,9797423,.,G,A,22,PASS,GENE_SYMBOL=GRIN2A;COSMIC_GENE_ID=COSG108203;T...,GT:GQ:DP:AD:VAF:C,0/1:22:37:17:0.459:P,...,37,17,0.459000,0/1,heterozygous,True,"{'GENE_SYMBOL': 'GRIN2A', 'COSMIC_GENE_ID': 'C...",GRIN2A,p.T1212M,missense_variant
70,chr17,59225675,.,AC,A,0.4,refCall,GENE_SYMBOL=RNF43;COSMIC_GENE_ID=COSG77343;TRA...,GT:GQ:DP:AD:VAF:C,./.:10:25:4:0.16:DV,...,25,4,0.160000,./.,homozygous,True,"{'GENE_SYMBOL': 'RNF43', 'COSMIC_GENE_ID': 'CO...",RNF43,p.G659Afs*87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,chr3,198586146,.,G,A,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:32:14:2:0.142857:DV,...,14,2,0.142857,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.S4133N,missense_variant
214,chr3,198586217,.,C,G,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:37:14:2:0.142857:DV,...,14,2,0.142857,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.H4157D,missense_variant
215,chr3,198586274,.,C,T,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:29:13:2:0.153846:DV,...,13,2,0.153846,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.P4176S,missense_variant
270,chr8,117984445,.,TA,T,30.3,PASS,GENE_SYMBOL=RAD21;COSMIC_GENE_ID=COSG63146;TRA...,GT:GQ:DP:AD:VAF:C,1/1:3:40:27:0.675:DV,...,40,27,0.675000,1/1,homozygous,True,"{'GENE_SYMBOL': 'RAD21', 'COSMIC_GENE_ID': 'CO...",RAD21,p.?,"intron_variant,splice_region_variant"


In [48]:
df5 = df3[df3["mutation_description"] != "5_prime_UTR_variant"]
df5

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
8,chr1,147188363,.,A,AG,0,refCall,GENE_SYMBOL=BCL9;COSMIC_GENE_ID=COSG114870;TRA...,GT:GQ:DP:AD:VAF:C,0/0:60:34:4:0.117647:DV,...,34,4,0.117647,0/0,homozygous,True,"{'GENE_SYMBOL': 'BCL9', 'COSMIC_GENE_ID': 'COS...",BCL9,p.P517Lfs*5,
35,chr12,27548271,.,CT,C,0,refCall,GENE_SYMBOL=PPFIBP1;COSMIC_GENE_ID=COSG71462;T...,GT:GQ:DP:AD:VAF:C,0/0:60:27:3:0.111111:DV,...,27,3,0.111111,0/0,homozygous,True,"{'GENE_SYMBOL': 'PPFIBP1', 'COSMIC_GENE_ID': '...",PPFIBP1,p.F530*,
44,chr13,47676495,.,GA,G,0,refCall,GENE_SYMBOL=RB1;COSMIC_GENE_ID=COSG77480;TRANS...,GT:GQ:DP:AD:VAF:C,0/0:59:22:3:0.136364:DV,...,22,3,0.136364,0/0,homozygous,True,"{'GENE_SYMBOL': 'RB1', 'COSMIC_GENE_ID': 'COSG...",RB1,p.K616Rfs*36,
55,chr16,9797423,.,G,A,22,PASS,GENE_SYMBOL=GRIN2A;COSMIC_GENE_ID=COSG108203;T...,GT:GQ:DP:AD:VAF:C,0/1:22:37:17:0.459:P,...,37,17,0.459000,0/1,heterozygous,True,"{'GENE_SYMBOL': 'GRIN2A', 'COSMIC_GENE_ID': 'C...",GRIN2A,p.T1212M,missense_variant
70,chr17,59225675,.,AC,A,0.4,refCall,GENE_SYMBOL=RNF43;COSMIC_GENE_ID=COSG77343;TRA...,GT:GQ:DP:AD:VAF:C,./.:10:25:4:0.16:DV,...,25,4,0.160000,./.,homozygous,True,"{'GENE_SYMBOL': 'RNF43', 'COSMIC_GENE_ID': 'CO...",RNF43,p.G659Afs*87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,chr3,198586146,.,G,A,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:32:14:2:0.142857:DV,...,14,2,0.142857,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.S4133N,missense_variant
214,chr3,198586217,.,C,G,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:37:14:2:0.142857:DV,...,14,2,0.142857,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.H4157D,missense_variant
215,chr3,198586274,.,C,T,0,refCall,GENE_SYMBOL=MUC4;COSMIC_GENE_ID=COSG97438;TRAN...,GT:GQ:DP:AD:VAF:C,0/0:29:13:2:0.153846:DV,...,13,2,0.153846,0/0,homozygous,False,"{'GENE_SYMBOL': 'MUC4', 'COSMIC_GENE_ID': 'COS...",MUC4,p.P4176S,missense_variant
270,chr8,117984445,.,TA,T,30.3,PASS,GENE_SYMBOL=RAD21;COSMIC_GENE_ID=COSG63146;TRA...,GT:GQ:DP:AD:VAF:C,1/1:3:40:27:0.675:DV,...,40,27,0.675000,1/1,homozygous,True,"{'GENE_SYMBOL': 'RAD21', 'COSMIC_GENE_ID': 'CO...",RAD21,p.?,"intron_variant,splice_region_variant"


In [49]:
df5["qual"] = pd.to_numeric(df5["qual"]).astype(int)

df_fin = df5[
    (df5["filter"] == "PASS") &
    (df5["qual"] > 5) &
    (df5["AD"] > 5) &
    (df5["mutation_description"].notna())
]

df_fin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5["qual"] = pd.to_numeric(df5["qual"]).astype(int)


Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
55,chr16,9797423,.,G,A,22,PASS,GENE_SYMBOL=GRIN2A;COSMIC_GENE_ID=COSG108203;T...,GT:GQ:DP:AD:VAF:C,0/1:22:37:17:0.459:P,...,37,17,0.459,0/1,heterozygous,True,"{'GENE_SYMBOL': 'GRIN2A', 'COSMIC_GENE_ID': 'C...",GRIN2A,p.T1212M,missense_variant
270,chr8,117984445,.,TA,T,30,PASS,GENE_SYMBOL=RAD21;COSMIC_GENE_ID=COSG63146;TRA...,GT:GQ:DP:AD:VAF:C,1/1:3:40:27:0.675:DV,...,40,27,0.675,1/1,homozygous,True,"{'GENE_SYMBOL': 'RAD21', 'COSMIC_GENE_ID': 'CO...",RAD21,p.?,"intron_variant,splice_region_variant"


In [50]:
print(df_fin["mutation_description"].unique())
print(df_fin["gene"].unique())

['missense_variant' 'intron_variant,splice_region_variant']
['GRIN2A' 'RAD21']


In [51]:
df_fin.to_csv("../pepper-tumor1B01/normalized_annotated_tumor_somatic_new_filtered_mutated.vcf", sep="\t")

In [2]:
import pandas as pd
df = pd.read_csv("../pepper-tumor1B01/normalized_annotated_shared_germline_new_filtered_mutated.vcf",
                 sep="\t",
                 comment="#",
                 dtype="str",
                 header=None,
                 names=["no", "chr","pos","id","ref","alt","qual","filter","info","format","sample"])
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,no,chr,pos,id,ref,alt,qual,filter,info,format,sample
,chr,pos,id,ref,alt,qual,filter,info,format,sample,sample_info,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
92,chr1,14950750,.,T,C,23,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TRANSCRIPT_ACCESSION=ENST00000333868.9;COSMIC_SAMPLE_ID=COSS2785973;COSMIC_PHENOTYPE_ID=COSO36004837;GENOMIC_MUTATION_ID=COSV61600812;LEGACY_MUTATION_ID=COSM6281180;MUTATION_ID=110430602;MUTATION_CDS=c.662A>G;MUTATION_AA=p.Q221R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=15506048;GENOME_STOP=15506048;STRAND=-;PUBMED_PMID=27175599;COSMIC_STUDY_ID;HGVSP=ENSP00000330237.5:p.Gln221Arg;HGVSC=ENST00000333868.9:c.662A>G;HGVSG=1:g.15506048T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,1/1:23:28:28:1:P,"{'GT': '1/1', 'GQ': '23', 'DP': 28, 'AD': 28, ...",28,28,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.Q221R,missense_variant
97,chr1,14968816,.,G,A,25,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TRANSCRIPT_ACCESSION=ENST00000333868.9;COSMIC_SAMPLE_ID=COSS2296299;COSMIC_PHENOTYPE_ID=COSO29324830;GENOMIC_MUTATION_ID=COSV61600760;LEGACY_MUTATION_ID=COSM3750476;MUTATION_ID=110430846;MUTATION_CDS=c.83C>T;MUTATION_AA=p.A28V;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=15524118;GENOME_STOP=15524118;STRAND=-;PUBMED_PMID=25275298;COSMIC_STUDY_ID;HGVSP=ENSP00000330237.5:p.Ala28Val;HGVSC=ENST00000333868.9:c.83C>T;HGVSG=1:g.15524118G>A;GENOMIC_WT_ALLELE=G;GENOMIC_MUT_ALLELE=A,GT:GQ:DP:AD:VAF:C,1/1:25:42:42:1:P,"{'GT': '1/1', 'GQ': '25', 'DP': 42, 'AD': 42, ...",42,42,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.A28V,missense_variant
126,chr1,36149695,.,C,T,24,PASS,GENE_SYMBOL=THRAP3;COSMIC_GENE_ID=COSG77463;TRANSCRIPT_ACCESSION=ENST00000354618.9;COSMIC_SAMPLE_ID=COSS2955773;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV100663623;LEGACY_MUTATION_ID=COSM9180497;MUTATION_ID=113249093;MUTATION_CDS=c.602C>T;MUTATION_AA=p.A201V;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=36286832;GENOME_STOP=36286832;STRAND=+;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000346634.5:p.Ala201Val;HGVSC=ENST00000354618.9:c.602C>T;HGVSG=1:g.36286832C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,0/1:24:35:16:0.457:P,"{'GT': '0/1', 'GQ': '24', 'DP': 35, 'AD': 16, ...",35,16,0.457,0/1,heterozygous,True,"{'GENE_SYMBOL': 'THRAP3', 'COSMIC_GENE_ID': 'C...",THRAP3,p.A201V,missense_variant
132,chr1,47138842,.,T,C,19,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRANSCRIPT_ACCESSION=ENST00000371877.7;COSMIC_SAMPLE_ID=COSS2185970;COSMIC_PHENOTYPE_ID=COSO36284888;GENOMIC_MUTATION_ID=COSV54551803;LEGACY_MUTATION_ID=COSM4144058;MUTATION_ID=118445599;MUTATION_CDS=c.2954A>G;MUTATION_AA=p.H985R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=47260415;GENOME_STOP=47260415;STRAND=-;PUBMED_PMID;COSMIC_STUDY_ID=COSU589;HGVSP=ENSP00000360944.3:p.His985Arg;HGVSC=ENST00000371877.7:c.2954A>G;HGVSG=1:g.47260415T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,0/1:19:28:14:0.5:P,"{'GT': '0/1', 'GQ': '19', 'DP': 28, 'AD': 14, ...",28,14,0.5,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.H985R,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4424,chr9,127266169,.,C,T,23,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRANSCRIPT_ACCESSION=ENST00000350763.8;COSMIC_SAMPLE_ID=COSS2955809;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV60785681;LEGACY_MUTATION_ID=COSM5009671;MUTATION_ID=113141353;MUTATION_CDS=c.3197G>A;MUTATION_AA=p.R1066H;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=115073620;GENOME_STOP=115073620;STRAND=-;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000265131.4:p.Arg1066His;HGVSC=ENST00000350763.8:c.3197G>A;HGVSG=9:g.115073620C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,0/1:23:29:21:0.724:P,"{'GT': '0/1', 'GQ': '23', 'DP': 29, 'AD': 21, ...",29,21,0.724,0/1,heterozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.R1066H,missense_variant
4427,chr9,127278662,.,T,C,25,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRANSCRIPT_ACCESSION=ENST00000350763.8;COSMIC_SAMPLE_ID=COSS2955802;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV107406352;LEGACY_MUTATION_ID=COSM10541708;MUTATION_ID=113142731;MUTATION_CDS=c.1616A>G;MUTATION_AA=p.Q539R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=115086115;GENOME_STOP=115086115;STRAND=-;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000265131.4:p.Gln539Arg;HGVSC=ENST00000350763.8:c.1616A>G;HGVSG=9:g.115086115T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,1/1:25:49:49:1:P,"{'GT': '1/1', 'GQ': '25', 'DP': 49, 'AD': 49, ...",49,49,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.Q539R,missense_variant
4441,chr9,133284599,.,G,A,25,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TRANSCRIPT_ACCESSION=ENST00000373855.5;COSMIC_SAMPLE_ID=COSS2955803;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV53042713;LEGACY_MUTATION_ID=COSM6248842;MUTATION_ID=116323577;MUTATION_CDS=c.166G>A;MUTATION_AA=p.V56I;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=121088492;GENOME_STOP=121088492;STRAND=+;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000362962.1:p.Val56Ile;HGVSC=ENST00000373855.5:c.166G>A;HGVSG=9:g.121088492G>A;GENOMIC_WT_ALLELE=G;GENOMIC_MUT_ALLELE=A,GT:GQ:DP:AD:VAF:C,1/1:25:28:28:1:P,"{'GT': '1/1', 'GQ': '25', 'DP': 28, 'AD': 28, ...",28,28,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.V56I,missense_variant
4445,chr9,133294439,.,C,T,26,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TRANSCRIPT_ACCESSION=ENST00000373855.5;COSMIC_SAMPLE_ID=COSS2385246;COSMIC_PHENOTYPE_ID=COSO27984932;GENOMIC_MUTATION_ID=COSV53045057;LEGACY_MUTATION_ID=COSM4407606;MUTATION_ID=116323581;MUTATION_CDS=c.647C>T;MUTATION_AA=p.P216L;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=121098411;GENOME_STOP=121098411;STRAND=+;PUBMED_PMID;COSMIC_STUDY_ID=COSU533;HGVSP=ENSP00000362962.1:p.Pro216Leu;HGVSC=ENST00000373855.5:c.647C>T;HGVSG=9:g.121098411C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,1/1:26:39:39:1:P,"{'GT': '1/1', 'GQ': '26', 'DP': 39, 'AD': 39, ...",39,39,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.P216L,missense_variant


In [3]:
print(df["sample"].unique())
print(df["info"].unique())

['mutation_description' 'missense_variant'
 'missense_variant,splice_region_variant' 'stop_gained']
['gene' 'CASP9' 'THRAP3' 'STIL' 'TENT5C' 'NOTCH2' 'NTRK1' 'FCRL4' 'PBX1'
 'BMPR1A' 'NUTM2D' 'MUC6' 'DDX10' 'KDM5A' 'COL2A1' 'NACA' 'HMGA2' 'PTPRB'
 'SETD1B' 'POLE' 'BAZ1A' 'TRIP11' 'GOLGA5' 'BUB1B' 'NTRK3' 'CDH11' 'ZFHX3'
 'RFWD3' 'FANCA' 'PER1' 'SPECC1' 'ERBB2' 'RNF43' 'RNF213' 'ASPSCR1'
 'SETBP1' 'MUC16' 'ALK' 'BIRC6' 'MSH6' 'RGPD3' 'RANBP2' 'BARD1' 'CRNKL1'
 'TMPRSS2' 'ISX' 'MITF' 'GATA2' 'ATR' 'MLF1' 'N4BP2' 'AFF1' 'TET2' 'FAT1'
 'SDHA' 'DROSHA' 'IL7R' 'IL6ST' 'RAD17' 'FGFR4' 'FLT4' 'HLA-A' 'CCND3'
 'ECT2L' 'PMS2' 'CUX1' 'PCM1' 'WRN' 'NRG1' 'NBN' 'CSMD3' 'WNK2' 'TNC'
 'CNTRL' 'BTK']


In [8]:
mlem = (df["info"].value_counts())
print(mlem[mlem > 1])


info
MUC16     44
FAT1       6
FANCA      5
CRNKL1     4
RNF213     4
ALK        3
NUTM2D     3
MUC6       3
FGFR4      3
TNC        3
RFWD3      2
N4BP2      2
SPECC1     2
PER1       2
NTRK1      2
IL6ST      2
STIL       2
CASP9      2
HLA-A      2
RGPD3      2
PTPRB      2
NACA       2
PCM1       2
CNTRL      2
CSMD3      2
SDHA       2
Name: count, dtype: int64
