In [2]:
import pysam
from collections import defaultdict

# Input and output files
vcf_file = '/lustre/scratch126/casm/team274sb/lr26/hg38/cleaned_cosmic_hg38.vcf.gz'
output_file = '/lustre/scratch126/casm/team274sb/lr26/hg38/cosmic_summaryhg38.tsv'

# Dictionary to store variant data and counts
variant_data = defaultdict(lambda: {
    'gene': '.',
    'aa': '.',
    'desc': '.',
    'count': 0
})

# Open VCF file
vcf = pysam.VariantFile(vcf_file, "r")

# Iterate through each record in the VCF
for record in vcf.fetch():
    # Define unique variant key by genomic coordinates and alleles
    key = (record.chrom, record.pos, record.ref, tuple(record.alts))

    # Update count and store annotation if it's the first time
    variant_data[key]['count'] += 1

    # Store annotations (just use the first one encountered)
    if variant_data[key]['gene'] == '.':
        variant_data[key]['gene'] = record.info.get('GENE_SYMBOL', '.')
        variant_data[key]['aa'] = record.info.get('MUTATION_AA', '.')
        variant_data[key]['desc'] = record.info.get('MUTATION_DESCRIPTION', '.')

# Write results to a TSV file
with open(output_file, 'w') as out:
    out.write("CHROM\tPOS\tREF\tALT\tCOUNT\tGENE_SYMBOL\tMUTATION_AA\tMUTATION_DESCRIPTION\n")
    for (chrom, pos, ref, alts), info in variant_data.items():
        alt_str = ",".join(alts)
        out.write(f"{chrom}\t{pos}\t{ref}\t{alt_str}\t{info['count']}\t{info['gene']}\t{info['aa']}\t{info['desc']}\n")

print("✅ Output saved to", output_file)



✅ Output saved to /lustre/scratch126/casm/team274sb/lr26/hg38/cosmic_summaryhg38.tsv


In [8]:
# Input and output files
vcf_file = '/lustre/scratch126/casm/team274sb/lr26/T2T/Cosmic_MutantCensus_sorted_T2T.vcf.gz'
output_file = '/lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summaryt2t.tsv'

# Dictionary to store variant data and counts
variant_data = defaultdict(lambda: {
    'gene': '.',
    'aa': '.',
    'desc': '.',
    'count': 0
})

# Open the VCF file
vcf = pysam.VariantFile(vcf_file, "r")

# Iterate over records
for record in vcf.fetch():
    if record.alts is None:
        continue  # Skip invalid/missing ALT entries

    for alt in record.alts:
        # ⚠️ Make a precise key: include ALT separately
        key = (record.chrom, record.pos, record.ref, alt)

        # Increment count
        variant_data[key]['count'] += 1

        # Save annotations if not yet set
        if variant_data[key]['gene'] == '.':
            variant_data[key]['gene'] = record.info.get('GENE_SYMBOL', '.')
            variant_data[key]['aa'] = record.info.get('MUTATION_AA', '.')
            variant_data[key]['desc'] = record.info.get('MUTATION_DESCRIPTION', '.')

# Write output
with open(output_file, 'w') as out:
    out.write("CHROM\tPOS\tREF\tALT\tCOUNT\tGENE_SYMBOL\tMUTATION_AA\tMUTATION_DESCRIPTION\n")
    for (chrom, pos, ref, alt), info in variant_data.items():
        out.write(f"{chrom}\t{pos}\t{ref}\t{alt}\t{info['count']}\t{info['gene']}\t{info['aa']}\t{info['desc']}\n")

print("✅ Done. Output saved to:", output_file)

✅ Done. Output saved to: /lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summaryt2t.tsv


In [1]:
import pandas as pd
cosmic = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summaryt2t.tsv", sep="\t")
cosmic

Unnamed: 0,CHROM,POS,REF,ALT,COUNT,GENE_SYMBOL,MUTATION_AA,MUTATION_DESCRIPTION
0,chr1,1664129,G,T,1,SKI,p.?,5_prime_UTR_variant
1,chr1,1664168,A,C,1,SKI,p.?,5_prime_UTR_variant
2,chr1,1664194,G,T,1,SKI,p.G7C,missense_variant
3,chr1,1664208,C,T,1,SKI,p.F11=,synonymous_variant
4,chr1,1664242,C,T,1,SKI,p.Q23*,stop_gained
...,...,...,...,...,...,...,...,...
1081083,chrY,1329683,A,G,3,P2RY8,p.Y82H,missense_variant
1081084,chrY,1329712,A,G,2,P2RY8,p.L72P,missense_variant
1081085,chrY,1329789,C,T,3,P2RY8,p.W46*,stop_gained
1081086,chrY,1355143,A,C,1,P2RY8,p.?,intron_variant


In [2]:
cosmic_filtered = cosmic[cosmic["COUNT"] > 10]

In [3]:
cosmic_filtered.to_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summary_hotspots_t2t.tsv", sep="\t")

In [9]:
import pandas as pd

# Load TSV files
df1 = pd.read_csv('/lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summaryt2t.tsv', sep='\t')
df2 = pd.read_csv('/lustre/scratch126/casm/team274sb/lr26/hg38/cosmic_summaryhg38.tsv', sep='\t')

df1
df2
# Drop the second column (assuming it's always at index 1)
df1_dropped = df1.drop(df1.columns[[0,1]], axis=1)
df2_dropped = df2.drop(df2.columns[[0,1]], axis=1)

# Optional: Reset index for consistent comparison
df1_dropped = df1_dropped.reset_index(drop=True)
df2_dropped = df2_dropped.reset_index(drop=True)

# 1. Count identical rows
# Merge and use inner join
merged = pd.merge(df1_dropped, df2_dropped, how='inner')
identical_count = len(merged)

# 2. Find unique rows in each file
# Concatenate and drop duplicates that appear twice
all_rows = pd.concat([df1_dropped.assign(source='file1'), df2_dropped.assign(source='file2')])
unique_rows = all_rows.drop_duplicates(subset=df1_dropped.columns.tolist(), keep=False)

# If needed, separate by file
unique_file1 = unique_rows[unique_rows['source'] == 'file1'].drop(columns=['source'])
unique_file2 = unique_rows[unique_rows['source'] == 'file2'].drop(columns=['source'])

# Output results
print("Number of identical rows (ignoring POS column):", identical_count)
print("Unique rows in file1:")
print(unique_file1)
print("Unique rows in file2:")
print(unique_file2)


Number of identical rows (ignoring POS column): 323295439
Unique rows in file1:
                                                       REF  ALT  COUNT  \
19                                                     GCC    G      1   
107                                                      A   AA      1   
147                                                     TG    T      1   
153      TGCTGGGATTACAGGCGTGAGCCACTGCACCTGGCCCCTCCTGACA...    T      1   
193                                                     TG    T      2   
...                                                    ...  ...    ...   
1080719                                               GGCG    G      1   
1080724                                                  G  GCG      1   
1080739                                                  G   GG      1   
1080864                                                  C    A     43   
1080965                                                TCT    T      1   

        GENE_SYMBOL  MUTATION_A