In [1]:
import pandas as pd

- INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
- INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
- INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
- INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
- INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
- INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">
- INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">
- INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO' ">
- INFO=<ID=LOF,Number=.,Type=String,Description="Predicted loss of function effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
- INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">

In [5]:
# Define function: split each key-value pair in INFO into two columns
def parse_info_field(info_str):
    """Parse one VCF INFO string into a Series indexed by INFO key.

    The string is split on ';'. A ``KEY=VALUE`` entry stores VALUE (kept as
    a string, no type conversion) under KEY. An entry without '=' is treated
    as a flag and stored as ``True`` under its own name.

    Parameters
    ----------
    info_str : str
        Raw content of the INFO column for a single record.

    Returns
    -------
    pandas.Series
        Values in the order they appear in ``info_str``, indexed by key.
    """
    keys = []
    values = []
    for field in info_str.split(';'):
        # Split on the first '=' only: a value may itself contain '=', and
        # unpacking the result of a plain split('=') would then raise
        # ValueError. partition() returns an empty separator when no '=' is
        # present, which marks the entry as a flag.
        key, separator, value = field.partition('=')
        keys.append(key)
        values.append(value if separator else True)
    return pd.Series(values, index=keys)

# Read the VCF as a tab-separated table. Lines starting with '#' (the VCF
# header) are skipped, so there is no header row and names are set below.
df = pd.read_csv('/nfs/research/goldman/zihao/Datas/p1/File_5_annot/SRR20358470.annot.vcf', delimiter='\t', comment='#', header=None,
                 dtype={0: str, 1: int, 2: str, 3: str, 4: str, 5: float, 6: str, 7: str})

# Set column names
df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

# Split the INFO column into one column per INFO key. Keys missing from a
# record end up as NaN in that row; flag keys are True where present.
df_info = df['INFO'].apply(parse_info_field)

# Add the processed result to the original data frame
df = pd.concat([df, df_info], axis=1)

# Split the string in column DP4 into 4 numbers
df[['REF_FWD', 'REF_REV', 'ALT_FWD', 'ALT_REV']] = df['DP4'].str.split(',', expand=True).astype(int)

# df_info only has a column for an INFO key if at least one record carries
# it, so some of the columns listed here may not exist for a given file.
# errors='ignore' drops the ones that are present instead of raising KeyError.
df = df.drop(['ID', 'FILTER', 'INFO', 'DP4', 'ANN', 'HRUN', 'LOF', 'NMD'], axis=1, errors='ignore')
# Alternative: keep only the numeric summary columns.
# df = df[['POS', 'DP', 'AF', 'SB', 'REF_FWD', 'REF_REV', 'ALT_FWD', 'ALT_REV']]
df


Unnamed: 0,#CHROM,POS,REF,ALT,QUAL,DP,AF,SB,INDEL,REF_FWD,REF_REV,ALT_FWD,ALT_REV
0,NC_045512.2,40,CT,C,62.0,3656,0.001368,0,True,3649,0,5,0
1,NC_045512.2,241,C,T,49314.0,3979,0.997989,0,,1,1,1514,2457
2,NC_045512.2,314,AGTT,A,66.0,3016,0.001658,0,True,5,3005,0,5
3,NC_045512.2,323,G,GT,77.0,3093,0.001940,0,True,4,3083,0,6
4,NC_045512.2,330,AG,A,69.0,13057,0.000613,6,True,9910,3139,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,NC_045512.2,29139,AG,A,87.0,10642,0.000846,11,True,7352,3281,9,0
317,NC_045512.2,29143,AC,A,89.0,10655,0.000845,11,True,7352,3294,9,0
318,NC_045512.2,29360,T,G,265.0,8038,0.025628,604,,4243,3584,4,202
319,NC_045512.2,29625,C,T,49314.0,6774,0.996014,2,,9,7,3342,3405


In [21]:
pd.set_option('display.max_columns', None)
# Take ANN from df_info rather than df: the loading cell drops 'ANN' from df,
# so df['ANN'] raises KeyError when the notebook is run top to bottom.
# Each '|'-separated annotation field becomes its own integer-named column.
split = df_info['ANN'].str.split('|', expand=True)
split

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,C,intergenic_region,MODIFIER,CHR_START-ORF1ab,CHR_START-GU280_gp01,intergenic_region,CHR_START-GU280_gp01,,,n.41delT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,T,intergenic_region,MODIFIER,CHR_START-ORF1ab,CHR_START-GU280_gp01,intergenic_region,CHR_START-GU280_gp01,,,n.241C>T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,A,disruptive_inframe_deletion,MODERATE,ORF1ab,GU280_gp01,transcript,GU280_gp01,protein_coding,1/2,c.50_52delGTT,p.Ser17_Leu18delinsMet,50/21291,50/21291,17/7096,,",A",disruptive_inframe_deletion,MODERATE,ORF1ab,GU280_gp01,transcript,YP_009725297.1,protein_coding,1/1,c.50_52delGTT,p.Ser17_Leu18delinsMet,50/540,50/540,17/179,,"WARNING_TRANSCRIPT_NO_STOP_CODON,A",disruptive_inframe_deletion,MODERATE,ORF1ab,GU280_gp01,transcript,YP_009742608.1,protein_coding,1/1,c.50_52delGTT,p.Ser17_Leu18delinsMet,50/540,50/540,17/179,,"WARNING_TRANSCRIPT_NO_STOP_CODON,A",disruptive_inframe_deletion,MODERATE,ORF1ab,GU280_gp01,transcript,GU280_gp01.2,protein_coding,1/1,c.50_52delGTT,p.Ser17_Leu18delinsMet,50/13218,50/13218,17/4405,,
3,GT,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,GU280_gp01,protein_coding,1/2,c.62dupT,p.Leu21fs,63/21291,63/21291,21/7096,,"INFO_REALIGN_3_PRIME,GT",frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,YP_009725297.1,protein_coding,1/1,c.62dupT,p.Leu21fs,63/540,63/540,21/179,,WARNING_TRANSCRIPT_NO_STOP_CODON&INFO_REALIGN_...,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,YP_009742608.1,protein_coding,1/1,c.62dupT,p.Leu21fs,63/540,63/540,21/179,,WARNING_TRANSCRIPT_NO_STOP_CODON&INFO_REALIGN_...,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,GU280_gp01.2,protein_coding,1/1,c.62dupT,p.Leu21fs,63/13218,63/13218,21/4405,,INFO_REALIGN_3_PRIME
4,A,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,GU280_gp01,protein_coding,1/2,c.67delG,p.Val23fs,67/21291,67/21291,23/7096,,"INFO_REALIGN_3_PRIME,A",frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,YP_009725297.1,protein_coding,1/1,c.67delG,p.Val23fs,67/540,67/540,23/179,,WARNING_TRANSCRIPT_NO_STOP_CODON&INFO_REALIGN_...,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,YP_009742608.1,protein_coding,1/1,c.67delG,p.Val23fs,67/540,67/540,23/179,,WARNING_TRANSCRIPT_NO_STOP_CODON&INFO_REALIGN_...,frameshift_variant,HIGH,ORF1ab,GU280_gp01,transcript,GU280_gp01.2,protein_coding,1/1,c.67delG,p.Val23fs,67/13218,67/13218,23/4405,,INFO_REALIGN_3_PRIME
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,A,frameshift_variant,HIGH,N,GU280_gp10,transcript,GU280_gp10,protein_coding,1/1,c.868delG,p.Glu290fs,868/1260,868/1260,290/419,,INFO_REALIGN_3_PRIME,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
317,A,frameshift_variant,HIGH,N,GU280_gp10,transcript,GU280_gp10,protein_coding,1/1,c.871delC,p.Leu291fs,871/1260,871/1260,291/419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
318,G,missense_variant,MODERATE,N,GU280_gp10,transcript,GU280_gp10,protein_coding,1/1,c.1087T>G,p.Phe363Val,1087/1260,1087/1260,363/419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
319,T,missense_variant,MODERATE,ORF10,GU280_gp11,transcript,GU280_gp11,protein_coding,1/1,c.68C>T,p.Ser23Phe,68/117,68/117,23/38,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [28]:
# Print the distinct values of the first ten ANN sub-fields, each preceded
# by a separator line.
banner = "+++++++++++++++++++++++++"
for field_idx in range(10):
    print(banner)
    distinct_values = split[field_idx].unique()
    print(distinct_values)

+++++++++++++++++++++++++
['C' 'T' 'A' 'GT' 'G' 'CT' 'GA' 'CA' 'AT' 'TG' 'TGAGCCAGAA' 'AC']
+++++++++++++++++++++++++
['intergenic_region' 'disruptive_inframe_deletion' 'frameshift_variant'
 'conservative_inframe_deletion' 'synonymous_variant' 'missense_variant'
 'frameshift_variant&stop_gained' 'disruptive_inframe_insertion']
+++++++++++++++++++++++++
['MODIFIER' 'MODERATE' 'HIGH' 'LOW']
+++++++++++++++++++++++++
['CHR_START-ORF1ab' 'ORF1ab' 'S' 'ORF3a' 'E' 'M' 'ORF6' 'ORF7a' 'ORF7b'
 'ORF8' 'ORF8-N' 'N' 'ORF10' 'ORF10-CHR_END']
+++++++++++++++++++++++++
['CHR_START-GU280_gp01' 'GU280_gp01' 'GU280_gp02' 'GU280_gp03'
 'GU280_gp04' 'GU280_gp05' 'GU280_gp06' 'GU280_gp07' 'GU280_gp08'
 'GU280_gp09' 'GU280_gp09-GU280_gp10' 'GU280_gp10' 'GU280_gp11'
 'GU280_gp11-CHR_END']
+++++++++++++++++++++++++
['intergenic_region' 'transcript']
+++++++++++++++++++++++++
['CHR_START-GU280_gp01' 'GU280_gp01' 'GU280_gp02' 'GU280_gp03'
 'GU280_gp04' 'GU280_gp05' 'GU280_gp06' 'GU280_gp07' 'GU280_gp08'
 'GU28