# Phaster parser

## Preparation

In [5]:
import csv
import json
from pathlib import Path

import gffutils
import pandas as pd

In [18]:
# Project root: the parent of the current working directory.
project_path = Path().resolve().parent

# Raw PHASTER output and the location for parsed results.
input_path = project_path.joinpath("results", "intermediate", "benchmarking", "phaster_raw")
result_path = project_path.joinpath("results", "intermediate", "benchmarking", "phaster_parsed")

# Directory holding one "<genome>.gff" annotation file per genome.
gff_path = project_path.joinpath("data", "legen_v4_dereplicated_gffs")

In [26]:
# Identifier of the genome analysed in this notebook; it is passed to
# add_start_end_gene, which uses it to build the name of the GFF file to load.
genome="GCA_001435955.1"
# Alternative genome identifier; swap the comment markers to analyse it instead.
#genome="GCA_000192185.1"

## Parse Phaster output file

In [27]:
def read_phaster(file, multi_contig=True):
    """Parse the region table of a PHASTER output file into a DataFrame.

    The file is read as raw lines (the first line is skipped). The last line
    containing the text "REGION" is taken as the table header. Every line
    after it, except the final line of the file, is split on whitespace into
    one table row.

    Parameters
    ----------
    file : str or path-like
        Path of the PHASTER output file.
    multi_contig : bool, default True
        If True, each REGION_POSITION value is split on ",": the first piece
        becomes CONTIG and the last piece becomes LOCATION_SEQUENCE. The text
        after the first ":" of LOCATION_SEQUENCE (the "<start>-<end>" part)
        then replaces REGION_POSITION.
        If False, REGION_POSITION is used as is and CONTIG is set to 1 for
        every row.

    Returns
    -------
    pandas.DataFrame
        One row per region with the columns named in the header line, plus
        CONTIG, START and END (and LOCATION_SEQUENCE when ``multi_contig``).
        START and END are kept as strings.

    Raises
    ------
    ValueError
        If no line of the file contains "REGION".
    """
    df = pd.read_table(
        file,
        engine='c',
        skiprows=1,
        names=['raw'],
        quoting=csv.QUOTE_NONE,
    )

    # Locate the header: the last line mentioning "REGION".
    is_header_candidate = df['raw'].astype(str).str.contains("REGION", regex=False, na=False)
    header_rows = df.index[is_header_candidate]
    if len(header_rows) == 0:
        raise ValueError(f"No line containing 'REGION' found in {file}")
    start = header_rows[-1]

    # Keep header + data rows. The final line of the file is always dropped.
    # NOTE(review): presumably a trailing non-table line - confirm against a
    # real PHASTER file.
    table_values = df.iloc[start:len(df) - 1, :].reset_index(drop=True)

    # Separate into different columns (indicated by whitespaces).
    separated_table_values = pd.DataFrame(table_values['raw'].str.split().tolist())

    # First row contains the header.
    separated_table_values.columns = separated_table_values.iloc[0, :]
    separated_table_values = separated_table_values.iloc[1:, :].reset_index(drop=True)

    if multi_contig:
        # Pick the pieces per row with the .str accessor instead of building a
        # padded rectangular frame: rows with different numbers of ","-separated
        # pieces still yield their own first/last piece, and a table without
        # data rows yields empty columns rather than an indexing error.
        pieces = separated_table_values['REGION_POSITION'].str.split(',')
        separated_table_values['CONTIG'] = pieces.str[0]
        separated_table_values['LOCATION_SEQUENCE'] = pieces.str[-1]
        separated_table_values['REGION_POSITION'] = (
            separated_table_values['LOCATION_SEQUENCE'].str.split(':').str[1]
        )
    else:
        separated_table_values['CONTIG'] = [1] * len(separated_table_values)

    # Add columns 'START' and 'END' (information extracted from 'REGION_POSITION').
    bounds = separated_table_values['REGION_POSITION'].str.split('-')
    separated_table_values['START'] = bounds.str[0]
    separated_table_values['END'] = bounds.str[1]

    return separated_table_values

In [28]:
# NOTE(review): "Output_filename" looks like a placeholder and does not depend on
# `genome` - confirm the actual name of the PHASTER file inside input_path.
phaster=read_phaster(input_path / "Output_filename")

In [29]:
phaster

Unnamed: 0,REGION,REGION_LENGTH,COMPLETENESS(score),SPECIFIC_KEYWORD,REGION_POSITION,TRNA_NUM,TOTAL_PROTEIN_NUM,PHAGE_HIT_PROTEIN_NUM,HYPOTHETICAL_PROTEIN_NUM,PHAGE+HYPO_PROTEIN_PERCENTAGE,...,ATT_SITE_SHOWUP,PHAGE_SPECIES_NUM,MOST_COMMON_PHAGE_NAME(hit_genes_count),FIRST_MOST_COMMON_PHAGE_NUM,FIRST_MOST_COMMON_PHAGE_PERCENTAGE,GC_PERCENTAGE,CONTIG,LOCATION_SEQUENCE,START,END
0,1,6.5Kb,incomplete(30),"integrase,tail",115-6704,0,10,7,3,100%,...,no,7,"PHAGE_Staphy_187_NC_007047(1),PHAGE_Staphy_Sta...",1,10%,31.38%,AYYT01000018.1,sequence:115-6704,115,6704
1,2,12.2Kb,incomplete(20),transposase,12355-24554,0,16,10,6,100%,...,no,10,"PHAGE_Halovi_HRTV_5_NC_021320(1),PHAGE_Halovi_...",1,6.25%,30.42%,AYYT01000024.1,sequence:12355-24554,12355,24554


In [30]:
phaster

Unnamed: 0,REGION,REGION_LENGTH,COMPLETENESS(score),SPECIFIC_KEYWORD,REGION_POSITION,TRNA_NUM,TOTAL_PROTEIN_NUM,PHAGE_HIT_PROTEIN_NUM,HYPOTHETICAL_PROTEIN_NUM,PHAGE+HYPO_PROTEIN_PERCENTAGE,...,ATT_SITE_SHOWUP,PHAGE_SPECIES_NUM,MOST_COMMON_PHAGE_NAME(hit_genes_count),FIRST_MOST_COMMON_PHAGE_NUM,FIRST_MOST_COMMON_PHAGE_PERCENTAGE,GC_PERCENTAGE,CONTIG,LOCATION_SEQUENCE,START,END
0,1,6.5Kb,incomplete(30),"integrase,tail",115-6704,0,10,7,3,100%,...,no,7,"PHAGE_Staphy_187_NC_007047(1),PHAGE_Staphy_Sta...",1,10%,31.38%,AYYT01000018.1,sequence:115-6704,115,6704
1,2,12.2Kb,incomplete(20),transposase,12355-24554,0,16,10,6,100%,...,no,10,"PHAGE_Halovi_HRTV_5_NC_021320(1),PHAGE_Halovi_...",1,6.25%,30.42%,AYYT01000024.1,sequence:12355-24554,12355,24554


## Map region locations to gene numbers

In [31]:
def get_genes_multi_contig(df, start, stop, contig):
    """Return the rows of `df` lying entirely within [start, stop] on `contig`.

    A row is kept when its `start` is at least `start`, its `end` is at most
    `stop`, and its `contig` equals `contig`.
    """
    begins_inside = df.start >= start
    ends_inside = df.end <= stop
    on_contig = df.contig == contig
    return df[begins_inside & ends_inside & on_contig]

In [32]:
def get_genes(df, start, stop):
    """Return the rows of `df` lying entirely within [start, stop].

    A row is kept when its `start` is at least `start` and its `end` is at
    most `stop`; the contig is not considered.
    """
    begins_inside = df.start >= start
    ends_inside = df.end <= stop
    return df[begins_inside & ends_inside]

In [33]:
def add_start_end_gene(genome, phaster, multi_contig=True):
    """Add the first and last contained gene number to every PHASTER region.

    The CDS features of ``<gff_path>/<genome>.gff`` are loaded into a table
    (one row per CDS, numbered from 0 in query order). For each row of
    `phaster`, the CDS rows lying entirely within START..END are selected
    (restricted to the row's CONTIG when `multi_contig` is True). The table
    index of the first and last selected CDS is stored in START_GENE and
    END_GENE.

    Parameters
    ----------
    genome : str
        Genome identifier; the GFF file is looked up as ``<genome>.gff`` in
        the module-level `gff_path`.
    phaster : pandas.DataFrame
        Region table with START, END and (for `multi_contig`) CONTIG columns.
        It is modified in place.
    multi_contig : bool, default True
        Whether to match genes on the region's contig as well as on position.

    Returns
    -------
    pandas.DataFrame
        The same `phaster` object, with START_GENE and END_GENE added.

    Raises
    ------
    IndexError
        If a region does not fully contain any CDS.
    """
    gff = f"{gff_path}/{genome}.gff"

    # Build the annotation database in memory: no "<genome>_db" file is
    # written, so nothing has to be deleted afterwards and an interrupted run
    # cannot leave a stale file that makes the next create_db call fail.
    db = gffutils.create_db(gff, ":memory:")

    # Collect all CDS features first and build the frame once; growing a frame
    # row by row is quadratic and DataFrame.append was removed in pandas 2.0.
    query = db.execute("select seqid,start,end,strand,attributes from features where featuretype = 'CDS'")
    records = [
        {
            'contig': each['seqid'],
            'ID': json.loads(each['attributes'])['ID'][0],
            'start': each['start'],
            'end': each['end'],
            'strand': each['strand'],
        }
        for each in query.fetchall()
    ]
    df = pd.DataFrame(records, columns=['contig', 'ID', 'start', 'end', 'strand'])

    # One lookup per region gives both the first and the last contained gene.
    start_genes = []
    end_genes = []
    for _, row in phaster.iterrows():
        region_start, region_end = int(row.START), int(row.END)
        if multi_contig:
            genes = get_genes_multi_contig(df, region_start, region_end, row.CONTIG)
        else:
            genes = get_genes(df, region_start, region_end)
        if genes.empty:
            raise IndexError(
                f"No CDS of {genome} lies entirely within region "
                f"{region_start}-{region_end} (contig {row.CONTIG})"
            )
        start_genes.append(genes.index[0])
        end_genes.append(genes.index[-1])

    phaster['START_GENE'] = start_genes
    phaster['END_GENE'] = end_genes
    return phaster
    
    

In [34]:
# Adds the START_GENE / END_GENE columns to `phaster` in place; the returned
# frame (the same object) is displayed as the cell output.
add_start_end_gene(genome, phaster)

Unnamed: 0,REGION,REGION_LENGTH,COMPLETENESS(score),SPECIFIC_KEYWORD,REGION_POSITION,TRNA_NUM,TOTAL_PROTEIN_NUM,PHAGE_HIT_PROTEIN_NUM,HYPOTHETICAL_PROTEIN_NUM,PHAGE+HYPO_PROTEIN_PERCENTAGE,...,MOST_COMMON_PHAGE_NAME(hit_genes_count),FIRST_MOST_COMMON_PHAGE_NUM,FIRST_MOST_COMMON_PHAGE_PERCENTAGE,GC_PERCENTAGE,CONTIG,LOCATION_SEQUENCE,START,END,START_GENE,END_GENE
0,1,6.5Kb,incomplete(30),"integrase,tail",115-6704,0,10,7,3,100%,...,"PHAGE_Staphy_187_NC_007047(1),PHAGE_Staphy_Sta...",1,10%,31.38%,AYYT01000018.1,sequence:115-6704,115,6704,629,639
1,2,12.2Kb,incomplete(20),transposase,12355-24554,0,16,10,6,100%,...,"PHAGE_Halovi_HRTV_5_NC_021320(1),PHAGE_Halovi_...",1,6.25%,30.42%,AYYT01000024.1,sequence:12355-24554,12355,24554,1065,1082


In [92]:
phaster

Unnamed: 0,REGION,REGION_LENGTH,COMPLETENESS(score),SPECIFIC_KEYWORD,REGION_POSITION,TRNA_NUM,TOTAL_PROTEIN_NUM,PHAGE_HIT_PROTEIN_NUM,HYPOTHETICAL_PROTEIN_NUM,PHAGE+HYPO_PROTEIN_PERCENTAGE,...,MOST_COMMON_PHAGE_NAME(hit_genes_count),FIRST_MOST_COMMON_PHAGE_NUM,FIRST_MOST_COMMON_PHAGE_PERCENTAGE,GC_PERCENTAGE,CONTIG,LOCATION_SEQUENCE,START,END,START_GENE,END_GENE
0,1,6.5Kb,incomplete(30),"integrase,tail",115-6704,0,10,7,3,100%,...,"PHAGE_Staphy_187_NC_007047(1),PHAGE_Staphy_Sta...",1,10%,31.38%,AYYT01000018.1,sequence:115-6704,115,6704,0,1896
1,2,12.2Kb,incomplete(20),transposase,12355-24554,0,16,10,6,100%,...,"PHAGE_Halovi_HRTV_5_NC_021320(1),PHAGE_Halovi_...",1,6.25%,30.42%,AYYT01000024.1,sequence:12355-24554,12355,24554,11,1913


In [126]:
# Name of the "<genome>_db" file that the next cell deletes.
db=str(genome + "_db")


In [127]:
!rm $db