# Phaster parser

## Preparation

In [1]:
import pandas as pd
from pathlib import Path
import csv
import gffutils

In [71]:
project_path=Path().resolve().parent.resolve().parent.resolve().parent
input_path=project_path / "results" / "intermediate" / "training" / "phaster_raw"
output_path=project_path / "results" / "intermediate" / "training" / "phaster_parsed"
gff_path=project_path / "data" / "legen_v4_dereplicated_gffs"

In [15]:
genome="GCA_002760225.1"

In [86]:
genome_file=str(genome+".txt")
genome_file="GCA_000758365.1.txt"
genome=str(genome_file.split('.')[0]+'.'+genome_file.split('.')[1])

## Parse Phaster output file

In [142]:
def read_phaster(file, multi_contig=True):
    """Reads a phaster output file and generates a dataframe with this information."""
    df = pd.read_table(
    file,
    engine = 'c',
    skiprows =1,
    names = ['raw'],
    quoting=csv.QUOTE_NONE
    )
    
    # Select relevant rows only
    start=0
    for index,row in df.iterrows():
        if ("REGION" in str(row.item())):
            start=index 
    table_values=df.iloc[start:len(df)-1,:].reset_index(drop=True)
    
    # Separate into different columns (indicated by whitespaces)
    separated_table_values = pd.DataFrame.from_records(table_values.raw.apply(lambda s: s.split()))
    
    # First row contains the header
    separated_table_values.columns = separated_table_values.iloc[0,:]
    separated_table_values=separated_table_values.iloc[1:len(separated_table_values),:].reset_index(drop=True)
    print(separated_table_values.REGION_POSITION[8])
    
    if(not separated_table_values.empty):
        if(multi_contig):
            location=pd.DataFrame.from_records(separated_table_values.REGION_POSITION.apply(lambda s: s.split(sep=',')))
            separated_table_values['CONTIG']=location.iloc[:,0]
            separated_table_values['LOCATION_SEQUENCE']=" "
            for index,row in location.iterrows():
                for i in range(0,len(location.columns)):
                    if (not isinstance(location.iloc[index,i], type(None))):
                        if (":" in location.iloc[index,i]):
                            separated_table_values.LOCATION_SEQUENCE[index]=location.iloc[index,i]
            #separated_table_values['LOCATION_SEQUENCE']=str(location.iloc[:,-1])
            location2=pd.DataFrame.from_records(separated_table_values['LOCATION_SEQUENCE'].apply(lambda s: s.split(sep=':')))
            separated_table_values['REGION_POSITION']=location2.iloc[:,1]
        else:
            separated_table_values['CONTIG']= [1] * len(separated_table_values)


        # Add columns 'START' and 'END' (information extracted from 'REGION_POSITION')
        separated_table_values[['START', 'END']]=pd.DataFrame.from_records(separated_table_values.REGION_POSITION.apply(lambda s: s.split(sep='-')))

        return separated_table_values.loc[:, ['CONTIG','REGION', 'START', 'END']]
    else:
        return pd.DataFrame(columns=['CONTIG','REGION', 'START', 'END'])

In [143]:
phaster=read_phaster(input_path / genome_file)
phaster
#write_output(MGE_frame)

CP007648.1,Lactobacillus,salivarius,strain,JCM,1046,plasmid,pMP1046B,,complete,sequence:46996-69003


Unnamed: 0,CONTIG,REGION,START,END
0,CP007646.1,1,79749,89753
1,CP007646.1,2,374182,381936
2,CP007646.1,3,813503,827117
3,CP007646.1,4,970218,986519
4,CP007646.1,5,1337562,1345165
5,CP007646.1,6,1390120,1400296
6,CP007646.1,7,1486638,1493004
7,CP007646.1,8,1547086,1552621
8,CP007648.1,9,46996,69003
9,CP007648.1,10,60833,80947


## Location to gene nr

In [65]:
def get_genes_multi_contig(df, start, stop, contig):
    return df[(df.start>=start) & (df.end<=stop) & (df.contig==contig)]

In [66]:
def get_genes(df, start, stop):
    return df[(df.start>=start) & (df.end<=stop)]

In [79]:
def add_start_end_gene(genome, phaster, multi_contig=True):
    if (not phaster.empty):
        gff= str(str(gff_path) + "/" + genome + ".gff")
        # generate a database
        gffutils.create_db(gff, str(genome + "_db"))
        db = gffutils.FeatureDB(dbfn=str(genome + "_db"))
        # generate a dataframe
        df=pd.DataFrame(columns = ['contig', 'ID', 'start', 'end', 'strand'])
        # fill up the dataframe, using the database
        query = db.execute("select seqid,start,end,strand,attributes from features where featuretype = 'CDS'")
        result = query.fetchall()

        for each in result:
            df=df.append({'ID':json.loads(each['attributes'])['ID'][0], 'contig':each['seqid'], 'end':each['end'], 'start':each['start'], 'strand':each['strand']}, ignore_index=True)

        # add start genes
        start_genes=[]
        for index, row in phaster.iterrows():
            if (multi_contig):
                start_genes.append(get_genes_multi_contig(df, int(row.START), int(row.END), row.CONTIG).index[0])
            else:
                start_genes.append(get_genes(df, int(row.START), int(row.END)).index[0])
        phaster['START_GENE']=start_genes

        # add end genes
        end_genes=[]
        for index, row in phaster.iterrows():
            if (multi_contig):
                end_genes.append(get_genes_multi_contig(df, int(row.START), int(row.END), row.CONTIG).index[-1])
            else:
                end_genes.append(get_genes(df, int(row.START), int(row.END)).index[-1])
        phaster['END_GENE']=end_genes
        db=str(genome + "_db")
        !rm $db
        return phaster.loc[:,['CONTIG', 'REGION', 'START_GENE', 'END_GENE']]
    else:
        return pd.DataFrame(columns=['CONTIG', 'REGION', 'START_GENE', 'END_GENE'])
    
    

In [81]:
def transform_frame(phaster):
    if (not phaster.empty):
        MGE_list=[]
        for index,row in phaster.iterrows():
            for gene in range(row.START_GENE, row.END_GENE+1):
                MGE_list.append({'contig':row.CONTIG, 'MGE':row.REGION, 'gene_nr':gene})
        MGE_frame=pd.DataFrame.from_records(MGE_list)
        return MGE_frame
    else:
        return pd.DataFrame(columns=['contig', 'MGE', 'gene_nr'])

In [82]:
def write_output(MGE_frame):
    if (not MGE_frame.empty):
        MGE_frame.to_csv(output_path / genome_file)

In [84]:
phaster=add_start_end_gene(genome, phaster)
phaster

Unnamed: 0,CONTIG,REGION,START_GENE,END_GENE


In [85]:
MGE_frame=transform_frame(phaster)
MGE_frame

Unnamed: 0,contig,MGE,gene_nr
