# Gene trees

## Preparation

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# The project root is taken to be the parent of the current working directory,
# so these paths only resolve correctly when the notebook is run from a
# directory directly below the project root.
project_path = Path().resolve().parent
intermediate_path = project_path / "results" / "intermediate"
# Directory holding the per-genome "<genome>.ffn" files that read_ffn opens.
genomes_path = intermediate_path / "filtered_ffns"
# Directory that add_to_file writes its sequence files into.
orthogroup_seq_path = intermediate_path / "gene_tree_sequences"

In [3]:
# Filtered pangenome table; the preview in the next cell shows columns for
# genome, gtdb_species, counts, gene and orthogroup.
pangenome_csv = project_path / "results" / "intermediate" / "filtered_pangenome.csv"
pangenome = pd.read_csv(pangenome_csv)

In [95]:
# Preview the table; pandas elides the middle rows of a frame this large.
pangenome

Unnamed: 0.1,Unnamed: 0,genome,gtdb_species,counts,gene,orthogroup
0,0,GCA_000687335.1,Apilactobacillus kunkeei_A,10,AZBY01000002.1_55,F00268_1
1,1,GCA_000687335.1,Apilactobacillus kunkeei_A,10,AZBY01000006.1_5,F00239_44
2,2,GCA_000687335.1,Apilactobacillus kunkeei_A,10,AZBY01000017.1_5,F00239_42
3,3,GCA_000687335.1,Apilactobacillus kunkeei_A,10,AZBY01000022.1_5,F00239_42
4,4,GCA_000687335.1,Apilactobacillus kunkeei_A,10,AZBY01000005.1_59,F00476_30
...,...,...,...,...,...,...
7173240,7173240,GCA_900166935.1,Weissella confusa,21,FUWE01000032.1_2,F20199_1
7173241,7173241,GCA_900166935.1,Weissella confusa,21,FUWE01000032.1_3,F09484_2
7173242,7173242,GCA_900166935.1,Weissella confusa,21,FUWE01000032.1_4,F24229_1
7173243,7173243,GCA_900166935.1,Weissella confusa,21,FUWE01000032.1_5,F21546_1


## Collect genomes

Plan:

1. Loop over the orthogroups in the pangenome.
2. Loop over the genes in the pangenome.
3. Write the sequence file.

In [4]:
# Example genome / gene / orthogroup identifiers used to try out the helpers
# defined in the following cells.
genome = "GCA_000179475.1"
gene = "AEBA01000134.1_40"
orthogroup = "F00691_06"

In [5]:
# File name of the example genome's .ffn file.
# NOTE(review): read_ffn builds this name itself, so this global looks unused
# in the cells visible here — confirm before removing.
genome_file = f"{genome}.ffn"

In [6]:
def read_ffn(genome):
    """Read a genome's ``.ffn`` FASTA file into a DataFrame.

    The file ``genomes_path / (genome + ".ffn")`` is parsed line by line. Each
    line starting with ``>`` opens a new record; the record's gene identifier
    is the first whitespace-delimited token after the ``>``, and its sequence
    is the concatenation of all following lines up to the next header.

    Parameters
    ----------
    genome : str
        Genome name; ``".ffn"`` is appended to form the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene`` and ``sequence``, one row per FASTA record, in file
        order, with a default integer index. An empty file yields an empty
        frame with the same two columns.

    Raises
    ------
    FileNotFoundError
        If the ``.ffn`` file does not exist.
    """
    genome_file = genome + ".ffn"

    genes = []
    sequences = []
    chunks = None  # sequence lines of the record currently being read

    # Parsed by hand rather than with a delimited-text reader so that tabs,
    # quote characters or '>' inside a header description cannot split or
    # corrupt a record.
    with open(genomes_path / genome_file) as handle:
        for line in handle:
            if line.startswith(">"):
                # A new header closes the previous record, if any.
                if chunks is not None:
                    sequences.append("".join(chunks))
                fields = line[1:].split(maxsplit=1)
                genes.append(fields[0] if fields else "")
                chunks = []
            elif chunks is not None:
                # Sequence line: drop the line break so wrapped sequences
                # are joined into one string.
                chunks.append(line.strip())
            # Lines before the first header are ignored.

    if chunks is not None:
        sequences.append("".join(chunks))

    return pd.DataFrame({"gene": genes, "sequence": sequences}, columns=["gene", "sequence"])

In [7]:
# Try the parser on the example genome; the result has a gene and a sequence column.
read_ffn(genome)

Unnamed: 0,gene,sequence
0,AEBA01000154.1_1,ATGGGATTTAGAAATGTGATTATAACACAGCATTCCAAATTATCGT...
1,AEBA01000153.1_1,TTGTCTCAGAAAATTAAAGATTATACTGAATCTACAGGACTTAGAC...
2,AEBA01000153.1_2,TTGGATAGAGATTTATTTGAAAGACTTAGTAATCAGTCAATTCAGA...
3,AEBA01000153.1_3,ATGGTTGATTATTTGATTTTAACAGAAAAACCTAGTGCTGCTGAAA...
4,AEBA01000153.1_4,ATGAAAATTCGAGGTAGATTAGTTGAAAAAATAAATTTAAGAGATT...
...,...,...
2134,AEBA01000001.1_2,ATGAATGATTGGAAAAAGAAAGGTGTAGTATATGAAATTTATGTTC...
2135,AEBA01000001.1_3,ATGAATAGACATATAATTATAGCAAGTCATTCTACACTGGCAGCTG...
2136,AEBA01000001.1_4,ATGAGTAATAATAAATTAACGGTAAAAGAAAGAAAAAATATGTTTA...
2137,AEBA01000001.1_5,ATGCCACATTTTTTACAAGATATTTTAGTAATATTAATCGCAGCAT...


In [8]:
def get_sequence(genome, gene):
    """Return the nucleotide sequence of one gene from a genome's ``.ffn`` file.

    The whole file is parsed with ``read_ffn`` on every call, so looking up
    many genes of the same genome through this function re-reads the file
    each time.

    Parameters
    ----------
    genome : str
        Genome name, passed to ``read_ffn``.
    gene : str
        Gene identifier to look up in the ``gene`` column.

    Returns
    -------
    str
        The sequence of the matching record.

    Raises
    ------
    ValueError
        If the gene is absent from the file or occurs more than once.
    """
    ffn = read_ffn(genome)
    matches = ffn.loc[ffn["gene"] == gene, "sequence"]
    # Exactly one record must match; report which gene/genome failed so a
    # failure inside a long loop can be traced.
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one sequence for gene {gene!r} in genome "
            f"{genome!r}, found {len(matches)}"
        )
    return matches.iloc[0]

In [9]:
# Look up the example gene's sequence in the example genome.
get_sequence(genome, gene)

'ATGACAGGAATTAAGATCGATGGATTATCAAAGAAATTTCGTAAGAAAAACGTATTGGATAATATATCAGTAGAATTTAAACCTAATAAGATTTATGGACTTTTAGGTAGAAACGGTGCTGGAAAGAGTACATTATTAAATATTATCGCTAATCGATTATATGCAGATCAAGGAACTTTAACGCTGGATGATGAAAATTTAGTAGAAAATGACAGTGCATTAGGTAAATTATATTTAATGAATGATGTCGATATGTACAACAAAAGCATGAGATTGGACAAAATTTTCGAATATACTGAACAATTTTATGGAAGTTTTGACTATGAATATGCTGAAAAGTTAGCTGAAAGATTTAAAATTGATACTCATCAAAAATTTGGGAAGTTTTCAACAGGTTATCACACAATTGCTAAGTTAATCATTGCATTATGTGTACCGGCAGATTACATTTTCTTGGATGAACCAGTATTAGGTTTAGATGCTAATCATCGAACAATTTTCTATGAGGAATTAATGACGACTTACAGTGAACGTCCACGTACATTCGTTGTAGCAACTCATTTGATTGGGGAAATTACTAATATTTTAGAACATGTCATGATTGTATCTGAAAGTAAAATTACTCTTGATGAAGATGTTGAAGATATTTTGGCTAAATCACATTTGATTGTGGGACCACAAGCTGAAACTAAAGATTATGTCGAAGGTCTAAACATTGTTGGAAAAGAAAGTTTGGGTAATTTACAAGGTTACTATGTATATGGTGATTTGAATGACGATAAGATATTACCAGATACTGTACAAATCGAACGCGTTGACTTGCAAAAGATGTTCATATATTTAACAAATCAAGGTGGTGAAAACGGTGTTAACAAGTAA'

In [10]:
def add_to_file(gene, genome, sequence):
    """Append one FASTA-style record to ``orthogroup_seq_path / (gene + ".txt")``.

    The record is a header line ``>`` followed by the genome name, then the
    sequence on a single line. The file is opened in append mode, so it is
    created if missing and calling this twice with the same arguments writes
    the record twice.

    Parameters
    ----------
    gene : str
        Used as the output file's base name.
    genome : str
        Written as the record header.
    sequence : str
        Written as the record body.

    NOTE(review): the output file is named after the gene, while the target
    directory and the notebook's plan refer to orthogroups — confirm that one
    file per gene is intended.
    """
    file_title = orthogroup_seq_path / f"{gene}.txt"
    # The context manager closes the handle even if a write fails.
    with open(file_title, "a") as file_object:
        file_object.write(f">{genome}\n")
        file_object.write(f"{sequence}\n")

In [11]:
# Append the example gene's record to its sequence file. add_to_file opens the
# file in append mode, so re-running this cell adds a duplicate record.
add_to_file(gene, genome, get_sequence(genome, gene))