In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
!pwd

/Users/liam/west/data


In [3]:
# find all *.fa
paths = []
for path in Path('../core_genome_alignment/aligned/').rglob('*.fa'):
    paths.append(path)
paths

[PosixPath('../core_genome_alignment/aligned/NS2a_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS5_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS4a_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/E_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/prM_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS4b_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS1_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/C_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS2b_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS3_aligned.fa')]

In [4]:
def fastaToDict(fil):
    """
    Read fasta-format file fil, return dict of form scaffold:sequence.
    Note: Uses only the unique identifier of each sequence, rather than the 
    entire header, for dict keys. 
    """
    dic = {}
    cur_scaf = ''
    cur_seq = []
    for line in open(fil):
        if line.startswith(">") and cur_scaf == '':
            cur_scaf = line.rstrip()
        elif line.startswith(">") and cur_scaf != '':
            dic[cur_scaf] = ''.join(cur_seq)
            cur_scaf = line.rstrip()
            cur_seq = []
        else:
            cur_seq.append(line.rstrip())
    dic[cur_scaf] = ''.join(cur_seq)
    return dic


In [5]:
# import all aligned gene fragmentations
all_seg = dict()
for p in paths:
    col_name = p.stem.split("_")[0]
    print(col_name)
    sequences = fastaToDict(p)
    all_seg[col_name] = sequences

NS2a
NS5
NS4a
E
prM
NS4b
NS1
C
NS2b
NS3


In [6]:
all_seg.keys()

dict_keys(['NS2a', 'NS5', 'NS4a', 'E', 'prM', 'NS4b', 'NS1', 'C', 'NS2b', 'NS3'])

In [7]:
for key in all_seg.keys():
    print(key)

NS2a
NS5
NS4a
E
prM
NS4b
NS1
C
NS2b
NS3


In [8]:
seq_names = []

for seg_dict in all_seg.keys():
    for key in all_seg[seg_dict].keys():
        seq_names.append(key)

seq_names = set(seq_names)

In [9]:
len(seq_names)

3349

In [10]:
df = pd.DataFrame(columns=list(seq_names))

for seg_dict in all_seg.keys():
    row = pd.Series(all_seg[seg_dict],name=seg_dict)
    df = df._append(row)

In [11]:
df

Unnamed: 0,>W1456,">KJ501126.1 |West Nile virus isolate WNV-1/US/BID-V7415/2007, complete genome",">HQ671719.1 |West Nile virus isolate WNV-1/US/BID-V4912/2001, complete genome",>W218,>W1387,>OM037671.1 |West Nile virus isolate Spain/2020/Northern,">KJ501398.1 |West Nile virus isolate WNV-1/US/BID-V6656/2004, complete genome",">KJ501539.1 |West Nile virus isolate WNV-1/US/BID-V7823/2011, complete genome",>KR348931.1 West Nile virus isolate COAV,">DQ431703.1 West Nile virus isolate 04-218CO polyprotein precursor, gene, complete cds",...,>W659,">HM488197.1 |West Nile virus isolate WNV-1/US/BID-V4377/2005, complete genome",>W1813,>W226,">KJ501531.1 |West Nile virus isolate WNV-1/US/BID-V7815/2012, complete genome",>W351,>W567,">GQ851608.1 West Nile virus strain ArB310/67 polyprotein gene, complete cds",">HQ671742.1 |West Nile virus isolate WNV-1/US/BID-V4343/2002, complete genome",">OQ357819.1 |West Nile virus isolate 22C2392, complete genome"
NS2a,----------------------------------------------...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaacgccgacatgattgatccttttcagttgggccttctggtcg...,tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgacatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,---aatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgaaatgattgacccttttcagttgggccttctggttg...,tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaatgctgatatgattgatccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaacgctgatatgattgatccttttcagctgggccttctggtcg...
NS5,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggcggggcaaaaggacgcaccttgggagaggtctggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgtaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
NS4a,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggtcctgggaaagatgcctgagcact...,,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagataggactcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggtcctgggaaagatgcctgaacatt...
E,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,...,ttcaactgccttggaatgagtaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,tttaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,tttaactgccttggaatgagcaacagagacttcttagagggagtat...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagactttttggagggagtat...
prM,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaagtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaactttcaagggaaagttatgatgacggtaaatg...
NS4b,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagacaggaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtc...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,----------------------------------------------...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagacaggaccaagagtgatataagcagtt...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggctggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggctggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,aacgaaatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgaaatgggttggctagataagaccaagagtgacttaagcagtt...
NS1,gacactgggtgtgctatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacgtcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacgtcagccggcaagagctgagatgtg...,gacactgggtgtgctatagacatcagccggcaagagctgagatgtg...
C,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaaaagccgggctgtcaata...,,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtttaagaaaccaggagggcccggcaagagccgggctgtcgata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...
NS2b,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgttcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcctgcaactgaagtgatgacagctgtcggcctaatgtttg...,gggtggcccgcaactgaagtgatgactgctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,gggtggcccgcaactgaagtgatgactgctgtcggcttgatgtttg...
NS3,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,----------------------------------------------...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,----------------------------------------------...,ggaggcgtgctgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgctgtgggacactccctcaccaaaggagtacaaaaagg...


In [12]:
df = df.transpose()
df

Unnamed: 0,NS2a,NS5,NS4a,E,prM,NS4b,NS1,C,NS2b,NS3
>W1456,----------------------------------------------...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgctatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">KJ501126.1 |West Nile virus isolate WNV-1/US/BID-V7415/2007, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggcggggcaaaaggacgcaccttgggagaggtctggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagacaggaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">HQ671719.1 |West Nile virus isolate WNV-1/US/BID-V4912/2001, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
>W218,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtc...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,----------------------------------------------...
>W1387,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggtcctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaaaagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
...,...,...,...,...,...,...,...,...,...,...
>W351,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
>W567,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcctgcaactgaagtgatgacagctgtcggcctaatgtttg...,----------------------------------------------...
">GQ851608.1 West Nile virus strain ArB310/67 polyprotein gene, complete cds",tacaatgctgatatgattgatccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagataggactcattgaggttctgggaaagatgcctgagcact...,tttaactgccttggaatgagcaacagagacttcttagagggagtat...,gttaccctctctaacttccaagggaaagtgatgatgacggtaaatg...,aacgaaatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gggtggcccgcaactgaagtgatgactgctgtcggcctaatgtttg...,ggaggcgtgctgtgggacactccctcaccaaaggagtacaaaaagg...
">HQ671742.1 |West Nile virus isolate WNV-1/US/BID-V4343/2002, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacgtcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...


In [13]:
df = df[['C', 'prM', 'E', 'NS1', 'NS2a', 'NS2b', 'NS3', 'NS4a', 'NS4b', 'NS5']]
df

Unnamed: 0,C,prM,E,NS1,NS2a,NS2b,NS3,NS4a,NS4b,NS5
>W1456,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgctatagacatcagccggcaagagctgagatgtg...,----------------------------------------------...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">KJ501126.1 |West Nile virus isolate WNV-1/US/BID-V7415/2007, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagacaggaccaagagtgacataagcagtt...,ggcggggcaaaaggacgcaccttgggagaggtctggaaagaaagac...
">HQ671719.1 |West Nile virus isolate WNV-1/US/BID-V4912/2001, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W218,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,----------------------------------------------...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtc...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W1387,atgtctaagaaaccaggagggcccggcaaaagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggtcctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
...,...,...,...,...,...,...,...,...,...,...
>W351,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W567,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcctgcaactgaagtgatgacagctgtcggcctaatgtttg...,----------------------------------------------...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagacaagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">GQ851608.1 West Nile virus strain ArB310/67 polyprotein gene, complete cds",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaagtgatgatgacggtaaatg...,tttaactgccttggaatgagcaacagagacttcttagagggagtat...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tacaatgctgatatgattgatccttttcagttgggccttctggtcg...,gggtggcccgcaactgaagtgatgactgctgtcggcctaatgtttg...,ggaggcgtgctgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagataggactcattgaggttctgggaaagatgcctgagcact...,aacgaaatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">HQ671742.1 |West Nile virus isolate WNV-1/US/BID-V4343/2002, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacgtcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...


In [14]:
for key in all_seg.keys():
    length = 0
    for i in range(3349):
        if pd.isna(df[key][i]) == False:
            length = len(df[key][i])
            break

    df[key] = df[key].fillna("-" * length)

  if pd.isna(df[key][i]) == False:
  length = len(df[key][i])


In [19]:
"-" * 693

'---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

In [15]:
gene_order = ['C', 'prM', 'E', 'NS1', 'NS2a', 'NS2b', 'NS3', 'NS4a', 'NS4b', 'NS5']
f = open("../core_genome_alignment/" + "concatenated_giant.fa", "w")

all_seq = dict()

for index, row in df.iterrows():
    output = ""
    for gene in gene_order:
        output = output + row[gene]
  
    f.write(index + "\n")
    f.write(output + "\n")

    all_seq[index]  = output.replace("\n", "")

    
f.close()


In [16]:
lengths = set()
for k, v in all_seq.items():
    lengths.add(len(v))

In [18]:
lengths

{10307}