In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
!pwd

/Users/liam/west/data


In [3]:
# find all *.fa
paths = []
for path in Path('../core_genome_alignment/aligned/').rglob('*.fa'):
    paths.append(path)
paths

[PosixPath('../core_genome_alignment/aligned/NS2a_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS5_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS4a_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/E_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/prM_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS4b_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS1_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/C_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS2b_aligned.fa'),
 PosixPath('../core_genome_alignment/aligned/NS3_aligned.fa')]

In [4]:
def fastaToDict(fil):
    """
    Read fasta-format file fil, return dict of form scaffold:sequence.
    Note: Uses only the unique identifier of each sequence, rather than the 
    entire header, for dict keys. 
    """
    dic = {}
    cur_scaf = ''
    cur_seq = []
    for line in open(fil):
        if line.startswith(">") and cur_scaf == '':
            cur_scaf = line.rstrip()
        elif line.startswith(">") and cur_scaf != '':
            dic[cur_scaf] = ''.join(cur_seq)
            cur_scaf = line.rstrip()
            cur_seq = []
        else:
            cur_seq.append(line.rstrip())
    dic[cur_scaf] = ''.join(cur_seq)
    return dic


In [5]:
# import all aligned gene fragmentations
all_seg = dict()
for p in paths:
    col_name = p.stem.split("_")[0]
    print(col_name)
    sequences = fastaToDict(p)
    all_seg[col_name] = sequences

NS2a
NS5
NS4a
E
prM
NS4b
NS1
C
NS2b
NS3


In [6]:
all_seg.keys()

dict_keys(['NS2a', 'NS5', 'NS4a', 'E', 'prM', 'NS4b', 'NS1', 'C', 'NS2b', 'NS3'])

In [7]:
for key in all_seg.keys():
    print(key)

NS2a
NS5
NS4a
E
prM
NS4b
NS1
C
NS2b
NS3


In [8]:
seq_names = []

for seg_dict in all_seg.keys():
    for key in all_seg[seg_dict].keys():
        seq_names.append(key)

seq_names = set(seq_names)

In [9]:
len(seq_names)

3349

In [10]:
df = pd.DataFrame(columns=list(seq_names))

for seg_dict in all_seg.keys():
    row = pd.Series(all_seg[seg_dict],name=seg_dict)
    df = df._append(row)

In [11]:
df

Unnamed: 0,>W324,">KX547213.1 |West Nile virus strain WNV-1/Cyanocitta cristata /USA/02003715/2002, complete genome",">KX547514.1 |West Nile virus strain WNV-1/Culiseta sp./USA/13370701/2013, complete genome",>W433,>W682,>KR348971.1 West Nile virus isolate GRLA,">KJ501461.1 |West Nile virus isolate WNV-1/US/BID-V6440/2005, complete genome",>W585,>W1478,>W745,...,">KX547402.1 |West Nile virus strain WNV-1/Culex/USA/38020294/2002, complete genome",">KX547221.1 |West Nile virus strain WNV-1/Culex/USA/14290836/2014, complete genome",>W157,">MW561633.1 |West Nile virus isolate 769.B/2018/Kavecany/SVK, complete genome",">MH170274.1 |West Nile virus strain DES 1191-02, complete genome",">KX547398.1 |West Nile virus strain WNV-1/Corvus brachyrhynchos/USA/02003299/2002, complete genome",>W816,">KJ501413.1 |West Nile virus isolate WNV-1/US/BID-V6675/2004, complete genome",">HM488240.1 |West Nile virus isolate WNV-1/US/BID-V4627/2008, complete genome",">GU827999.1 West Nile virus isolate Bird1576 polyprotein gene, complete cds"
NS2a,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgacatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgacatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tacaatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tacaacgccgacatgattgatccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...
NS5,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggcggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
NS4a,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggtcctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...
E,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagactttttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgcctcggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...
prM,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,----------------------------------------------...,,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...
NS4b,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,aacgagatgggttggctagacaggaccaagagtgatataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagacaagactaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,----------------------------------------------...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...
NS1,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,----------------------------------------------...,,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,gacactgggtgtgccatagacatcaaccggcaagagctgagatgtg...
C,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...
NS2b,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgttcg...,ggatggcctgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...
NS3,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgctgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...


In [12]:
df = df.transpose()
df

Unnamed: 0,NS2a,NS5,NS4a,E,prM,NS4b,NS1,C,NS2b,NS3
>W324,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">KX547213.1 |West Nile virus strain WNV-1/Cyanocitta cristata /USA/02003715/2002, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">KX547514.1 |West Nile virus strain WNV-1/Culiseta sp./USA/13370701/2013, complete genome",tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
>W433,tataatgctgacatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagactttttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagacaggaccaagagtgatataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
>W682,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
...,...,...,...,...,...,...,...,...,...,...
">KX547398.1 |West Nile virus strain WNV-1/Corvus brachyrhynchos/USA/02003299/2002, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
>W816,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">KJ501413.1 |West Nile virus isolate WNV-1/US/BID-V6675/2004, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...
">HM488240.1 |West Nile virus isolate WNV-1/US/BID-V4627/2008, complete genome",tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...


In [13]:
df = df[['C', 'prM', 'E', 'NS1', 'NS2a', 'NS2b', 'NS3', 'NS4a', 'NS4b', 'NS5']]
df

Unnamed: 0,C,prM,E,NS1,NS2a,NS2b,NS3,NS4a,NS4b,NS5
>W324,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">KX547213.1 |West Nile virus strain WNV-1/Cyanocitta cristata /USA/02003715/2002, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">KX547514.1 |West Nile virus strain WNV-1/Culiseta sp./USA/13370701/2013, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tacaatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W433,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagactttttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgacatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagacaggaccaagagtgatataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W682,atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctccaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tcccagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
...,...,...,...,...,...,...,...,...,...,...
">KX547398.1 |West Nile virus strain WNV-1/Corvus brachyrhynchos/USA/02003299/2002, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
>W816,atgtttaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggttg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">KJ501413.1 |West Nile virus isolate WNV-1/US/BID-V6675/2004, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcagtt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...
">HM488240.1 |West Nile virus isolate WNV-1/US/BID-V4627/2008, complete genome",atgtctaagaaaccaggagggcccggcaagagccgggctgtcaata...,gttaccctctctaacttccaagggaaggtgatgatgacggtaaatg...,ttcaactgccttggaatgagcaacagagacttcttggaaggagtgt...,gacactgggtgtgccatagacatcagccggcaagagctgagatgtg...,tataatgctgatatgattgacccttttcagttgggccttctggtcg...,ggatggcccgcaactgaagtgatgacagctgtcggcctaatgtttg...,ggaggcgtgttgtgggacactccctcaccaaaggagtacaaaaagg...,tctcagatagggctcattgaggttctgggaaagatgcctgagcact...,aacgagatgggttggctagataagaccaagagtgacataagcaatt...,ggtggggcaaaaggacgcaccttgggagaggtttggaaagaaagac...


In [14]:
for key in all_seg.keys():
    length = 0
    for i in range(3349):
        if pd.isna(df[key][i]) == False:
            length = len(df[key][i])
            break

    df[key] = df[key].fillna("-" * length)

  if pd.isna(df[key][i]) == False:
  length = len(df[key][i])


In [15]:
"-" * 693

'---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'

In [19]:
("-" * 693).count("-")

693

In [16]:
gene_order = ['C', 'prM', 'E', 'NS1', 'NS2a', 'NS2b', 'NS3', 'NS4a', 'NS4b', 'NS5']
f = open("../core_genome_alignment/" + "concatenated_giant.fa", "w")

all_seq = dict()

for index, row in df.iterrows():
    output = ""
    for gene in gene_order:
        output = output + row[gene]
  
    f.write(index + "\n")
    f.write(output + "\n")

    all_seq[index]  = output.replace("\n", "")

    
f.close()


In [17]:
lengths = set()
for k, v in all_seq.items():
    lengths.add(len(v))

In [18]:
lengths

{10307}

In [25]:
num5PerGap = 0
num10PerGap = 0

for k, v in all_seq.items():
    numGap = v.count("-")
    length = len(v)
    if numGap <= (length * 0.05):
        num5PerGap += 1
    if numGap < (length * 0.10):
        num10PerGap += 1

num5PerGap, num10PerGap

(2582, 2673)

In [26]:
len(all_seq)

3349

In [27]:
seqLessOrEqa5PerGap = dict()

for k, v in all_seq.items():
    numGap = v.count("-")
    length = len(v)
    if numGap <= (length * 0.05):
        seqLessOrEqa5PerGap[k] = v
len(seqLessOrEqa5PerGap)

2582

In [28]:
f = open("../core_genome_alignment/" + "SeqLessOrEqa5PerGap.fa", "w")

for k, v in seqLessOrEqa5PerGap.items():
    f.write(k + "\n")
    f.write(v + "\n")    
f.close()
