## This notebook was originally created to split ~2M virus sequences, which were later found to be >95% incomplete.  The notebook takes in nucleotide sequences and splits them according to whether they are valid.  Valid here means that a sequence contains only the nucleotides A, T, C and G, with no placeholders or other letters that would break the downstream translation and protein-analysis component.  After it was run once on the ~2M rows, the notebook was never used again, although some of the code, such as the sequence-validation function, was reused. 

In [1]:
# import needed libraries from BioPython and Pandas to create dataframe from valid sequences
from Bio import SeqIO
import pandas as pd
from pathlib import Path
from collections import Counter
from tqdm import tqdm
import os

In [2]:
# The only letters accepted in a DNA sequence: the four nucleotides A, T, C, G
valid_nucs = list("ATCG")

In [3]:
# function that returns a dictionary of a count of each nucleotide letter
def countNucleotides(seq):
    """Tally how often each letter occurs in ``seq``.

    Args:
        seq: an iterable of single letters, e.g. a nucleotide string.

    Returns:
        A plain ``dict`` mapping each distinct letter to its number of
        occurrences, keyed in order of first appearance.
    """
    tally = Counter()
    tally.update(seq)
    # hand back a regular dict rather than the Counter subclass
    return dict(tally)

In [4]:
# checks that the sequence of a FASTA record contains only the 4 nucleotides above
def validateSeq(seq):
    """Report whether a record's sequence consists solely of valid nucleotides.

    Args:
        seq: a record object whose ``.seq`` attribute holds the sequence
            (in this notebook, the records yielded by ``SeqIO.parse``).

    Returns:
        ``True`` when every distinct letter of ``str(seq.seq)`` is listed in
        ``valid_nucs``, ``False`` otherwise. The comparison is case-sensitive,
        and an empty sequence counts as valid.
    """
    observed = countNucleotides(str(seq.seq))
    # only the distinct letters matter here, not how often each one occurs
    return all(letter in valid_nucs for letter in observed)

In [5]:
# Build the path to the FASTA file, which lives in the parent of the current
# working directory rather than in the working directory itself.
file_name = 'pathogenic_nucleotide_sequences.fasta'
current_path = Path.cwd()
parent_path = current_path.parent
# kept as a plain string, matching what os.path.join would return
file_path = str(parent_path / file_name)

In [13]:
# Switch the input to the non-pathogenic FASTA file.
# NOTE(review): file_path is not recomputed after this reassignment, and the
# parse cell reads file_name directly, so this file is presumably expected in
# the current working directory — confirm.
file_name = 'non_pathogenic_sequences.fasta'

In [14]:
# Parse every record of the FASTA file named by file_name and keep them all
# in memory as a list
nucleotides = [*SeqIO.parse(file_name, 'fasta')]

In [15]:
# number of records parsed from the FASTA file (displayed as the cell output)
len(nucleotides)

11847

In [16]:
# Flatten each parsed record into a plain dict (id, name, description and the
# sequence as a string) and route it to the valid or the invalid bucket.
# For a quick trial run, iterate over nucleotides[0:5] instead.

valid_list, invalid_list = [], []

for seq in tqdm(nucleotides):
    seq_dict = {
        'seq_id': seq.id,
        'seq_name': seq.name,
        'seq_desc': seq.description,
        'seq_str': str(seq.seq),
    }

    # validateSeq decides which of the two lists receives this record
    (valid_list if validateSeq(seq) else invalid_list).append(seq_dict)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11847/11847 [00:15<00:00, 774.14it/s]


In [17]:
# report how many records landed in each bucket: valid first, then invalid
print(len(valid_list), len(invalid_list), sep='\n')

11442
405


In [18]:
# Tabulate the valid records, one row per sequence, and write them to CSV
# without the index column.
valid_df = pd.DataFrame(data=valid_list)
valid_df.to_csv(path_or_buf='valid_non_virulent_sequences.csv', index=False)

In [19]:
# Tabulate the rejected records, one row per sequence, and write them to CSV
# without the index column.
invalid_df = pd.DataFrame(data=invalid_list)
invalid_df.to_csv(path_or_buf='invalid_non_virulent_sequences.csv', index=False)

### Run after splitting both virulent and non-virulent fasta files

In [20]:
# CSV files holding the valid virulent and valid non-virulent sequences
vir, non_vir = 'valid_virulent_sequences.csv', 'valid_non_virulent_sequences.csv'

In [21]:
# load the valid virulent sequences and print how many rows were read
vir_df = pd.read_csv(vir)
print(len(vir_df.index))

32893


In [22]:
# load the valid non-virulent sequences and print how many rows were read
non_vir_df = pd.read_csv(non_vir)
print(len(non_vir_df.index))

11442
