In [1]:
from Bio import SeqIO
import pandas as pd
import glob
import os

In [2]:
# Input directories, relative to the notebook's working directory.
# The '*.faa' files are globbed from PROTEIN_FILE_PATH and the '*.fna' files
# from RNA_FILE_PATH.
PROTEIN_FILE_PATH = '../protein/'
RNA_FILE_PATH = '../rna/'

In [3]:
# Collect the input FASTA files: '*.fna' under RNA_FILE_PATH and '*.faa' under
# PROTEIN_FILE_PATH.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# lists are sorted to keep the row order of the combined frames reproducible
# across runs and machines.
rna_files = sorted(glob.glob(os.path.join(RNA_FILE_PATH, '*.fna')))
protein_files = sorted(glob.glob(os.path.join(PROTEIN_FILE_PATH, '*.faa')))

In [4]:
def fasta_to_dataframe(filename):
    """Parse a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    filename
        Passed straight to ``SeqIO.parse(filename, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record with columns ``"ID"`` (``record.id``) and
        ``"Sequence"`` (``str(record.seq)``). A file that yields no records
        produces an empty frame that still carries both columns.
    """
    records = [
        {"ID": record.id, "Sequence": str(record.seq)}
        for record in SeqIO.parse(filename, "fasta")
    ]
    # Pin the columns explicitly so an empty input does not produce a
    # column-less frame.
    return pd.DataFrame(records, columns=["ID", "Sequence"])

In [5]:
def _load_fasta_files(paths):
    """Parse every FASTA file in ``paths`` and stack the results into one frame.

    All per-file frames are built first and concatenated once, instead of
    re-concatenating the growing result inside the loop (which copies the
    accumulated rows on every iteration).

    Returns an empty ``pd.DataFrame()`` when ``paths`` is empty, because
    ``pd.concat`` raises on an empty list.

    NOTE: the per-file row labels are kept as-is (no ``ignore_index``), so the
    combined index is not unique when more than one file is loaded.
    """
    frames = [fasta_to_dataframe(path) for path in paths]
    return pd.concat(frames) if frames else pd.DataFrame()


rna_df = _load_fasta_files(rna_files)
protein_df = _load_fasta_files(protein_files)

In [6]:
# (rows, columns) of the protein frame, then of the RNA frame.
(protein_df.shape, rna_df.shape)

((185323, 2), (260510, 2))

In [7]:
# Rename the sequence column to 'sequence'; 'ID' is kept unchanged, so it needs
# no entry in the mapping.
rna_df = rna_df.rename(columns={'Sequence': 'sequence'})
rna_df.head()

Unnamed: 0,ID,sequence
0,NR_168385.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
1,NR_168384.1,AGCAGGGCGTCCAGCGGAGAAGGCAGAGGAGGGGAGATGCGGGCTC...
2,NR_168380.1,AGTCCCAGGGAGGAGACCGCGGGAGAGGCGGCGGGACCAGGGTCCC...
3,NR_168400.1,GCACACCTGGCTCACGGCGAGTGCGGAGCAGAAAGCACTACTGGCG...
4,NR_028389.1,GCTACACTTAGTGACTCTGAGGGACATGCAACCCTCCCCGCATGCT...


In [8]:
# Rename the sequence column to 'protein'; 'ID' is kept unchanged, so it needs
# no entry in the mapping.
protein_df = protein_df.rename(columns={'Sequence': 'protein'})
protein_df.head()

Unnamed: 0,ID,protein
0,NP_001355183.1,MLLMVVSMACVGLFLVQRAGPHMGGQDKPFLSAWPSAVVPRGGHVT...
1,NP_001337906.1,MSYYSHLSGGLGCGLAVAVTMGRTVAVAEYGRCRHGCHSSYSAR
2,NP_001243796.1,MEDDSLYLRGEWQFNHFSKLTSSRPDAAFAEIQRTSLPEKSPLSCE...
3,NP_001229257.1,MEDDSLYLRGEWQFNHFSKLTSSRPDAAFAEIQRTSLPEKSPLSCE...
4,NP_001243802.1,MGDDSLYLGGEWQFNHFSKLTSSRPDAAFAEIQRTSLPEKSPLSSE...


In [9]:
# Output directory for the combined CSV files.
PATH_TO_SAVE = '../../data/'

# Create the directory up front so the export does not fail after the long
# parsing step merely because the folder is missing.
os.makedirs(PATH_TO_SAVE, exist_ok=True)

# Build both paths with os.path.join rather than string concatenation, which
# produced a doubled separator ('../../data//...') for the protein file.
# The row index is still written as the first CSV column, as before.
rna_df.to_csv(os.path.join(PATH_TO_SAVE, 'human_genoma.csv'))
protein_df.to_csv(os.path.join(PATH_TO_SAVE, 'human_genoma_protein.csv'))

In [10]:
# (rows, columns) of the RNA frame, then of the protein frame, after the renames.
(rna_df.shape, protein_df.shape)

((260510, 2), (185323, 2))